In [1]:
!pip install PyPDF2 scikit-learn


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m9.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [3]:
import numpy as np
import re
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from PyPDF2 import PdfReader
from google.colab import files

In [4]:
def pdf_to_text(pdf_path):
    """Extract the text of every page of a PDF into a single string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages in page order, separated by newlines. A page
        whose extraction yields nothing contributes an empty string.
    """
    with open(pdf_path, 'rb') as f:
        pdf_reader = PdfReader(f)
        # Defensive `or ""`: tolerate a None/empty extraction result instead of
        # failing on string concatenation.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next (which would break whitespace-based word matching).
    return "\n".join(page_texts)

In [5]:
def extract_personal_details(resume_text):
    """Pull labelled contact fields ("Name:", "Email:", "Phone:") out of resume text.

    Labels are matched case-insensitively. A field's value runs from the label
    to whichever comes first: the next field label, a line break, or the end of
    the text. Multi-word values such as "John Smith" or "+1 555 123 4567" are
    therefore returned whole.

    Args:
        resume_text: Plain text of the resume. May be empty or None.

    Returns:
        A ``(name, email, phone)`` tuple. Each element is the stripped value
        string, or None when the label is absent or its value is empty.
    """
    if not resume_text:
        return None, None, None

    # Lookahead marking where a value stops: next label, line break, or end of text.
    value_end = r'(?=\b(?:Name|Email|Phone):|[\r\n]|\Z)'

    def find_field(label):
        # `\b` keeps "Name:" from matching inside words such as "Username:".
        pattern = re.compile(rf'\b{label}:\s*(.*?)[ \t]*{value_end}', re.IGNORECASE)
        match = pattern.search(resume_text)
        if match is None:
            return None
        # Normalise an empty capture to None so callers can use a single check.
        return match.group(1).strip() or None

    return find_field('Name'), find_field('Email'), find_field('Phone')



In [6]:
def predict_proficiency_level(resume_text):
    """Classify a resume as "Beginner", "Intermediate" or "Advanced".

    A small decision tree is trained on a fixed list of example phrases
    (bag-of-words features) on every call and then applied to the resume text.

    Args:
        resume_text: Plain text of the resume. May be empty or None.

    Returns:
        The predicted proficiency label as a string, or "Unknown" when the
        text is empty or contains none of the words from the example phrases
        (an all-zero feature vector carries no evidence to classify on).
    """
    # Nothing to classify: skip training entirely.
    if not resume_text or not resume_text.strip():
        return "Unknown"

    training_examples = [
        ("Beginner", "basic skills"),
        ("Beginner", "entry level"),
        ("Beginner", "novice"),
        ("Intermediate", "some experience"),
        ("Intermediate", "moderate skills"),
        ("Intermediate", "proficient"),
        ("Advanced", "extensive experience"),
        ("Advanced", "highly skilled"),
        ("Advanced", "expert level"),
        ("Advanced", "master")
    ]
    # Each pair is (label, phrase): phrases are the features, labels the targets.
    labels, phrases = zip(*training_examples)

    vectorizer = CountVectorizer()
    phrase_features = vectorizer.fit_transform(phrases)

    # Fixed seed so repeated runs on the same resume give the same answer.
    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(phrase_features, labels)

    resume_features = vectorizer.transform([resume_text])
    # No vocabulary word occurs in the resume: any prediction would be arbitrary.
    if resume_features.nnz == 0:
        return "Unknown"

    return str(clf.predict(resume_features)[0])

In [7]:
def load_keywords():
    """Prompt for a file upload in Colab and return its parsed JSON content.

    Only the first uploaded file is read; any additional files are ignored.

    Returns:
        The object decoded from the uploaded JSON file.

    Raises:
        ValueError: If no file was uploaded.
    """
    uploaded = files.upload()
    if not uploaded:
        # Fail here with a clear message instead of returning None and
        # crashing later in whichever caller tries to use the keywords.
        raise ValueError("No keywords file was uploaded.")
    first_filename = next(iter(uploaded))
    with open(first_filename, encoding="utf-8") as f:
        return json.load(f)

In [26]:
from sklearn.metrics import classification_report, confusion_matrix

def _fit_keyword_tree(keyword_samples, domain_labels):
    """Fit a bag-of-words vectorizer and a decision tree on keyword/domain pairs."""
    # `\S+` keeps each whitespace-free keyword as a single token; the default
    # pattern drops one-character tokens and splits on punctuation ("r", "c++").
    vectorizer = CountVectorizer(token_pattern=r"(?u)\S+")
    features = vectorizer.fit_transform(keyword_samples)
    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(features, domain_labels)
    return vectorizer, clf


def predict_domain_with_decision_tree(resume_text, keywords):
    """Predict the resume's domain from the domain keywords it contains.

    Every whitespace-separated word of the resume that is a known keyword
    (case-insensitive, surrounding punctuation ignored) becomes one
    ``keyword -> domain`` sample. When at least two samples exist, a decision
    tree is trained on 80% of them and scored on the remaining 20%; accuracy,
    classification report and confusion matrix are printed. The final answer
    is the most frequent domain predicted by a tree fitted on all samples.

    Only single-word keywords can match, because the resume is split on
    whitespace.

    Args:
        resume_text: Plain text of the resume. May be empty or None.
        keywords: Mapping of domain name to an iterable of keyword strings.
            May be empty or None.

    Returns:
        The predicted domain name, or "Unknown" when the resume contains no
        known keyword (or either argument is empty).
    """
    from collections import Counter

    no_data_message = "No valid training data found. Ensure the resume contains relevant keywords."
    if not resume_text or not keywords:
        print(no_data_message)
        return "Unknown"

    # Create keyword to domain mapping (a later domain wins on duplicate keywords)
    keyword_to_domain = {
        keyword.lower(): domain
        for domain, domain_keywords in keywords.items()
        for keyword in domain_keywords
    }

    # Prepare training data: one sample per keyword occurrence in the resume
    matched_keywords = []
    matched_domains = []
    for word in resume_text.split():
        token = word.lower()
        if token not in keyword_to_domain:
            # Retry without surrounding punctuation so "Python," matches "python"
            # while keywords that contain punctuation still match verbatim.
            token = token.strip(".,;:!?()[]{}\"'")
        if token in keyword_to_domain:
            matched_keywords.append(token)
            matched_domains.append(keyword_to_domain[token])

    # If no data for training, return a message
    if not matched_keywords:
        print(no_data_message)
        return "Unknown"

    # Held-out evaluation. train_test_split needs at least one sample on each
    # side, so it is only possible with two or more matches.
    if len(matched_keywords) >= 2:
        train_keywords, test_keywords, train_domains, test_domains = train_test_split(
            matched_keywords, matched_domains, test_size=0.2, random_state=42
        )
        eval_vectorizer, eval_clf = _fit_keyword_tree(train_keywords, train_domains)
        y_pred = eval_clf.predict(eval_vectorizer.transform(test_keywords))

        accuracy = accuracy_score(test_domains, y_pred)
        print(f"Domain Prediction Accuracy: {accuracy * 100:.2f}%")

        print("\nClassification Report:")
        print(classification_report(test_domains, y_pred, zero_division=0))

        print("\nConfusion Matrix:")
        print(confusion_matrix(test_domains, y_pred))
    else:
        print("Only one keyword match found; skipping held-out evaluation.")

    # Final prediction uses a model fitted on every sample, so keywords that
    # landed in the test split are not classified from an all-zero vector.
    vectorizer, clf = _fit_keyword_tree(matched_keywords, matched_domains)
    # .tolist() converts numpy strings to plain Python strings for printing.
    predicted_domain_labels = clf.predict(vectorizer.transform(matched_keywords)).tolist()

    print("Predicted Domain Labels:", predicted_domain_labels)

    # Most frequent predicted domain; ties resolve to the label seen first.
    return Counter(predicted_domain_labels).most_common(1)[0][0]


In [27]:
def main():
    """Run the interactive resume analysis workflow in Colab.

    Prompts for a PDF resume upload, extracts its text and contact details
    (asking the user for any detail that could not be extracted), predicts the
    proficiency level, prompts for the keywords JSON file, predicts the domain
    and prints a summary of the results.

    Returns early with a message when no resume is uploaded.
    """
    print("Upload your resume in PDF format.")
    uploaded = files.upload()

    # A cancelled upload yields no files; stop instead of failing on indexing.
    if not uploaded:
        print("No resume was uploaded; nothing to analyze.")
        return

    # Only the first uploaded file is analysed.
    pdf_path = next(iter(uploaded))
    resume_text = pdf_to_text(pdf_path)

    name_from_resume, email_from_resume, phone_from_resume = extract_personal_details(resume_text)

    # Fall back to asking the user for anything the resume did not provide.
    if not name_from_resume:
        name_from_resume = input("Enter your Name: ")
    if not email_from_resume:
        email_from_resume = input("Enter your Email: ")
    if not phone_from_resume:
        phone_from_resume = input("Enter your Phone Number: ")

    proficiency_level = predict_proficiency_level(resume_text)

    print("Please upload the keywords JSON file.")
    keywords = load_keywords()

    domain = predict_domain_with_decision_tree(resume_text, keywords)

    print("\n--- Resume Analysis Results ---")
    print(f"Name: {name_from_resume}")
    print(f"Email: {email_from_resume}")
    print(f"Phone: {phone_from_resume}")
    print(f"Predicted Proficiency Level: {proficiency_level}")
    print(f"Predicted Domain: {domain}")

In [29]:
# Run the interactive workflow when this code executes as the main module.
if __name__ == "__main__":
    main()

Upload your resume in PDF format.


Saving data science and machine learning.pdf to data science and machine learning.pdf
Enter your Name: payal
Enter your Email: payal@gmail.com
Enter your Phone Number: 1123456789
Please upload the keywords JSON file.


Saving keywords.json to keywords (6).json
Domain Prediction Accuracy: 100.00%

Classification Report:
              precision    recall  f1-score   support

data_science       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1


Confusion Matrix:
[[1]]
Predicted Domain Labels: ['data_science', 'data_science', 'data_science', 'data_science', 'machine_learning']

--- Resume Analysis Results ---
Name: payal
Email: payal@gmail.com
Phone: 1123456789
Predicted Proficiency Level: Intermediate
Predicted Domain: data_science
