In [2]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv('UpdatedResumeDataSet.csv')

# Function to clean resume text
def clean_resume(text):
    text = re.sub('http\S+\s*', ' ', text)  # Remove URLs
    text = re.sub('RT|cc', ' ', text)  # Remove RT and cc
    text = re.sub('#\S+', '', text)  # Remove hashtags
    text = re.sub('@\S+', '  ', text)  # Remove mentions
    text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', text)  # Remove punctuations
    text = re.sub(r'[^\x00-\x7f]',r' ', text)  # Remove non-ASCII characters
    text = re.sub('\s+', ' ', text)  # Remove extra whitespace
    return text

# Apply cleaning function
df['cleaned_resume'] = df['Resume'].apply(lambda x: clean_resume(x))

# Encode target labels
le = LabelEncoder()
df['Category_encoded'] = le.fit_transform(df['Category'])


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=1500)
X = tfidf.fit_transform(df['cleaned_resume'])
y = df['Category_encoded']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=le.classes_))


                           precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         3
                     Arts       1.00      1.00      1.00         6
       Automation Testing       1.00      1.00      1.00         5
               Blockchain       1.00      1.00      1.00         7
         Business Analyst       1.00      1.00      1.00         4
           Civil Engineer       1.00      1.00      1.00         9
             Data Science       1.00      1.00      1.00         5
                 Database       1.00      1.00      1.00         8
          DevOps Engineer       1.00      0.93      0.96        14
         DotNet Developer       1.00      1.00      1.00         5
            ETL Developer       1.00      1.00      1.00         7
   Electrical Engineering       1.00      1.00      1.00         6
                       HR       1.00      1.00      1.00        12
                   Hadoop       1.00      1.00      1.00     

In [4]:
import numpy as np

# Function to get top keywords for each category
def get_top_keywords(category, n=10):
    category_indices = df[df['Category'] == category].index
    category_tfidf = X[category_indices]
    mean_tfidf = np.mean(category_tfidf.toarray(), axis=0)
    top_n_indices = mean_tfidf.argsort()[::-1][:n]
    top_keywords = [tfidf.get_feature_names_out()[i] for i in top_n_indices]
    return top_keywords

# Example usage
predicted_category = le.inverse_transform([y_pred[0]])[0]
print(f"Predicted Category: {predicted_category}")
print("Suggested Keywords to Include:")
print(get_top_keywords(predicted_category))


Predicted Category: Java Developer
Suggested Keywords to Include:
['java', 'developer', 'exprience', 'months', 'jsp', 'ajax', 'spring', 'j2ee', 'hibernate', 'servlet']


In [5]:
from sklearn.metrics.pairwise import cosine_similarity

# Function to compute similarity
def compute_similarity(resume_text, job_description):
    resume_vec = tfidf.transform([resume_text])
    job_desc_vec = tfidf.transform([job_description])
    similarity = cosine_similarity(resume_vec, job_desc_vec)
    return similarity[0][0]

# Example usage
resume_text = df['cleaned_resume'].iloc[0]
job_description = "Looking for a Data Scientist with experience in Python, Machine Learning, and Data Analysis."
similarity_score = compute_similarity(resume_text, job_description)
print(f"Similarity Score: {similarity_score}")


Similarity Score: 0.2893987855122808


In [6]:
# Example: Choose a specific resume (say, index 10)
index = 50
sample_resume_raw = df['Resume'].iloc[index]
sample_resume_cleaned = df['cleaned_resume'].iloc[index]

# Transform using the same TF-IDF vectorizer
sample_vector = tfidf.transform([sample_resume_cleaned])

# Predict category
predicted_label = model.predict(sample_vector)[0]
predicted_category = le.inverse_transform([predicted_label])[0]
print("📝 Original Resume:\n", sample_resume_raw[:500], "...")  # Print a portion of the resume
print("\n🔍 Predicted Category:", predicted_category)

# Enhancement Suggestions
def suggest_missing_keywords(cleaned_resume, predicted_category):
    resume_words = set(cleaned_resume.lower().split())
    top_keywords = get_top_keywords(predicted_category, n=15)
    missing_keywords = [kw for kw in top_keywords if kw not in resume_words]
    return missing_keywords

print("\n✨ Suggested Keywords to Add:")
print(suggest_missing_keywords(sample_resume_cleaned, predicted_category))

# Optional: Compare against a sample job description
sample_job_desc = """We are looking for a skilled Machine Learning Engineer to help us build and optimize data-driven solutions. 
The candidate should have strong experience with Python, machine learning libraries (Scikit-learn, TensorFlow, PyTorch), 
and working with large datasets. Familiarity with model deployment, data preprocessing, and cloud platforms (AWS, GCP) is a plus."""

similarity_score = compute_similarity(sample_resume_cleaned, sample_job_desc)
print("\n📈 Resume-Job Description Similarity Score:", round(similarity_score, 2))


📝 Original Resume:
 SOFTWARE SKILLS: â¢ General Computer Proficiency â¢ Program Langages known C, C+, Java, Web Programming â¢ Tools & Software know MATLAB. DBMS KEY STRENGTHS: â¢ Posse's Good communication and analytic skills. â¢ Positive thinking. Sincere, Hard work, Honesty, Responsibility. â¢ Enthusiastic to learn new skills & take up new tasks. â¢ Self - motivated. â¢ Ready to accept challenges Education Details 
January 2014 to January 2017 BE in computer science and engineering computer science engi ...

🔍 Predicted Category: HR

✨ Suggested Keywords to Add:
['mba', 'chennai', 'june', 'finance', '2012', 'college', 'university']

📈 Resume-Job Description Similarity Score: 0.03
