In [3]:
import pandas as pd
import numpy as np
import re
import sys

import nltk
nltk.download('stopwords')


from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
data = {
    "resume": [
        "Python machine learning deep learning pytorch cnn data analysis",
        "React node mongodb express mern stack developer",
        "Data analyst python pandas sql visualization power bi",
        "Java spring boot backend microservices api",
        "Machine learning nlp tensorflow scikit-learn"
    ],
    
    "job_desc": [
        "Looking for machine learning engineer with python pytorch",
        "Full stack mern developer required",
        "Data analyst with sql and visualization skills",
        "Backend java developer",
        "AI NLP engineer"
    ],
    
    "label": [1, 0, 1, 0, 1]  # suitable or not
}

df = pd.DataFrame(data)
df


Unnamed: 0,resume,job_desc,label
0,Python machine learning deep learning pytorch ...,Looking for machine learning engineer with pyt...,1
1,React node mongodb express mern stack developer,Full stack mern developer required,0
2,Data analyst python pandas sql visualization p...,Data analyst with sql and visualization skills,1
3,Java spring boot backend microservices api,Backend java developer,0
4,Machine learning nlp tensorflow scikit-learn,AI NLP engineer,1


In [5]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

df["resume_clean"] = df["resume"].apply(clean_text)
df["job_clean"] = df["job_desc"].apply(clean_text)

df


Unnamed: 0,resume,job_desc,label,resume_clean,job_clean
0,Python machine learning deep learning pytorch ...,Looking for machine learning engineer with pyt...,1,python machine learning deep learning pytorch ...,looking machine learning engineer python pytorch
1,React node mongodb express mern stack developer,Full stack mern developer required,0,react node mongodb express mern stack developer,full stack mern developer required
2,Data analyst python pandas sql visualization p...,Data analyst with sql and visualization skills,1,data analyst python pandas sql visualization p...,data analyst sql visualization skills
3,Java spring boot backend microservices api,Backend java developer,0,java spring boot backend microservices api,backend java developer
4,Machine learning nlp tensorflow scikit-learn,AI NLP engineer,1,machine learning nlp tensorflow scikitlearn,ai nlp engineer


In [6]:
vectorizer = TfidfVectorizer()

resume_vectors = vectorizer.fit_transform(df["resume_clean"])
job_vectors = vectorizer.transform(df["job_clean"])


In [7]:
similarities = []

for i in range(len(df)):
    sim = cosine_similarity(resume_vectors[i], job_vectors[i])[0][0]
    similarities.append(sim)

df["similarity_score"] = similarities
df


Unnamed: 0,resume,job_desc,label,resume_clean,job_clean,similarity_score
0,Python machine learning deep learning pytorch ...,Looking for machine learning engineer with pyt...,1,python machine learning deep learning pytorch ...,looking machine learning engineer python pytorch,0.716944
1,React node mongodb express mern stack developer,Full stack mern developer required,0,react node mongodb express mern stack developer,full stack mern developer required,0.654654
2,Data analyst python pandas sql visualization p...,Data analyst with sql and visualization skills,1,data analyst python pandas sql visualization p...,data analyst sql visualization skills,0.707107
3,Java spring boot backend microservices api,Backend java developer,0,java spring boot backend microservices api,backend java developer,0.471405
4,Machine learning nlp tensorflow scikit-learn,AI NLP engineer,1,machine learning nlp tensorflow scikitlearn,ai nlp engineer,0.48214


In [8]:
X = resume_vectors
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)


In [9]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.5

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2


Confusion Matrix:
 [[0 1]
 [0 1]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [10]:
def rank_resumes(resumes, job_description):
    resumes_clean = [clean_text(r) for r in resumes]
    job_clean = clean_text(job_description)
    
    vec = TfidfVectorizer()
    resume_vec = vec.fit_transform(resumes_clean)
    job_vec = vec.transform([job_clean])
    
    scores = cosine_similarity(resume_vec, job_vec).flatten()
    
    ranking = sorted(zip(resumes, scores), key=lambda x: x[1], reverse=True)
    
    return ranking


In [11]:
new_resumes = [
    "Python deep learning pytorch computer vision",
    "MERN stack frontend backend developer",
    "SQL data analyst power bi pandas"
]

job = "Looking for AI engineer with python and deep learning"

rank_resumes(new_resumes, job)


[('Python deep learning pytorch computer vision',
  np.float64(0.7071067811865476)),
 ('MERN stack frontend backend developer', np.float64(0.0)),
 ('SQL data analyst power bi pandas', np.float64(0.0))]

In [12]:
import joblib

joblib.dump(model, "resume_screening_model.pkl")
print("Model saved successfully")


Model saved successfully
