In [30]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

In [31]:
file_path = 'Balanced_Resumes.csv'
df = pd.read_csv(file_path)

In [32]:
df_cleaned = df.dropna(subset=['resume', 'job_desc','label'])

In [33]:
df=df_cleaned

In [34]:
def calculate_similarity_score(row):
    vectorizer = TfidfVectorizer(stop_words='english')
    resume_vec = vectorizer.fit_transform([row['resume']])
    job_desc_vec = vectorizer.transform([row['job_desc']])
    return cosine_similarity(resume_vec, job_desc_vec)[0][0]

In [35]:
from sklearn.metrics.pairwise import cosine_similarity

df['similarity_score'] = df.apply(lambda row: calculate_similarity_score(row) * 100, axis=1)

In [39]:
def convert_labels(label):
    if label.lower() == 'fit':
        return 2
    elif label.lower() == 'medium fit':
        return 1
    else:
        return 0

# Apply the function to the 'label' column
df['label'] = df['label'].apply(convert_labels)


In [40]:
# Create features and labels
X = df[['resume', 'job_desc', 'similarity_score']]
y = df['label']

In [43]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
# Define the pipeline
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english'))
])

In [45]:
preprocessor = ColumnTransformer([
    ('text', text_pipeline, 'resume'),
    ('similarity', StandardScaler(), ['similarity_score'])
])

In [46]:
rf_model = RandomForestClassifier(random_state=42)

In [47]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', rf_model)
])

In [48]:
# Set hyperparameters for grid search
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

In [49]:
# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [50]:
# Get the best model
best_model = grid_search.best_estimator_

In [51]:
y_pred = best_model.predict(X_test)
print("Best Hyperparameters:", grid_search.best_params_)
print("Accuracy on Test Set:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Best Hyperparameters: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
Accuracy on Test Set: 0.8691384950926936
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.92      0.88       295
           1       0.86      0.85      0.85       291
           2       0.89      0.85      0.87       331

    accuracy                           0.87       917
   macro avg       0.87      0.87      0.87       917
weighted avg       0.87      0.87      0.87       917



In [52]:
import joblib
joblib.dump(best_model, 'Random_Forest.pkl')
print("Model saved successfully!")

Model saved successfully!


In [53]:
df.head()

Unnamed: 0,resume,job_desc,label,similarity_score
480,"Name: Emily Davis, Contact: alice.smith@exampl...",Looking for a backend developer with 3+ years ...,2,38.348249
481,"Name: John Doe, Contact: john.doe@example.com,...",Seeking a machine learning engineer with exper...,2,30.618622
482,"Name: Michael Johnson, Contact: emily.davis@ex...",Looking for a backend developer with 3+ years ...,2,25.819889
483,"Name: Alice Smith, Contact: alice.smith@exampl...",Seeking a machine learning engineer with exper...,2,14.285714
484,"Name: John Doe, Contact: emily.davis@example.c...",Seeking a machine learning engineer with exper...,2,40.509575


In [54]:
df.tail()

Unnamed: 0,resume,job_desc,label,similarity_score
5995,"Name: Alice Smith, Contact: john.doe@example.c...",Full-stack web developer required with 4+ year...,1,34.188173
5996,"Name: Michael Johnson, Contact: michael.johnso...",Looking for a backend developer with 3+ years ...,1,28.571429
5997,"Name: John Doe, Contact: john.doe@example.com,...",Looking for a backend developer with 3+ years ...,1,26.413527
5998,"Name: Emily Davis, Contact: michael.johnson@ex...",Hiring a data analyst with 2-4 years of experi...,1,43.386092
5999,"Name: John Doe, Contact: john.doe@example.com,...",Seeking a machine learning engineer with exper...,1,41.039134


In [58]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Example data
resume_text = "Experienced MERN Stack Developer with expertise in MongoDB, Express.js, React.js, and Node.js, 
skilled in designing, developing, and deploying full-stack web applications. Proficient in JavaScript, API integration, 
and optimizing the performance of web applications."

job_desc_text = "Looking for a MERN Stack Developer to build and maintain dynamic web applications using MongoDB, 
Express.js, React.js, and Node.js. Must be proficient in JavaScript, with experience in developing scalable and responsive applications."

# Load the trained model
model = joblib.load('Random_Forest.pkl')

# Calculate the similarity score
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform([resume_text, job_desc_text])
similarity_score = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0] * 100

# Prepare the combined text for prediction
combined_text = resume_text + " " + job_desc_text

# Make prediction
predicted_class = model.predict([combined_text])[0]

# Map prediction to label
label_map = {2: 'Fit', 1: 'Medium Fit', 0: 'Not Fit'}
fit_status = label_map[predicted_class]

(similarity_score, fit_status)


ValueError: Expected 2D array, got 1D array instead:
array=['Experienced MERN Stack Developer with expertise in MongoDB, Express.js, React.js, and Node.js, \nskilled in designing, developing, and deploying full-stack web applications. Proficient in JavaScript, API integration, \nand optimizing the performance of web applications. Looking for a MERN Stack Developer to build and maintain dynamic web applications using MongoDB, \nExpress.js, React.js, and Node.js. Must be proficient in JavaScript, with experience in developing scalable and responsive applications.'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.