In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

In [2]:
# Load the dataset
file_path = 'Balanced_Resumes.csv'
df = pd.read_csv(file_path)

In [3]:
df.isna().sum()

resume      1415
job_desc    1415
label       1415
dtype: int64

In [4]:
df.head()

Unnamed: 0,resume,job_desc,label
0,,,
1,,,
2,,,
3,,,
4,,,


In [5]:
df_cleaned = df.dropna(subset=['resume', 'job_desc','label'])


In [6]:
df_cleaned.isna().sum()

resume      0
job_desc    0
label       0
dtype: int64

In [7]:
df_cleaned.head()

Unnamed: 0,resume,job_desc,label
480,"Name: Emily Davis, Contact: alice.smith@exampl...",Looking for a backend developer with 3+ years ...,Fit
481,"Name: John Doe, Contact: john.doe@example.com,...",Seeking a machine learning engineer with exper...,Fit
482,"Name: Michael Johnson, Contact: emily.davis@ex...",Looking for a backend developer with 3+ years ...,Fit
483,"Name: Alice Smith, Contact: alice.smith@exampl...",Seeking a machine learning engineer with exper...,Fit
484,"Name: John Doe, Contact: emily.davis@example.c...",Seeking a machine learning engineer with exper...,Fit


In [8]:
df=df_cleaned

In [9]:
df.head()

Unnamed: 0,resume,job_desc,label
480,"Name: Emily Davis, Contact: alice.smith@exampl...",Looking for a backend developer with 3+ years ...,Fit
481,"Name: John Doe, Contact: john.doe@example.com,...",Seeking a machine learning engineer with exper...,Fit
482,"Name: Michael Johnson, Contact: emily.davis@ex...",Looking for a backend developer with 3+ years ...,Fit
483,"Name: Alice Smith, Contact: alice.smith@exampl...",Seeking a machine learning engineer with exper...,Fit
484,"Name: John Doe, Contact: emily.davis@example.c...",Seeking a machine learning engineer with exper...,Fit


In [10]:
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

In [11]:
df['text'] = df['resume'] + " " + df['job_desc']

In [12]:
X = df['text']
y = df['label']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [15]:
rf_model = RandomForestClassifier(random_state=42)

In [16]:
from sklearn.pipeline import make_pipeline
pipeline = make_pipeline(tfidf_vectorizer, rf_model)

In [17]:
param_grid = {
    'randomforestclassifier__n_estimators': [100, 200],  # Number of trees
    'randomforestclassifier__max_depth': [10, 20, None],  # Depth of the trees
    'randomforestclassifier__min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'randomforestclassifier__min_samples_leaf': [1, 2, 4],  # Minimum samples required to be at a leaf node
}

In [18]:
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=2)

In [19]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [20]:
 best_model = grid_search.best_estimator_

In [21]:
y_pred = best_model.predict(X_test)

In [22]:
print("Best Hyperparameters:", grid_search.best_params_)
print("Accuracy on Test Set:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Best Hyperparameters: {'randomforestclassifier__max_depth': None, 'randomforestclassifier__min_samples_leaf': 1, 'randomforestclassifier__min_samples_split': 2, 'randomforestclassifier__n_estimators': 100}
Accuracy on Test Set: 0.8756815703380589
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.85      0.88       331
           1       0.88      0.85      0.87       291
           2       0.84      0.93      0.88       295

    accuracy                           0.88       917
   macro avg       0.88      0.88      0.88       917
weighted avg       0.88      0.88      0.88       917



In [23]:
import joblib

# Save the best model (the model with the best hyperparameters found by GridSearchCV)
joblib.dump(best_model, 'random_forest_model.pkl')

print("Model saved successfully!")


Model saved successfully!


In [24]:
import joblib

# Load the trained model from the file
loaded_model = joblib.load('random_forest_model.pkl')

# New mixed test examples with labels potentially covering all three categories (Fit, Not Fit, Medium Fit)
new_resumes = [
    "Name: Laura Brooks, Contact: laura.brooks@example.com, Experience: Full-stack Developer, Skills: JavaScript, Node.js, React",
    "Name: Mark Green, Contact: mark.green@example.com, Experience: Data Scientist, Skills: Python, pandas, scikit-learn",
    "Name: Sara White, Contact: sara.white@example.com, Experience: Backend Developer, Skills: Java, Spring Boot",
    "Name: James Lee, Contact: james.lee@example.com, Experience: Junior Developer, Skills: HTML, CSS",
    "Name: Kate Turner, Contact: kate.turner@example.com, Experience: Frontend Developer, Skills: React, Angular",
    "Experienced MERN Stack Developer with expertise in MongoDB, Express.js, React.js, and Node.js, skilled in designing, developing, and deploying full-stack web applications. Proficient in JavaScript, API integration, and optimizing the performance of web applications"
]

new_job_descs = [
    "Looking for a full-stack developer with expertise in JavaScript, React, and Node.js for building modern web applications.",
    "Seeking a data scientist with experience in Python, machine learning, and data analysis for building predictive models.",
    "Looking for a backend developer with expertise in Java and Spring Boot for building scalable backend systems.",
    "Seeking a junior developer with basic skills in HTML and CSS to support the development team.",
    "Looking for a frontend developer with expertise in React, Angular, and building user-friendly interfaces.",
    "Looking for a MERN Stack Developer to build and maintain dynamic web applications using MongoDB, Express.js, React.js, and Node.js. Must be proficient in JavaScript, with experience in developing scalable and responsive applications"
    
]

# Combine the resumes and job descriptions into one feature for prediction
new_texts = [f"{resume} {job_desc}" for resume, job_desc in zip(new_resumes, new_job_descs)]

# Make predictions on the new examples
predictions = loaded_model.predict(new_texts)

# Decode the predictions back to original labels
predicted_labels = label_encoder.inverse_transform(predictions)

# Print the results
for i, (resume, job_desc, label) in enumerate(zip(new_resumes, new_job_descs, predicted_labels)):
    print(f"Test Example {i+1}:")
    print(f"Resume: {resume}")
    print(f"Job Description: {job_desc}")
    print(f"Predicted Label: {label}\n")


Test Example 1:
Resume: Name: Laura Brooks, Contact: laura.brooks@example.com, Experience: Full-stack Developer, Skills: JavaScript, Node.js, React
Job Description: Looking for a full-stack developer with expertise in JavaScript, React, and Node.js for building modern web applications.
Predicted Label: Medium Fit

Test Example 2:
Resume: Name: Mark Green, Contact: mark.green@example.com, Experience: Data Scientist, Skills: Python, pandas, scikit-learn
Job Description: Seeking a data scientist with experience in Python, machine learning, and data analysis for building predictive models.
Predicted Label: Medium Fit

Test Example 3:
Resume: Name: Sara White, Contact: sara.white@example.com, Experience: Backend Developer, Skills: Java, Spring Boot
Job Description: Looking for a backend developer with expertise in Java and Spring Boot for building scalable backend systems.
Predicted Label: Fit

Test Example 4:
Resume: Name: James Lee, Contact: james.lee@example.com, Experience: Junior Devel