In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

### Dataset

In [2]:
# Read the resume / job-description scoring table from the shared data folder.
df = pd.read_csv(filepath_or_buffer="../data/dataset.csv")

# Display the first five rows as a quick look at the available columns.
df.head(n=5)

Unnamed: 0,Resume,Job_Description,Tfidf_Similarity,Jaccard_Similarity,Length_Ratio,No_of_Matched_Skills,No_of_Missing_Skills,Category,Score
0,resume_1.pdf,jd_1.txt,0.37,0.06,2.19,8,4,solid,76.0
1,resume_2.pdf,jd_1.txt,0.33,0.06,1.77,5,7,below-average,38.0
2,resume_3.pdf,jd_1.txt,0.31,0.03,1.62,0,12,poor,15.0
3,resume_4.pdf,jd_1.txt,0.38,0.06,1.84,7,5,solid,71.0
4,resume_5.pdf,jd_1.txt,0.33,0.05,1.35,0,12,poor,12.0


### Train-test split

In [3]:
# Columns kept out of the feature matrix: the target itself, the categorical
# label, and the two file-name identifier columns.
non_feature_columns = ["Score", "Category", "Resume", "Job_Description"]

# Regression target and feature matrix.
y = df["Score"]
X = df.drop(columns=non_feature_columns)

# Hold out 20% of the rows for evaluation; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Model Training

In [4]:
# Baseline random forest using scikit-learn's default hyperparameters.
# The forest is stochastic (bootstrap sampling, feature sub-sampling), so the
# seed is fixed -- using the same value as the train/test split -- to make the
# baseline metrics reproducible under Restart Kernel -> Run All.
model = RandomForestRegressor(random_state=42)

model.fit(X_train, y_train)

# Predictions on the held-out test rows, used for the baseline metrics.
y_pred = model.predict(X_test)

### Model Performance

In [5]:
# Evaluate the baseline predictions on the held-out set first, then report.
baseline_mae = mean_absolute_error(y_test, y_pred)
baseline_rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE = sqrt(MSE)
baseline_r2 = r2_score(y_test, y_pred)

print("MAE: ", baseline_mae)
print("RMSE: ", baseline_rmse)
print("R^2: ", baseline_r2)

MAE:  13.470499999999998
RMSE:  17.573730537367414
R^2:  0.573377899186363


## Hyperparameter Tuning

In [6]:
# Candidate values the randomized search draws from for each hyperparameter.
param_dist = dict(
    n_estimators=[100, 200, 300, 500],
    max_depth=[None, 10, 20, 30, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2", None],  # 'auto' is deliberately not offered
    bootstrap=[True, False],
)

# Seeded base estimator handed to the search.
rf = RandomForestRegressor(random_state=42)

# Search configuration, kept in one place so it is easy to tweak.
search_settings = {
    "n_iter": 100,                        # number of parameter settings sampled
    "cv": 5,                              # 5-fold cross-validation per setting
    "verbose": 2,
    "random_state": 42,
    "n_jobs": -1,                         # use all processors
    "scoring": "neg_mean_squared_error",  # alternatives: 'r2', 'neg_mean_absolute_error', ...
}

random_search = RandomizedSearchCV(rf, param_dist, **search_settings)
random_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated (negated) MSE.
for caption, found in (
    ("Best Parameters:", random_search.best_params_),
    ("Best Score (Neg MSE):", random_search.best_score_),
):
    print(caption, found)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': None, 'max_depth': 50, 'bootstrap': True}
Best Score (Neg MSE): -290.47585134442534


In [7]:
# Keep a handle on the estimator the randomized search selected.
best_model = random_search.best_estimator_

# Score the same held-out rows with the tuned model for a like-for-like comparison.
y_pred2 = best_model.predict(X=X_test)

In [8]:
# Same three metrics as the baseline, now computed on the tuned model's predictions.
tuned_metrics = [
    ("MAE: ", mean_absolute_error(y_test, y_pred2)),
    ("RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred2))),  # RMSE = sqrt(MSE)
    ("R^2: ", r2_score(y_test, y_pred2)),
]

for label, value in tuned_metrics:
    print(label, value)

MAE:  13.960670538448483
RMSE:  17.021006151779744
R^2:  0.5997918934412787


### Prediction of New Data

In [13]:
# Feature values for one hypothetical resume / job-description pair.
# Key order is kept as-is so the columns line up with what the model is given.
sample_features = {
    "Tfidf_Similarity": 0.4,
    "Jaccard_Similarity": 0.07,
    "Length_Ratio": 1,
    "No_of_Matched_Skills": 9,
    "No_of_Missing_Skills": 2,
}

# Single-row frame, then score it with the tuned model.
sample = pd.DataFrame([sample_features])
pred = best_model.predict(sample)

print("Prediction Score: ", pred[0])

Prediction Score:  81.44173515373517


### Saving the model

In [14]:
# Persist the tuned estimator -- the one used for the new-data prediction in
# this notebook -- rather than the untuned baseline `model`, so the saved
# artifact matches the model that was actually demonstrated.
joblib.dump(best_model, "regressor_model.pkl")

['regressor_model.pkl']