In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import joblib

### Dataset

In [13]:
# Read the dataset CSV from ../data and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../data/dataset.csv")
df.head(n=5)

Unnamed: 0,Resume,Job_Description,Tfidf_Similarity,Bert_Similarity,No_of_Matched_Skills,No_of_Missing_Skills,Category,Score
0,resume_1.pdf,jd_1.txt,0.37,0.33,7,5,solid,76.0
1,resume_2.pdf,jd_1.txt,0.33,0.3,5,7,below-average,38.0
2,resume_3.pdf,jd_1.txt,0.31,0.11,0,12,poor,15.0
3,resume_4.pdf,jd_1.txt,0.38,0.47,7,5,solid,71.0
4,resume_5.pdf,jd_1.txt,0.33,0.25,0,12,poor,12.0


### Train test split

In [14]:
# Select the model inputs explicitly instead of dropping the known non-feature
# columns. With a drop-list, any extra column present in the CSV (for example a
# saved index such as "Unnamed: 0") would silently become a feature, and the
# fitted model would then require it at prediction time.
feature_columns = [
  "Tfidf_Similarity",
  "Bert_Similarity",
  "No_of_Matched_Skills",
  "No_of_Missing_Skills",
]
X = df[feature_columns]

# Regression target: the numeric Score column.
y = df["Score"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Model Training

In [15]:
# Random-forest hyperparameters, gathered in one mapping so they are easy to scan and tweak.
forest_params = {
  "n_estimators": 300,
  "max_depth": 10,
  "max_features": 'sqrt',
  "min_samples_leaf": 2,
  "min_samples_split": 2,
  "bootstrap": True,
  "random_state": 42,  # fixed seed so repeated runs build the same forest
}

# Build the regressor and fit it on the training split.
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

# Predictions for the held-out rows, kept for evaluation.
y_pred = model.predict(X_test)

### Model Performance

In [16]:
# Evaluate the held-out predictions: mean absolute error, root of the mean
# squared error, and the coefficient of determination.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

for label, value in (("MAE: ", mae), ("RMSE: ", rmse), ("R^2: ", r2)):
  print(label, value)

MAE:  12.789666411135162
RMSE:  16.926376816546885
R^2:  0.5945882091735402


### Prediction of New Data

In [19]:
# Feature values for one example resume / job-description pair, as a one-row frame.
sample = pd.DataFrame([{
  'Tfidf_Similarity': 0.35,
  'Bert_Similarity': 0.30,
  'No_of_Matched_Skills': 7,
  'No_of_Missing_Skills': 0,
}])

# `model` is a regressor fitted on the numeric Score column, so the prediction is
# a score value (one per input row), not a Category label; the printed label says so.
pred = model.predict(sample)
print("Predicted score: ", pred[0])

Prediction category:  80.84658730158732


### Saving the model

In [20]:
# Persist the fitted regressor to disk. The call is left as the cell's last
# expression so its return value (the list of written file names) is displayed.
joblib.dump(value=model, filename="regressor_model.pkl")

['regressor_model.pkl']