In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

### Dataset

In [3]:
# Read the resume / job-description dataset from the project's data directory.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved index; consider index_col=0 if nothing downstream needs it — confirm.
df = pd.read_csv(filepath_or_buffer="../data/dataset.csv")

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Resume,Job_Description,Tfidf_Similarity,Bert_Similarity,No_of_Matched_Skills,No_of_Missing_Skills,Category,Score
0,resume_1.pdf,jd_1.txt,0.37,0.33,7,5,solid,76.0
1,resume_2.pdf,jd_1.txt,0.33,0.3,5,7,below-average,38.0
2,resume_3.pdf,jd_1.txt,0.31,0.11,0,12,poor,15.0
3,resume_4.pdf,jd_1.txt,0.38,0.47,7,5,solid,71.0
4,resume_5.pdf,jd_1.txt,0.33,0.25,0,12,poor,12.0


In [4]:
# Number of rows per Category label, to check how balanced the classes are.
df.groupby(by="Category").size()

Category
average          35
below-average    37
perfect          31
poor             36
solid            61
dtype: int64

### Preprocessing

In [5]:
# Name of the label column and of the columns fed to the model as predictors.
# NOTE(review): 'Score' is used as a predictor; if Category is derived from
# Score this leaks the label into the features — confirm.
target = 'Category'
features = [
    'Tfidf_Similarity',
    'Bert_Similarity',
    'No_of_Matched_Skills',
    'No_of_Missing_Skills',
    'Score',
]

y = df[target]
X = df[features]

# Convert the string category labels into integer class ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for evaluation. Stratifying on the encoded labels
# keeps the class proportions similar in both splits, and the fixed seed makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

### Model Training

In [6]:
# Fit a random forest with default hyperparameters on the training split.
# fit() returns the estimator itself, so the call can be chained; the fixed
# seed keeps the fitted forest repeatable across runs.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predicted (integer-encoded) classes for the held-out rows.
y_pred = model.predict(X_test)

### Model Performance

In [7]:
# Translate the integer class ids back into the original category names so the
# report and the matrix are readable.
y_test_labels, y_pred_labels = map(label_encoder.inverse_transform, (y_test, y_pred))

# Per-class precision / recall / F1, followed by the raw confusion matrix
# (rows = true class, columns = predicted class, classes in sorted label order).
print(classification_report(y_true=y_test_labels, y_pred=y_pred_labels))
print(confusion_matrix(y_true=y_test_labels, y_pred=y_pred_labels))

               precision    recall  f1-score   support

      average       1.00      0.86      0.92         7
below-average       1.00      1.00      1.00         8
      perfect       1.00      1.00      1.00         6
         poor       1.00      1.00      1.00         7
        solid       0.92      1.00      0.96        12

     accuracy                           0.97        40
    macro avg       0.98      0.97      0.98        40
 weighted avg       0.98      0.97      0.97        40

[[ 6  0  0  0  1]
 [ 0  8  0  0  0]
 [ 0  0  6  0  0]
 [ 0  0  0  7  0]
 [ 0  0  0  0 12]]


### Prediction on New Data

In [8]:
# A single hand-written example with one value for each feature column the
# model was trained on, in the same column order.
sample = pd.DataFrame({
    'Tfidf_Similarity': [0.35],
    'Bert_Similarity': [0.30],
    'No_of_Matched_Skills': [7],
    'No_of_Missing_Skills': [0],
    'Score': [80],
})

# Predict the encoded class, then map it back to its category name.
pred = model.predict(sample)
pred_label = label_encoder.inverse_transform(pred)
print("Prediction category: ", pred_label[0])

Prediction category:  solid


### Saving the model

In [17]:
# Persist the fitted classifier and the label encoder side by side in the
# current working directory. The encoder is saved too because it is what maps
# the classifier's integer predictions back to category names.
joblib.dump(value=model, filename="classifier_model.pkl")
joblib.dump(value=label_encoder, filename="encoder.pkl")

['encoder.pkl']