In [1]:
import joblib
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

### Dataset

In [2]:
# Load the resume / job-description dataset from a path relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/dataset.csv")

# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,Resume,Job_Description,Tfidf_Similarity,Jaccard_Similarity,Length_Ratio,No_of_Matched_Skills,No_of_Missing_Skills,Category,Score
0,resume_1.pdf,jd_1.txt,0.37,0.06,2.19,8,4,solid,76.0
1,resume_2.pdf,jd_1.txt,0.33,0.06,1.77,5,7,below-average,38.0
2,resume_3.pdf,jd_1.txt,0.31,0.03,1.62,0,12,poor,15.0
3,resume_4.pdf,jd_1.txt,0.38,0.06,1.84,7,5,solid,71.0
4,resume_5.pdf,jd_1.txt,0.33,0.05,1.35,0,12,poor,12.0


In [3]:
# Class-balance check: number of rows for each Category label.
df.groupby(by="Category").size()

Category
average          19
below-average    16
perfect          11
poor             21
solid            33
dtype: int64

### Preprocessing

In [4]:
# Label column and the numeric columns used as model inputs.
# NOTE(review): 'Score' is used as an input feature; in the preview rows it
# appears to track 'Category' closely — confirm this is not label leakage.
target = 'Category'
features = [
    'Tfidf_Similarity',
    'Jaccard_Similarity',
    'Length_Ratio',
    'No_of_Matched_Skills',
    'No_of_Missing_Skills',
    'Score',
]

X = df.loc[:, features]
y = df.loc[:, target]

# Encode the string categories as integers; the fitted encoder is kept so
# predictions can be mapped back to category names later.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for evaluation. Stratifying on the encoded label
# keeps the class proportions similar in both splits, and the fixed seed makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

### Model Training

In [5]:
# Fit a random forest with default hyperparameters on the training split;
# the fixed seed makes the fitted model repeatable across runs.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Integer-encoded class predictions for the held-out split.
y_pred = model.predict(X_test)

### Model Performance

In [6]:
# Map the integer-encoded classes back to their original category names so
# the report below is readable.
y_test_labels, y_pred_labels = (
    label_encoder.inverse_transform(encoded) for encoded in (y_test, y_pred)
)

# Per-class precision / recall / F1, followed by the raw confusion matrix
# (rows are true classes, columns are predicted classes, both in sorted
# label order).
print(classification_report(y_true=y_test_labels, y_pred=y_pred_labels))
print(confusion_matrix(y_true=y_test_labels, y_pred=y_pred_labels))

               precision    recall  f1-score   support

      average       1.00      1.00      1.00         4
below-average       1.00      1.00      1.00         3
      perfect       1.00      0.50      0.67         2
         poor       1.00      1.00      1.00         4
        solid       0.88      1.00      0.93         7

     accuracy                           0.95        20
    macro avg       0.97      0.90      0.92        20
 weighted avg       0.96      0.95      0.94        20

[[4 0 0 0 0]
 [0 3 0 0 0]
 [0 0 1 0 1]
 [0 0 0 4 0]
 [0 0 0 0 7]]


### Prediction on New Data

In [10]:
# A single hand-written observation, using the same column names and order
# as the training features.
sample = pd.DataFrame({
    'Tfidf_Similarity': [0.4],
    'Jaccard_Similarity': [0.07],
    'Length_Ratio': [1],
    'No_of_Matched_Skills': [9],
    'No_of_Missing_Skills': [2],
    'Score': [81.09],
})

# Predict the encoded class, then translate it back to its category name.
pred = model.predict(sample)
pred_label = label_encoder.inverse_transform(pred)
print("Prediction category: ", pred_label[0])

Prediction category:  solid


### Saving the model

In [11]:
# Persist the fitted classifier and its label encoder side by side; both are
# needed to turn a future prediction back into a category name.
# Files are written to the current working directory. joblib files are
# pickle-based, so only load them from trusted sources.
joblib.dump(value=model, filename="classifier_model.pkl")
joblib.dump(value=label_encoder, filename="encoder.pkl")

['encoder.pkl']