In [1]:
import pandas as pd

import joblib

from sklearn.metrics import accuracy_score, classification_report

from preprocess import preprocess, get_vectorizer

In [2]:
resume_dataset = pd.read_csv('Labelled_Resume_Dataset.csv')
resume_dataset

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \nMay 2013 to May 2017 B.E ...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \n MCA YMCAUST, Faridabad..."
...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...
958,Testing,â Willingness to accept the challenges. â ...
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne..."
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...


In [3]:
X = resume_dataset['Resume']
y = resume_dataset['Category']
preprocessed_resumes = preprocess(X)    # Preprocessing the resumes

vectorizer = get_vectorizer()               
X = vectorizer.fit_transform(preprocessed_resumes)  # vectorizing the resumes
X = X.toarray()

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80, random_state=10)

In [5]:
from sklearn.ensemble import RandomForestClassifier

random_forest_classifier = RandomForestClassifier()
random_forest_classifier.fit(X_train, y_train)

RandomForestClassifier()

In [6]:
from sklearn.svm import LinearSVC

linear_SVC = LinearSVC()
linear_SVC.fit(X_train,y_train)

LinearSVC()

In [7]:
from sklearn.linear_model import LogisticRegression

log_regressor = LogisticRegression()
log_regressor.fit(X_train,y_train)

LogisticRegression()

In [8]:
accuracies = dict()

In [9]:
random_forest_classifier_y_pred = random_forest_classifier.predict(X_test)

report = classification_report(y_test, random_forest_classifier_y_pred)
accuracy = accuracy_score(y_test, random_forest_classifier_y_pred)

# saving the model and accuracy to a dict
accuracies['random_forest'] = {'accuracy':accuracy, 'model': random_forest_classifier}
print(f'Random Fores: \n{report}')

Random Fores: 
                           precision    recall  f1-score   support

                 Advocate       1.00      0.62      0.77        16
                     Arts       1.00      1.00      1.00        30
       Automation Testing       0.94      0.89      0.92        19
               Blockchain       1.00      1.00      1.00        32
         Business Analyst       0.83      1.00      0.90        19
           Civil Engineer       1.00      0.13      0.23        23
             Data Science       1.00      0.67      0.80        36
                 Database       0.87      1.00      0.93        26
          DevOps Engineer       1.00      0.91      0.95        44
         DotNet Developer       1.00      0.83      0.91        24
            ETL Developer       1.00      1.00      1.00        31
   Electrical Engineering       1.00      0.76      0.86        25
                       HR       0.44      1.00      0.61        29
                   Hadoop       0.88      1.00

In [10]:
linear_SVC_y_pred = linear_SVC.predict(X_test)

report = classification_report(y_test, linear_SVC_y_pred)
accuracy = accuracy_score(y_test, linear_SVC_y_pred)

# saving the model and accuracy to a dict
accuracies['linear_SVC'] = {'accuracy':accuracy, 'model': linear_SVC}
print(f'Linear SVC: \n{report}')

Linear SVC: 
                           precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00        16
                     Arts       1.00      0.60      0.75        30
       Automation Testing       0.61      0.89      0.72        19
               Blockchain       1.00      1.00      1.00        32
         Business Analyst       1.00      1.00      1.00        19
           Civil Engineer       1.00      0.83      0.90        23
             Data Science       0.82      1.00      0.90        36
                 Database       1.00      1.00      1.00        26
          DevOps Engineer       1.00      0.91      0.95        44
         DotNet Developer       0.71      1.00      0.83        24
            ETL Developer       1.00      1.00      1.00        31
   Electrical Engineering       0.93      1.00      0.96        25
                       HR       1.00      0.86      0.93        29
                   Hadoop       1.00      1.00  

In [11]:
log_regressor_y_pred = log_regressor.predict(X_test)

report = classification_report(y_test, log_regressor_y_pred)
accuracy = accuracy_score(y_test, log_regressor_y_pred)

# saving the model and accuracy to a dict
accuracies['log_regressor'] = {'accuracy':accuracy, 'model': log_regressor}
print(f'Logistic Regression: \n{report}')

Logistic Regression: 
                           precision    recall  f1-score   support

                 Advocate       1.00      0.12      0.22        16
                     Arts       1.00      0.40      0.57        30
       Automation Testing       1.00      0.89      0.94        19
               Blockchain       1.00      1.00      1.00        32
         Business Analyst       1.00      0.89      0.94        19
           Civil Engineer       0.00      0.00      0.00        23
             Data Science       1.00      0.17      0.29        36
                 Database       1.00      1.00      1.00        26
          DevOps Engineer       1.00      0.91      0.95        44
         DotNet Developer       1.00      0.50      0.67        24
            ETL Developer       1.00      1.00      1.00        31
   Electrical Engineering       1.00      0.76      0.86        25
                       HR       0.64      0.86      0.74        29
                   Hadoop       0.72   

In [12]:
for k in accuracies.keys():
    print(f'{k} has an accuracy of {accuracies[k].get("accuracy")}')

random_forest has an accuracy of 0.8792207792207792
linear_SVC has an accuracy of 0.9311688311688312
log_regressor has an accuracy of 0.712987012987013


In [13]:
max_accuracy_model = max(accuracies, key=lambda x: accuracies.get(x)['accuracy'])   # Getting max accuracy
joblib.dump(accuracies[max_accuracy_model]['model'], 'model.joblib')                # saving the model with max accuracy    
joblib.dump(vectorizer, 'vectorizer.joblib')                                        # saving the vectorizer

['vectorizer.joblib']