In [None]:
import numpy as np 
import pandas as pd
import os
from matplotlib import pyplot as plt

# Reading the data and storing it into a dictionary

In [None]:
root='/kaggle/input/resume-dataset/data/data/'
# Map every resume file path to its category, i.e. the name of the
# top-level folder under `root` that contains it.
data={}
# Sorted listings keep the insertion order independent of the order in which
# the filesystem happens to return directory entries.
for category in sorted(os.listdir(root)):
    # os.walk also descends into nested folders, so each path is built from
    # the directory actually being visited, not from the category root.
    for dirpath, _, filenames in os.walk(os.path.join(root, category)):
        for filename in sorted(filenames):
            data[os.path.join(dirpath, filename)]=category

# Converting data dictionary to pandas dataframe

In [None]:
# Turn the {path: category} mapping into a two-column frame:
# 'file' holds the path, 'type' holds the category label.
data = pd.DataFrame({
    'file': list(data.keys()),
    'type': list(data.values()),
})
data.head()

# Data Distribution Visualization

In [None]:
# Number of resumes per category, drawn as a semi-transparent red bar chart.
count_ = data['type'].value_counts()
count_.plot.bar(color='r', alpha=.5)

# Reading Data from PDF Files

In [None]:
! pip install tika
from tika import parser 

In [None]:
# Extract the text of every resume with Tika.
#   texts_cv : one string per row of `data` (newlines replaced by spaces)
#   empty_cv : row positions for which no usable text was extracted
texts_cv=[]
empty_cv=[]
for i, path in enumerate(data['file']):
    raw = parser.from_file(path)
    # Check for missing content explicitly instead of relying on a bare
    # `except:`, which would also hide unrelated errors and Ctrl-C.
    content = raw.get('content') if isinstance(raw, dict) else None
    if isinstance(content, str) and content.strip():
        texts_cv.append(content.replace('\n',' '))
    else:
        # No content, or whitespace only: keep a placeholder so the list
        # stays aligned with `data`, and remember the row as empty.
        texts_cv.append('')
        empty_cv.append(i)
        

In [None]:
# Show the extracted text of the first resume as a sanity check.
texts_cv[0] # sample cv text

In [None]:
# Number of resumes for which text extraction produced nothing.
len(empty_cv)

In [None]:
# Attach the extracted text as a 'text' column (texts_cv is aligned row-by-row with data).
data = data.assign(text=texts_cv)

# Dropping Empty CV

In [None]:
# Remove the rows whose index labels were recorded as empty during extraction.
data = data.drop(index=empty_cv)

# EDA on CV Content

In [None]:
# Character length of every remaining CV. Measured on data['text'] rather than
# on texts_cv, because texts_cv still holds the '' placeholders of the empty
# CVs that were dropped, and those zero lengths would skew the statistics.
cv_length=data['text'].str.len().tolist()

In [None]:
# Histogram of CV lengths, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.hist(cv_length)
ax.set_xlabel('cv sentence length')
ax.set_ylabel('occurrences')
ax.set_title('CV length')

In [None]:
import statistics

# Arithmetic mean and population standard deviation of the CV lengths.
n_docs = len(cv_length)
mean = sum(cv_length) / n_docs
std = statistics.pstdev(cv_length)
(mean, std)

# CV lengths outside the range mean ± 3·std are considered outliers

In [None]:
# Upper outlier bound of the CV length (mean + 3 standard deviations), truncated to an int.
# NOTE(review): this is a length bound, yet it is later passed as `max_features`
# to the vectorizers — confirm that reuse is intended.
upper_bound = mean + 3 * std
vocab_size = int(upper_bound)
vocab_size

# Data Splitting

In [None]:
def split_data(data,ratio):
    """Cut a sliceable sequence into a leading and a trailing part.

    The leading part holds the first ``int(len(data) * ratio)`` items; the
    trailing part holds the rest. Returns ``(leading, trailing)``.
    """
    cut = int(len(data) * ratio)
    leading, trailing = data[:cut], data[cut:]
    return leading, trailing

# Shuffle once with a fixed seed so the split is reproducible across runs.
data=data.sample(frac=1, random_state=42)
# Hold out the last 20% of the shuffled rows as the test set.
train,test=split_data(data,.8)
# Carve the validation set out of the remaining 80% only. Splitting `data`
# again here would put every test row back into `train` (train/test leakage).
# 12.5% of the 80% is 10% of all rows.
validation,train=split_data(train,.125)
# train:validation:test=70:10:20

## Making sure each subset has all the classes

In [None]:
# Print how many distinct classes appear in train, test and validation (in that order).
for subset in (train, test, validation):
    class_counts = subset["type"].value_counts()
    print(len(class_counts))

In [None]:
# Features are the raw CV texts; targets are the category labels.
X_train, X_test, X_validation = (subset['text'] for subset in (train, test, validation))
y_train, y_test, y_validation = (subset['type'] for subset in (train, test, validation))

# Traditional ML models on raw data

## First Try with TF-IDF vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer #Term Frequency Inverse Document Frequency
# TF-IDF over word 1- to 5-grams, capped at `vocab_size` features.
# The vocabulary and IDF weights are learned from the training texts only.
tfidf_options = {
    'ngram_range': (1, 5),
    'max_features': vocab_size,
}
vectorizer = TfidfVectorizer(**tfidf_options)
X_train_tfidf = vectorizer.fit_transform(X_train)

In [None]:
# Project the validation texts with the already-fitted vectorizer (no re-fitting).
X_validation_tfidf=vectorizer.transform(X_validation)

In [None]:
from sklearn import metrics
def score_prediction(model,X_train,X_test,y_train,y_test):
    """Fit `model` on the training data and score its predictions on the test data.

    Returns a tuple ``(accuracy, precision, recall, f1, mcc)``. Precision,
    recall and F1 use weighted averaging; MCC is the Matthews correlation
    coefficient.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    weighted = {"average": "weighted"}
    return (
        metrics.accuracy_score(y_test, predicted),
        metrics.precision_score(y_test, predicted, **weighted),
        metrics.recall_score(y_test, predicted, **weighted),
        metrics.f1_score(y_test, predicted, **weighted),
        metrics.matthews_corrcoef(y_test, predicted),
    )
# One independent {model name: score} table per metric, all starting empty.
acc_score, pre_score, recall_score, f1_score, mcc_score = ({} for _ in range(5))

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier,BaggingClassifier,GradientBoostingClassifier


# Candidate classifiers, keyed by the short name used in the score tables.
clfs = {
    'LR': LogisticRegression(penalty='l1',solver='liblinear'),
    'SVM': SVC(kernel='sigmoid',gamma=1.0),
    'DTC': DecisionTreeClassifier(max_depth=5),
    'KNN': KNeighborsClassifier(),
    'RFC': RandomForestClassifier(n_estimators=50,random_state=2),
    'ETC': ExtraTreesClassifier(n_estimators=50,random_state=2),
    'BG': BaggingClassifier(n_estimators=50,random_state=2),
    'GBC': GradientBoostingClassifier(n_estimators=50,random_state=2),
}
# Keep each estimator reachable under its own name as well (gbc is reused later).
lg, sv, dtc, knn, rfc, etc, bg, gbc = clfs.values()

# Fit every candidate on the training TF-IDF matrix and record its validation scores.
for name, clf in clfs.items():
    scores = score_prediction(clf, X_train_tfidf, X_validation_tfidf, y_train, y_validation)
    acc_score[name], pre_score[name], recall_score[name], f1_score[name], mcc_score[name] = scores

## Result Analysis

In [None]:
# Validation accuracy per classifier.
acc_score

In [None]:
# Weighted validation precision per classifier.
pre_score

In [None]:
# Weighted validation recall per classifier.
recall_score

In [None]:
# Weighted validation F1 per classifier.
f1_score

In [None]:
# Validation Matthews correlation coefficient per classifier.
mcc_score

In [None]:
# Bar chart of validation accuracy, with the models ordered alphabetically by name.
model_names, accuracies = zip(*sorted(acc_score.items()))

plt.bar(model_names, accuracies)
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.show()

In [None]:
# Bar chart of validation MCC, with the models ordered alphabetically by name.
model_names, mcc_values = zip(*sorted(mcc_score.items()))

plt.bar(model_names, mcc_values)
plt.xlabel('Model')
plt.ylabel('MCC')
plt.show()

## Gradient Boosting is the best ML model for this problem

# Experiment with data cleaning

## Delete Stop words

In [None]:
import nltk
from nltk.corpus import stopwords
 
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads from.
nltk.download('stopwords')

In [None]:
# Same TF-IDF setup as before, now ignoring NLTK's English stop words.
english_stop_words = stopwords.words('english')
vectorizer = TfidfVectorizer(
    ngram_range=(1, 5),
    max_features=vocab_size,
    stop_words=english_stop_words,
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_validation_tfidf = vectorizer.transform(X_validation)

## Since Gradient Boosting had the highest performance

In [None]:
# Re-fit gradient boosting on the stop-word-filtered TF-IDF features and
# show (accuracy, precision, recall, f1, mcc) on the validation set.
score_prediction(gbc,X_train_tfidf,X_validation_tfidf,y_train,y_validation)

## Deleting stop words improved the performance

# Keep Alphabetic Characters Only

In [None]:
# TF-IDF with stop-word removal, restricted to tokens made only of ASCII letters.
english_stop_words = stopwords.words('english')
alphabetic_token = r'(?u)\b[A-Za-z]+\b'
vectorizer = TfidfVectorizer(
    ngram_range=(1, 5),
    max_features=vocab_size,
    stop_words=english_stop_words,
    token_pattern=alphabetic_token,
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_validation_tfidf = vectorizer.transform(X_validation)

In [None]:
# Re-fit gradient boosting on the alphabetic-token TF-IDF features and
# show (accuracy, precision, recall, f1, mcc) on the validation set.
score_prediction(gbc,X_train_tfidf,X_validation_tfidf,y_train,y_validation)

## This made the performance worse

# Bag-of-words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Plain n-gram counts (bag-of-words) with the same n-gram range, feature cap and stop words.
english_stop_words = stopwords.words('english')
vectorizer = CountVectorizer(
    ngram_range=(1, 5),
    max_features=vocab_size,
    stop_words=english_stop_words,
)
X_train_bow = vectorizer.fit_transform(X_train)
X_validation_bow = vectorizer.transform(X_validation)

In [None]:
# Re-fit gradient boosting on the bag-of-words counts and
# show (accuracy, precision, recall, f1, mcc) on the validation set.
score_prediction(gbc,X_train_bow,X_validation_bow,y_train,y_validation)

## TF-IDF is slightly better than BoW

# Try Deep Learning

## We will continue with the best configuration, which is TF-IDF with stop-word removal

In [None]:
# Rebuild the best classical configuration: TF-IDF with stop-word removal.
vectorizer = TfidfVectorizer(ngram_range=(1,5),max_features=vocab_size,stop_words=stopwords.words('english'))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_validation_tfidf=vectorizer.transform(X_validation)

# The Keras models below take dense arrays.
X_train_tfidf_arr=X_train_tfidf.toarray()
X_validation_tfidf_arr=X_validation_tfidf.toarray()

# One-hot encode both label sets against the SAME sorted class list taken from
# the training labels. Encoding each set independently would produce fewer,
# shifted columns whenever the validation set lacks a class, so column k would
# no longer mean the same class in both matrices.
class_names = sorted(y_train.unique())
y_train_ohe= pd.get_dummies(pd.Categorical(y_train, categories=class_names)).values
y_validation_ohe= pd.get_dummies(pd.Categorical(y_validation, categories=class_names)).values

In [None]:
# Network input width = number of TF-IDF features.
input_sp=X_train_tfidf_arr.shape[1]
# Network output width = number of classes in the TRAINING targets. The model
# is fit on y_train_ohe, so its width (not the validation encoding's) decides
# the size of the softmax layer.
out_sp=y_train_ohe.shape[1]

## Multi-layer Perceptron

In [None]:
from tensorflow.keras.datasets import mnist
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense,LSTM
from sklearn import metrics
# Fully connected network: six ReLU layers that halve in width (2048 down to 64),
# followed by a softmax over the classes.
model = Sequential([
    Dense(2048, input_shape=(input_sp,), activation='relu'),
    Dense(1024, activation='relu'),
    Dense(512, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(out_sp, activation='softmax'),
])

In [None]:
# Compile with categorical cross-entropy (optimizer left at the Keras default)
# and train for 50 epochs on the dense TF-IDF features.
model.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
history = model.fit(
    X_train_tfidf_arr,
    y_train_ohe,
    epochs=50,
    batch_size=128,
    verbose=2,
)

In [None]:
# Plot the per-epoch training accuracy and loss side by side.
epochs = list(range(len(history.history['accuracy'])))
fig , ax = plt.subplots(1,2)
train_acc = history.history['accuracy']
train_loss = history.history['loss']


fig.set_size_inches(20,10)

# The format string sets only marker and line style; the colour comes solely
# from `color`. Giving a colour in both places ('go-' plus color='r') makes
# matplotlib warn that the colour is redundantly defined.
ax[0].plot(epochs , train_acc , 'o-' ,color='r', label = 'Training Accuracy')
ax[0].set_title('Training Accuracy')
ax[0].legend()
ax[0].set_xlabel("Epochs")
ax[0].set_ylabel("Accuracy")


ax[1].plot(epochs , train_loss , '-o' ,color='r', label = 'Training Loss')
ax[1].set_title('Training Loss')
ax[1].legend()
ax[1].set_xlabel("Epochs")
ax[1].set_ylabel("Training Loss")
plt.show()

In [None]:
# Predicted class = most probable column; true class = hot column of the one-hot target.
proba = model.predict(X_validation_tfidf_arr)
pr = proba.argmax(axis=1)
true = y_validation_ohe.argmax(axis=1)

# Validation metrics; precision, recall and F1 are weighted averages.
weighted = {"average": "weighted"}
acc_score = metrics.accuracy_score(true, pr)
pre_score = metrics.precision_score(true, pr, **weighted)
recall = metrics.recall_score(true, pr, **weighted)
f1 = metrics.f1_score(true, pr, **weighted)
mcc = metrics.matthews_corrcoef(true, pr)
(acc_score, pre_score, recall, f1, mcc)

## Model overfitted. Adding Dropout

In [None]:
from tensorflow.keras.layers import Dropout
# Same dense stack as before, with dropout after the first five hidden layers.
# The dropout rate falls from 0.5 to 0.1 as the layers get narrower.
model = Sequential([
    Dense(2048, input_shape=(input_sp,), activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.4),
    Dense(512, activation='relu'),
    Dropout(.3),
    Dense(256, activation='relu'),
    Dropout(.2),
    Dense(128, activation='relu'),
    Dropout(.1),
    Dense(64, activation='relu'),
    Dense(out_sp, activation='softmax'),
])

In [None]:
# Compile with categorical cross-entropy (optimizer left at the Keras default)
# and train the dropout network for 50 epochs.
model.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
history = model.fit(
    X_train_tfidf_arr,
    y_train_ohe,
    epochs=50,
    batch_size=128,
    verbose=2,
)

In [None]:
# Predicted class = most probable column; true class = hot column of the one-hot target.
proba = model.predict(X_validation_tfidf_arr)
pr = proba.argmax(axis=1)
true = y_validation_ohe.argmax(axis=1)

# Validation metrics; precision, recall and F1 are weighted averages.
weighted = {"average": "weighted"}
acc_score = metrics.accuracy_score(true, pr)
pre_score = metrics.precision_score(true, pr, **weighted)
recall = metrics.recall_score(true, pr, **weighted)
f1 = metrics.f1_score(true, pr, **weighted)
mcc = metrics.matthews_corrcoef(true, pr)
(acc_score, pre_score, recall, f1, mcc)

## MLP Didn't work

# Bi-LSTM

In [None]:
# Functional API
from tensorflow.keras import models, layers, optimizers
input_ = layers.Input(shape=(input_sp,), name='input')
# Reshape the flat TF-IDF vector to (input_sp, 1) so the LSTM receives a
# sequence of input_sp steps with one value each.
sequence = layers.Reshape((input_sp, 1), input_shape=(input_sp,))(input_)
encoded = layers.Bidirectional(
    layers.LSTM(15, dropout=0.2, return_sequences=False),
    name='bidirectional-lstm',
)(sequence)
regularized = layers.Dropout(0.2, name='dropout')(encoded)
hidden = layers.Dense(64, activation='relu', name='dense')(regularized)
output = layers.Dense(out_sp, activation='softmax', name='classification')(hidden)

model = models.Model(input_, output)

# Bi-LSTMs are slow to train, so a large number of epochs is not affordable;
# a higher learning rate is used to converge faster.
opt = optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the current `model` for 50 epochs (batch size 128) on the dense TF-IDF features.
history= model.fit(X_train_tfidf_arr, y_train_ohe, epochs=50, batch_size=128,verbose=2)

In [None]:
# Predicted class = most probable column; true class = hot column of the one-hot target.
proba = model.predict(X_validation_tfidf_arr)
pr = proba.argmax(axis=1)
true = y_validation_ohe.argmax(axis=1)

# Validation metrics; precision, recall and F1 are weighted averages.
weighted = {"average": "weighted"}
acc_score = metrics.accuracy_score(true, pr)
pre_score = metrics.precision_score(true, pr, **weighted)
recall = metrics.recall_score(true, pr, **weighted)
f1 = metrics.f1_score(true, pr, **weighted)
mcc = metrics.matthews_corrcoef(true, pr)
(acc_score, pre_score, recall, f1, mcc)

# Transformers

In [None]:
import transformers
from datasets import Dataset
from transformers import AutoTokenizer
# Checkpoint name passed to AutoTokenizer / AutoModelForSequenceClassification below.
model_name = "distilbert-base-uncased"
# Wrap the pandas train/validation frames as `datasets.Dataset` objects.
train_dataset = Dataset.from_pandas(train)
valid_dataset = Dataset.from_pandas(validation)

In [None]:
# Tokenizer matching the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_function(examples):
    """Tokenize the 'text' field of a batch of examples, with truncation enabled."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [None]:
from transformers import AutoModelForSequenceClassification
# Sequence-classification head sized to the number of distinct labels in the training split.
n_classes = train['type'].nunique(dropna=False)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=n_classes)

In [None]:
! pip install evaluate
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np

In [None]:
# Collator built from the tokenizer; it is handed to the Trainer below as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Accuracy metric loaded through the `evaluate` library.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into the accuracy metric's result.

    The predicted class of each row is the argmax over the last axis of the logits.
    """
    logits, references = eval_pred
    predicted = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predicted, references=references)

In [None]:
def preprocess_function(examples):
    """Tokenize the 'text' field of a batch of examples, with truncation enabled."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


# Apply the tokenizer batch-wise to both splits.
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_valid = valid_dataset.map(preprocess_function, batched=True)

In [None]:
# Class names in sorted order; a label's position in this list is its integer id.
# A bare list(set(...)) has no stable order for strings across kernel restarts,
# so the ids learned by the saved model would not be reproducible.
labels = sorted(set(train['type'].to_list()))
label_count = len(labels)

In [None]:
def categorize(x):
  """Add a "labels" entry mapping each 'type' in the batch to its position in `labels`."""
  label_ids = list(map(labels.index, x['type']))
  return {"labels": label_ids}

In [None]:
# Add the integer "labels" column to the tokenized training split, then display the dataset.
categorized_train = tokenized_train.map(categorize, batched=True)
categorized_train

In [None]:
# Add the integer "labels" column to the tokenized validation split, then display the dataset.
categorized_valid = tokenized_valid.map(categorize, batched=True)
categorized_valid

In [None]:
# Fine-tuning configuration: 5 epochs, batch size 8 per device for both
# training and evaluation, evaluating and logging once per epoch.
training_args = TrainingArguments(
    output_dir="/kaggle/working/",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    # NOTE(review): newer transformers releases appear to rename this argument
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy = "epoch",
    logging_strategy="epoch",
)

# Trainer wiring: label-encoded train/validation datasets, the padding
# collator, and the accuracy callback defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=categorized_train,
    eval_dataset=categorized_valid,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
    
)


## Test with freezing layers (if needed)

In [None]:
# Freeze the first 100 parameter tensors, in named_parameters() order, so that
# only the remaining ones receive gradient updates.
for position, (name, param) in enumerate(model.named_parameters()):
    if position >= 100:
        break
    param.requires_grad = False

In [None]:
# Run fine-tuning with the configuration given to the Trainer.
trainer.train()

In [None]:
# Run the fine-tuned model over the validation split; the result's
# .predictions and .label_ids fields are read in the following cells.
predictions = trainer.predict(categorized_valid)

In [None]:
# Predicted class id = argmax over the last axis of the model outputs.
pr = np.argmax(predictions.predictions, axis=-1)

In [None]:
# Validation metrics for the fine-tuned transformer; precision, recall and F1
# are weighted averages.
true = predictions.label_ids
weighted = {"average": "weighted"}
acc_score = metrics.accuracy_score(true, pr)
pre_score = metrics.precision_score(true, pr, **weighted)
recall = metrics.recall_score(true, pr, **weighted)
f1 = metrics.f1_score(true, pr, **weighted)
mcc = metrics.matthews_corrcoef(true, pr)
(acc_score, pre_score, recall, f1, mcc)

In [None]:
# Save the fine-tuned model under 'cv-classifier' (a path relative to the working directory).
trainer.save_model('cv-classifier')

# Best Option is to use Gradient Boosting with TF-IDF vectorizer with stop-words removal

In [None]:
# Final feature pipeline: TF-IDF with stop-word removal, fitted on the training
# texts and then applied unchanged to the validation and test texts.
english_stop_words = stopwords.words('english')
vectorizer = TfidfVectorizer(
    ngram_range=(1, 5),
    max_features=vocab_size,
    stop_words=english_stop_words,
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_validation_tfidf, X_test_ifidf = (vectorizer.transform(texts) for texts in (X_validation, X_test))

## Saving the datasets

In [None]:
# Write each split to its own CSV file, without the index column.
for csv_name, frame in (
    ('train.csv', train),
    ('test.csv', test),
    ('validation.csv', validation),
):
    frame.to_csv(csv_name, index=False)

# Final Evaluation of the best model on test dataset

In [None]:
# Fresh gradient-boosting model with the same hyper-parameters as in the
# comparison, fitted on the training TF-IDF features.
gbc = GradientBoostingClassifier(n_estimators=50,random_state=2)
gbc.fit(X_train_tfidf,y_train)

In [None]:
# Score the final model on the held-out test set; precision, recall and F1
# are weighted averages.
pr = gbc.predict(X_test_ifidf)
weighted = {"average": "weighted"}
acc_score = metrics.accuracy_score(y_test, pr)
pre_score = metrics.precision_score(y_test, pr, **weighted)
recall = metrics.recall_score(y_test, pr, **weighted)
f1 = metrics.f1_score(y_test, pr, **weighted)
mcc = metrics.matthews_corrcoef(y_test, pr)
print(f"accuracy: {acc_score} precision: {pre_score} recall: {recall} f1-score: {f1} mcc: {mcc}")

# Saving the model

In [None]:
import pickle

# Serialize the fitted classifier. The context manager guarantees the file is
# flushed and closed even if pickling fails.
# NOTE(review): only the classifier is saved here; predicting on new text also
# needs the fitted `vectorizer` — confirm whether it should be persisted too.
with open('gradient_boosting.sav', 'wb') as model_file:
    pickle.dump(gbc, model_file)

### If needed, load the model this way:

In [None]:
filename='/kaggle/working/gradient_boosting.sav'
# SECURITY: unpickling can execute arbitrary code, so only load files that
# were produced by this notebook or come from a trusted source.
# The context manager closes the file handle once loading is done.
with open(filename, 'rb') as model_file:
    gbc_load=pickle.load(model_file)

### Note that results might vary by a small margin due to the randomization involved