In [72]:
import pandas as pd 
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import metrics
import torch
import re
from torch.utils.data import Dataset,DataLoader,RandomSampler,SequentialSampler

### 1. Data Preprocessing 

In [73]:
from torch import cuda
device='cuda' if cuda.is_available() else 'cpu' 

In [74]:
resumes = pd.read_csv("Resume.csv")

In [75]:
resumes.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [76]:
# Drop the columns the model never uses. Rebinding (instead of inplace=True)
# plus errors="ignore" keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError on the already-removed columns.
resumes = resumes.drop(columns=["ID", "Resume_html"], errors="ignore")

In [77]:
one_hot_encoded_category = pd.get_dummies(resumes['Category'])
one_hot_encoded_category  

Unnamed: 0,ACCOUNTANT,ADVOCATE,AGRICULTURE,APPAREL,ARTS,AUTOMOBILE,AVIATION,BANKING,BPO,BUSINESS-DEVELOPMENT,...,DIGITAL-MEDIA,ENGINEERING,FINANCE,FITNESS,HEALTHCARE,HR,INFORMATION-TECHNOLOGY,PUBLIC-RELATIONS,SALES,TEACHER
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2479,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2480,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2481,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2482,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [79]:
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english') 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [80]:
def basic_preprocessing(text):
  """Lower-case ``text`` and strip @handles and punctuation.

  Args:
    text: Raw string to clean.

  Returns:
    The lower-cased string with every ``@word`` token removed and every
    character that is neither a word character nor whitespace deleted.
    Whitespace is left untouched, so removed tokens can leave double spaces.
  """
  text = text.lower()
  # Handles must be removed BEFORE punctuation is stripped: once '@' has been
  # deleted by the punctuation pass, the '@\w+' pattern can never match.
  text = re.sub(r'@\w+', '', text)
  text = re.sub(r'[^\w\s]', '', text)
  return text

In [82]:
# Normalise every resume, then drop English stop words.
resumes['resume_cleaned'] = resumes['Resume_str'].apply(basic_preprocessing)
# A set gives O(1) membership tests; the NLTK stop-word list would otherwise be
# scanned linearly for every single token of every resume.
stop_words = set(stop)
resumes['resume_cleaned'] = resumes['resume_cleaned'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_words)
)

In [None]:
import transformers
from transformers import BertTokenizer,BertModel,BertConfig 

In [None]:
# Hyperparameters and the tokenizer shared by the cells below.
MAX_LEN = 512  # handed to CustomDataset, which uses it as the tokenizer's max_length
TRAIN_BATCH_SIZE = 8  # batch size of the training DataLoader
VALID_BATCH_SIZE = 4  # batch size of the test DataLoader
EPOCHS = 10  # intended number of training epochs
LEARNING_RATE = 1e-05  # learning rate given to the Adam optimizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
class CustomDataset(Dataset):
    """Torch dataset that tokenizes cleaned resumes on the fly.

    The dataframe must provide a ``resume_cleaned`` column (text) and a
    ``list`` column (one label vector per row).

    Args:
        dataframe: Source frame with the two columns described above.
        tokenizer: Object exposing ``encode_plus`` and returning a mapping with
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: Length every sequence is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.resume = dataframe["resume_cleaned"]
        # Bracket access instead of ``dataframe.list``: attribute access to a
        # column silently depends on the name not colliding with a DataFrame
        # attribute.
        self.targets = dataframe["list"]
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.resume)

    def __getitem__(self, index):
        """Return the tokenized resume and label vector at position ``index``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors)
            and ``targets`` (float tensor).
        """
        # Positional lookup: the sampler yields positions 0..len-1, so .iloc
        # works even when the frame's index was not reset to a RangeIndex.
        resume = " ".join(str(self.resume.iloc[index]).split())

        inputs = self.tokenizer.encode_plus(
            resume,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            return_attention_mask=True,
            truncation=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
resumes.head(5)

Unnamed: 0,Resume_str,Category,resume_cleaned
0,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,HR,hr administratormarketing associate hr adminis...
1,"HR SPECIALIST, US HR OPERATIONS ...",HR,hr specialist us hr operations summary versati...
2,HR DIRECTOR Summary Over 2...,HR,hr director summary 20 years experience recrui...
3,HR SPECIALIST Summary Dedica...,HR,hr specialist summary dedicated driven dynamic...
4,HR MANAGER Skill Highlights ...,HR,hr manager skill highlights hr skills hr depar...


In [None]:
new_df = pd.concat([resumes, one_hot_encoded_category], axis=1, join='inner')
display(new_df) 

Unnamed: 0,Resume_str,Category,resume_cleaned,ACCOUNTANT,ADVOCATE,AGRICULTURE,APPAREL,ARTS,AUTOMOBILE,AVIATION,...,DIGITAL-MEDIA,ENGINEERING,FINANCE,FITNESS,HEALTHCARE,HR,INFORMATION-TECHNOLOGY,PUBLIC-RELATIONS,SALES,TEACHER
0,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,HR,hr administratormarketing associate hr adminis...,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
1,"HR SPECIALIST, US HR OPERATIONS ...",HR,hr specialist us hr operations summary versati...,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2,HR DIRECTOR Summary Over 2...,HR,hr director summary 20 years experience recrui...,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
3,HR SPECIALIST Summary Dedica...,HR,hr specialist summary dedicated driven dynamic...,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
4,HR MANAGER Skill Highlights ...,HR,hr manager skill highlights hr skills hr depar...,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2479,RANK: SGT/E-5 NON- COMMISSIONED OFFIC...,AVIATION,rank sgte5 non commissioned officer charge bri...,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2480,"GOVERNMENT RELATIONS, COMMUNICATIONS ...",AVIATION,government relations communications organizati...,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2481,GEEK SQUAD AGENT Professional...,AVIATION,geek squad agent professional profile support ...,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2482,PROGRAM DIRECTOR / OFFICE MANAGER ...,AVIATION,program director office manager summary highly...,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Collect each row's one-hot label vector into a single 'list' column.
# Columns are selected by name (the one-hot frame's columns) rather than by
# position, so the result stays correct if this cell is re-run after 'list'
# has been added or after other columns have been dropped.
new_df['list'] = new_df[one_hot_encoded_category.columns].values.tolist()

In [None]:
# The raw text is no longer needed once 'resume_cleaned' exists. Rebinding
# with errors="ignore" makes the cell idempotent (no KeyError on re-run).
new_df = new_df.drop(columns=["Resume_str"], errors="ignore")

In [None]:
new_df['list']

0       [False, False, False, False, False, False, Fal...
1       [False, False, False, False, False, False, Fal...
2       [False, False, False, False, False, False, Fal...
3       [False, False, False, False, False, False, Fal...
4       [False, False, False, False, False, False, Fal...
                              ...                        
2479    [False, False, False, False, False, False, Tru...
2480    [False, False, False, False, False, False, Tru...
2481    [False, False, False, False, False, False, Tru...
2482    [False, False, False, False, False, False, Tru...
2483    [False, False, False, False, False, False, Tru...
Name: list, Length: 2484, dtype: object

### 2. Model Training

In [None]:
# Random 80/20 split with a fixed seed; both partitions get a fresh 0..n-1 index.
train_size = 0.8
train_dataset = new_df.sample(frac=train_size, random_state=200)
test_dataset = new_df.drop(index=train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

# Report the shapes of the full frame and of each partition.
print(
    "FULL Dataset: {}".format(new_df.shape),
    "TRAIN Dataset: {}".format(train_dataset.shape),
    "TEST Dataset: {}".format(test_dataset.shape),
    sep="\n",
)

# Wrap both partitions so they can be fed to a DataLoader.
training_set, testing_set = (
    CustomDataset(frame, tokenizer, MAX_LEN)
    for frame in (train_dataset, test_dataset)
)

FULL Dataset: (2484, 27)
TRAIN Dataset: (1987, 27)
TEST Dataset: (497, 27)


In [None]:
# Training batches are reshuffled every epoch.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation does not benefit from shuffling; a fixed order keeps the returned
# outputs/targets aligned with test_dataset rows and reproducible across runs.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Attribute names ``l1``/``l2``/``l3`` are kept so that previously saved
    ``state_dict`` files still load.

    Args:
        num_labels: Number of output logits (default 24, one per category).
        dropout: Dropout probability applied before the linear head.
        pretrained_name: Checkpoint name passed to ``BertModel.from_pretrained``.
    """

    def __init__(self, num_labels=24, dropout=0.3, pretrained_name='bert-base-uncased'):
        super().__init__()
        self.l1 = transformers.BertModel.from_pretrained(pretrained_name)
        self.l2 = torch.nn.Dropout(dropout)
        # Read the encoder width from its config instead of hard-coding 768
        # (768 is the value for 'bert-base-uncased').
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return raw (un-normalised) logits for a batch of token ids."""
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output, per the Hugging Face BertModel docs) is used.
        _, output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [None]:
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits, averaged over all elements.

    Args:
        outputs: Logits produced by the model.
        targets: Float tensor of the same shape holding 0/1 labels.

    Returns:
        Scalar loss tensor.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [None]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch, log_every=5000):
    """Run one optimisation pass over ``training_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_fn`` and ``device``.

    Args:
        epoch: Epoch number, used only in the progress message.
        log_every: Print the batch loss every ``log_every`` steps (step 0 is
            always printed). With fewer batches than this per epoch, only the
            first batch's loss is reported.
    """
    model.train()
    for step, data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if step % log_every == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Clear stale gradients once, right before back-propagation.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
# Train for the configured number of epochs (EPOCHS is set in the
# hyperparameter cell) instead of repeating the literal here.
for epoch in range(EPOCHS):
  train(epoch)

Epoch: 0, Loss:  0.7174732089042664
Epoch: 1, Loss:  0.19251281023025513
Epoch: 2, Loss:  0.17817960679531097
Epoch: 3, Loss:  0.17757664620876312
Epoch: 4, Loss:  0.15102693438529968
Epoch: 5, Loss:  0.12437310069799423
Epoch: 6, Loss:  0.10850457102060318
Epoch: 7, Loss:  0.08706565946340561
Epoch: 8, Loss:  0.06274766474962234
Epoch: 9, Loss:  0.08360558748245239


In [None]:
def validation(epoch):
    """Score every batch of ``testing_loader`` with the notebook-level model.

    Args:
        epoch: Accepted for symmetry with ``train``; not used in the body.

    Returns:
        ``(fin_outputs, fin_targets)``: two parallel lists of per-sample lists,
        the sigmoid of the model logits and the ground-truth label vectors.
    """
    model.eval()
    fin_outputs, fin_targets = [], []
    with torch.no_grad():
        for batch in testing_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            logits = model(ids, mask, token_type_ids)
            probabilities = torch.sigmoid(logits)

            # Move results to host memory and store them as plain lists.
            fin_targets += targets.cpu().detach().numpy().tolist()
            fin_outputs += probabilities.cpu().detach().numpy().tolist()
    return fin_outputs, fin_targets

In [None]:
# Evaluate once on the held-out set (validation ignores its epoch argument).
outputs, targets = validation(0)
# Threshold the sigmoid scores at 0.5 to obtain a boolean prediction per label.
outputs = np.array(outputs) >= 0.5
accuracy = metrics.accuracy_score(targets, outputs)
# zero_division must be passed to f1_score to take effect; it scores labels
# with no predicted/true samples as 0 without emitting a warning.
f1_score_micro = metrics.f1_score(targets, outputs, average='micro', zero_division=0)
f1_score_macro = metrics.f1_score(targets, outputs, average='macro', zero_division=0)
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.7907444668008048
F1 Score (Micro) = 0.8543478260869565
F1 Score (Macro) = 0.7523431860921835


In [None]:
def save_model(model, filepath):
    """Serialise the model's ``state_dict`` to ``filepath`` via ``torch.save``.

    Args:
        model: Module whose parameters should be stored.
        filepath: Destination handed straight to ``torch.save``.
    """
    weights = model.state_dict()
    torch.save(weights, filepath)

save_model(model, 'bert_model_trained.pth')

### 3. Testing phase 

In [83]:
def load_model(model_class, filepath):
    """Instantiate ``model_class`` and restore weights saved at ``filepath``.

    Args:
        model_class: Zero-argument callable returning a fresh model.
        filepath: Path of a ``state_dict`` previously written with ``torch.save``.

    Returns:
        The model, moved to the notebook-level ``device`` and set to eval mode.
    """
    model = model_class()
    # map_location remaps stored tensors onto the current device, so a
    # checkpoint written on a GPU machine still loads on a CPU-only one.
    state_dict = torch.load(filepath, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    return model

# Loading the trained model
loaded_model = load_model(BERTClass, 'bert_model_trained.pth') 

In [84]:
def predict_text_class(input_text, model, tokenizer=None):
    """Return the index of the highest-scoring class for ``input_text``.

    Args:
        input_text: Text to classify (truncated to 512 tokens).
        model: Module called as ``model(ids, mask, token_type_ids)`` and
            returning logits of shape (1, num_classes).
        tokenizer: Optional tokenizer exposing ``encode_plus``. Pass one to
            avoid reloading the pretrained tokenizer on every call; when
            omitted, 'bert-base-uncased' is loaded as before.

    Returns:
        int: position of the largest logit.
    """
    if tokenizer is None:
        tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

    # Send inputs to wherever the model's weights live instead of relying on a
    # notebook-level device variable; fall back to CPU for parameter-free models.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    inputs = tokenizer.encode_plus(input_text, add_special_tokens=True, return_tensors="pt", max_length=512, truncation=True)
    ids = inputs['input_ids'].to(target_device)
    mask = inputs['attention_mask'].to(target_device)
    token_type_ids = inputs['token_type_ids'].to(target_device)

    # Inference must run with dropout disabled.
    model.eval()
    with torch.no_grad():
        outputs = model(ids, mask, token_type_ids)
    # Softmax is monotonic, so the argmax of the raw logits is the same class.
    return torch.argmax(outputs, dim=1).item()

# Example input text for prediction
input_text = "'hr administratormarketing associate hr administrator summary dedicated customer service manager 15 years experience hospitality customer service management respected builder leader customerfocused teams strives instill shared enthusiastic commitment customer service highlights focused customer satisfaction team management marketing savvy conflict resolution techniques training development skilled multitasker client relations specialist accomplishments missouri dot supervisor training certification certified ihg customer loyalty marketing segment hilton worldwide general manager training certification accomplished trainer cross server hospitality systems hilton onq micros opera pms fidelio opera reservation system ors holidex completed courses seminars customer service sales strategies inventory control loss prevention safety time management leadership performance assessment experience hr administratormarketing associate hr administrator dec 2013 current company name city state helps develop policies directs coordinates activities employment compensation labor relations benefits training employee services prepares employee separation notices related documentation keeps records benefits plans participation insurance pension plan personnel transactions hires promotions transfers performance reviews terminations employee statistics government reporting advises management appropriate resolution employee relations issues administers benefits programs life health dental insurance pension plans vacation sick leave leave absence employee assistance marketing associate designed created marketing collateral sales meetings trade shows company executives managed inhouse advertising program consisting print media collateral pieces assisted complete design launch companys website 2 months created official company page facebook facilitate interaction customers analyzed ratings programming features competitors evaluate effectiveness marketing strategies advanced 
medical claims analyst mar 2012 dec 2013 company name city state reviewed medical bills accuracy treatments tests hospital stays prior sanctioning claims trained interpret codes icd9 cpt terminology commonly used medical billing fully understand paperwork submitted healthcare providers required organizational analytical skills well computer skills knowledge medical terminology procedures statistics billing standards data analysis laws regarding medical billing assistant general manager jun 2010 dec 2010 company name city state performed duties including limited budgeting financial management accounting human resources payroll purchasing established maintained close working relationships departments hotel ensure maximum operation productivity morale guest service handled daily operations reported directly corporate office hired trained staff overall objectives goals emphasis high customer service marketing advertising working public relations media government local businesses chamber commerce executive support marketing assistant jul 2007 jun 2010 company name city state provided assistance various department heads executive marketing customer service human resources managed frontend operations ensure friendly efficient transactions ensured swift resolution customer issues preserve customer loyalty complying company policies exemplified secondtonone customer service delivery interactions customers potential clients reservation front office manager jun 2004 jul 2007 company name city state owner partner dec 2001 may 2004 company name city state price integrity coordinator aug 1999 dec 2001 company name city state education na business administration 1999 jefferson college city state business administration marketing advertising high school diploma college prep studies 1998 sainte genevieve senior high city state awarded american shrubel leadership scholarship jefferson college skills accounting ads advertising analytical skills benefits billing budgeting clients 
customer service data analysis delivery documentation employee relations financial management government relations human resources insurance labor relations layout marketing marketing collateral medical billing medical terminology office organizational payroll performance reviews personnel policies posters presentations public relations purchasing reporting statistics website'"
predicted_class = predict_text_class(input_text, loaded_model)
print("Predicted class:", predicted_class)  

Predicted class: 19


In [85]:
resumes['resume_cleaned'][0]

'hr administratormarketing associate hr administrator summary dedicated customer service manager 15 years experience hospitality customer service management respected builder leader customerfocused teams strives instill shared enthusiastic commitment customer service highlights focused customer satisfaction team management marketing savvy conflict resolution techniques training development skilled multitasker client relations specialist accomplishments missouri dot supervisor training certification certified ihg customer loyalty marketing segment hilton worldwide general manager training certification accomplished trainer cross server hospitality systems hilton onq micros opera pms fidelio opera reservation system ors holidex completed courses seminars customer service sales strategies inventory control loss prevention safety time management leadership performance assessment experience hr administratormarketing associate hr administrator dec 2013 current company name city state helps d

In [89]:
# Translate the predicted class index into its category name. The one-hot
# frame's column order is the order of the label vector the model was trained
# on, so index it directly; slicing new_df.columns[3:] at this point is shifted
# by one (Resume_str was dropped, 'list' was appended).
one_hot_encoded_category.columns[predicted_class]

'HR'

In [87]:
temp = new_df.drop(columns=["Category","resume_cleaned","list"])

In [88]:
temp.head()

Unnamed: 0,ACCOUNTANT,ADVOCATE,AGRICULTURE,APPAREL,ARTS,AUTOMOBILE,AVIATION,BANKING,BPO,BUSINESS-DEVELOPMENT,...,DIGITAL-MEDIA,ENGINEERING,FINANCE,FITNESS,HEALTHCARE,HR,INFORMATION-TECHNOLOGY,PUBLIC-RELATIONS,SALES,TEACHER
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False


In [90]:
features = temp.columns
features

Index(['ACCOUNTANT', 'ADVOCATE', 'AGRICULTURE', 'APPAREL', 'ARTS',
       'AUTOMOBILE', 'AVIATION', 'BANKING', 'BPO', 'BUSINESS-DEVELOPMENT',
       'CHEF', 'CONSTRUCTION', 'CONSULTANT', 'DESIGNER', 'DIGITAL-MEDIA',
       'ENGINEERING', 'FINANCE', 'FITNESS', 'HEALTHCARE', 'HR',
       'INFORMATION-TECHNOLOGY', 'PUBLIC-RELATIONS', 'SALES', 'TEACHER'],
      dtype='object')

In [91]:
# Map each class position (0-based) to its category column name.
feature_mapping = dict(enumerate(features))

In [92]:
print(feature_mapping)

{0: 'ACCOUNTANT', 1: 'ADVOCATE', 2: 'AGRICULTURE', 3: 'APPAREL', 4: 'ARTS', 5: 'AUTOMOBILE', 6: 'AVIATION', 7: 'BANKING', 8: 'BPO', 9: 'BUSINESS-DEVELOPMENT', 10: 'CHEF', 11: 'CONSTRUCTION', 12: 'CONSULTANT', 13: 'DESIGNER', 14: 'DIGITAL-MEDIA', 15: 'ENGINEERING', 16: 'FINANCE', 17: 'FITNESS', 18: 'HEALTHCARE', 19: 'HR', 20: 'INFORMATION-TECHNOLOGY', 21: 'PUBLIC-RELATIONS', 22: 'SALES', 23: 'TEACHER'}


In [93]:
feature_mapping[19] 

'HR'

In [96]:
resumes.head()

Unnamed: 0,Resume_str,Category,resume_cleaned
0,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,HR,hr administratormarketing associate hr adminis...
1,"HR SPECIALIST, US HR OPERATIONS ...",HR,hr specialist us hr operations summary versati...
2,HR DIRECTOR Summary Over 2...,HR,hr director summary 20 years experience recrui...
3,HR SPECIALIST Summary Dedica...,HR,hr specialist summary dedicated driven dynamic...
4,HR MANAGER Skill Highlights ...,HR,hr manager skill highlights hr skills hr depar...


In [120]:
resumes[resumes['Category'] == "HR"].resume_cleaned[0]

'hr administratormarketing associate hr administrator summary dedicated customer service manager 15 years experience hospitality customer service management respected builder leader customerfocused teams strives instill shared enthusiastic commitment customer service highlights focused customer satisfaction team management marketing savvy conflict resolution techniques training development skilled multitasker client relations specialist accomplishments missouri dot supervisor training certification certified ihg customer loyalty marketing segment hilton worldwide general manager training certification accomplished trainer cross server hospitality systems hilton onq micros opera pms fidelio opera reservation system ors holidex completed courses seminars customer service sales strategies inventory control loss prevention safety time management leadership performance assessment experience hr administratormarketing associate hr administrator dec 2013 current company name city state helps d

In [108]:
resumes['resume_cleaned'][0]

'hr administratormarketing associate hr administrator summary dedicated customer service manager 15 years experience hospitality customer service management respected builder leader customerfocused teams strives instill shared enthusiastic commitment customer service highlights focused customer satisfaction team management marketing savvy conflict resolution techniques training development skilled multitasker client relations specialist accomplishments missouri dot supervisor training certification certified ihg customer loyalty marketing segment hilton worldwide general manager training certification accomplished trainer cross server hospitality systems hilton onq micros opera pms fidelio opera reservation system ors holidex completed courses seminars customer service sales strategies inventory control loss prevention safety time management leadership performance assessment experience hr administratormarketing associate hr administrator dec 2013 current company name city state helps d