# PURPOSE OF THIS NOTEBOOK
This is the base code to write the functions to extract the raw text from resumes and append to our dataframe. 

TODO: Take the functions into a .py folder and use it as a script 

In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from tokenizers import ByteLevelBPETokenizer
from transformers import BertTokenizer, BertModel
from pathlib import Path
from torch import cuda
import torch

from sklearn import metrics
import transformers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.empty_cache()

In [3]:
# Setting up for GPU 

device = 'cuda' if cuda.is_available() else 'cpu'

## Model Goal

The goal is to train a custom BERT model to attempt to label each incoming job title to their proper ONET code. ONET (Occupational Information Network) is a free database that serves as a standardized taxonomy for jobs. Each job has a respective standardized name and code associated with it. This will make extracting skills quite easy if we can corrrectly translate job titles to the proper ONET code. Using BERT, we can tokenize the job titles and match them with the database of common job titles for each ONET code. For this first version I will be leaving out the actual ONET job names from the training data to compare later with an updated dataset. More information about ONET can be found here: https://www.onetonline.org/

In [4]:
# Import the train/test Data. 
test_df = pd.read_csv("../Data/TestingData.csv")
train_df = pd.read_csv("../Data/Training_Data.csv")
label_df = pd.read_csv("../Data/label_df.csv")

In [5]:
# Check that the test data incoming is correct
test_df.head()

Unnamed: 0,Reported_Jobs,Label
0,Chief Financial Officer (CFO),"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Executive Vice President (EVP),"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Bank President,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Bureau Chief,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Chief Administrative Officer,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [6]:
# Check that the training data incoming is correct 
train_df.head()

Unnamed: 0,Reported_Jobs,Label
0,Labor Standards Director,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Health Commissioner,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Liquor Commissioner,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,School Commissioner,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Operations Vice President (Operations VP),"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [7]:
train_df['Label'] = train_df['Label'].apply(lambda s: [float(x.strip(' []')) for x in s.split(',')])

In [8]:
test_df['Label'] = test_df['Label'].apply(lambda s: [float(x.strip(' []')) for x in s.split(',')])

## The goal of the tokenizer

The tokenizer will be tokenizing the job titles and the reported job titles, I don't know if I should do these seperately or together. In theory, I should have these pairings be tokenized together. **I SHOULD READ INTO THE TOKENIZER TO UNDERSTAND HOW I SHOULD APPROACH THIS** 

From looking at the reference code, I've learned that we need to follow these steps: 
1. Start with a train test split. **70% for the training data**, I will do the split based on the **70%** of each reported job title/ONET pairing.
    - Given the refernece notebook uses a dictionary as the input data and we are working with a dataframe instead, some major changes will be needed to be made in order for this to work. I don't think this would be difficult at all. Just need to translate the dictionary work to the dataframe. **I also need to confirm if the model input requires a list, dict, or dataframe object.**
2. Run the tokenizer on the training set 
3. Set up the model training and evaluation metrics.


In [9]:
# Load in the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [10]:
MAX_LEN = 175
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 2
LEARNING_RATE = 1e-08

In [11]:
# Start by tokenizing the data.
# Will be using the class statment and slowly converting it for our needs 
class CustomDataset(Dataset):
    
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.reported_jobs = dataframe.Reported_Jobs
        self.targets = self.data.Label
        self.max_len = max_len
    
    def __len__(self):
        return len(self.reported_jobs)
    
    def __getitem__(self, index):
        # This can be done with a for loop 
        reported_job = str(self.reported_jobs[index])
        reported_job = " ".join(reported_job.split())

        inputs = self.tokenizer.encode_plus(
            reported_job,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [12]:
def predict(input, tokenizer, model, device):
    ''' GOAL OF THIS FUNCTION: 
    This function takes in any given string and converts it into a tokenized version that can be run through the model. '''

    input = " ".join(input.split())
    inputs = tokenizer.encode_plus(
        input,
        None,
        add_special_tokens=True,
        max_length=50,
        pad_to_max_length=True,
        return_token_type_ids=True
    )

    ids = inputs['input_ids']
    mask = inputs['attention_mask']
    token_type_ids = inputs['token_type_ids']

    ids = torch.tensor(ids, dtype=torch.long)
    mask = torch.tensor(mask, dtype=torch.long)
    token_type_ids = torch.tensor(mask, dtype=torch.long)


    output = model(ids.unsqueeze(0), mask.unsqueeze(0), token_type_ids.unsqueeze(0))

    return output

In [13]:
# Running the tokenizer and shaping the dataframes for the model
train_set = CustomDataset(train_df, tokenizer, MAX_LEN)
test_set = CustomDataset(test_df, tokenizer, MAX_LEN)

In [14]:
# Setting the train and test parameters 
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0}

test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': True,
               'num_workers': 0}

training_loader = DataLoader(train_set, **train_params)
testing_loader = DataLoader(test_set, **test_params)

In [15]:
# Creatring the custom model

class BERTClass(torch.nn.Module):
    def __init__(self):
        # Defining the layers
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased', return_dict=False)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 997)
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        output_2 = self.l2(output_1)
        output_3 = self.l3(output_2)
        return output_3

model = BERTClass()
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [16]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)

# Fine tuning

In [17]:
def train(epoch):
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss= loss_fn(outputs, targets)
        if _%100==0:
            print(f'Epoch : {epoch}, Loss: {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    return outputs

In [18]:
for epoch in range(EPOCHS):
    output = train(epoch)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
2it [00:00,  6.29it/s]

Epoch : 0, Loss: 0.7215086221694946


102it [00:22,  4.58it/s]

Epoch : 0, Loss: 0.7232333421707153


202it [00:44,  4.70it/s]

Epoch : 0, Loss: 0.7194452285766602


302it [01:06,  4.65it/s]

Epoch : 0, Loss: 0.7164679765701294


402it [01:28,  4.68it/s]

Epoch : 0, Loss: 0.7143262624740601


502it [01:50,  4.67it/s]

Epoch : 0, Loss: 0.7136077284812927


602it [02:12,  4.65it/s]

Epoch : 0, Loss: 0.7164649367332458


702it [02:34,  4.65it/s]

Epoch : 0, Loss: 0.7167539596557617


802it [02:56,  4.67it/s]

Epoch : 0, Loss: 0.7118066549301147


902it [03:18,  4.62it/s]

Epoch : 0, Loss: 0.7168064117431641


1002it [03:41,  4.60it/s]

Epoch : 0, Loss: 0.7131550312042236


1102it [04:03,  4.67it/s]

Epoch : 0, Loss: 0.7087031006813049


1202it [04:25,  4.73it/s]

Epoch : 0, Loss: 0.7116584181785583


1302it [04:47,  4.67it/s]

Epoch : 0, Loss: 0.7152790427207947


1402it [05:09,  4.62it/s]

Epoch : 0, Loss: 0.7111013531684875


1502it [05:31,  4.65it/s]

Epoch : 0, Loss: 0.71044921875


1602it [05:53,  4.68it/s]

Epoch : 0, Loss: 0.7099806666374207


1702it [06:15,  4.63it/s]

Epoch : 0, Loss: 0.709934651851654


1802it [06:37,  4.62it/s]

Epoch : 0, Loss: 0.7144860625267029


1902it [06:59,  4.66it/s]

Epoch : 0, Loss: 0.7104099988937378


2002it [07:22,  4.63it/s]

Epoch : 0, Loss: 0.709530234336853


2102it [07:44,  4.67it/s]

Epoch : 0, Loss: 0.7072334885597229


2202it [08:06,  4.66it/s]

Epoch : 0, Loss: 0.7068905830383301


2302it [08:28,  4.65it/s]

Epoch : 0, Loss: 0.7086963653564453


2402it [08:50,  4.63it/s]

Epoch : 0, Loss: 0.7100809216499329


2502it [09:12,  4.67it/s]

Epoch : 0, Loss: 0.706747829914093


2602it [09:34,  4.65it/s]

Epoch : 0, Loss: 0.7083114981651306


2702it [09:56,  4.66it/s]

Epoch : 0, Loss: 0.7094402313232422


2802it [10:18,  4.64it/s]

Epoch : 0, Loss: 0.7071502804756165


2902it [10:40,  4.64it/s]

Epoch : 0, Loss: 0.7038829922676086


3002it [11:02,  4.64it/s]

Epoch : 0, Loss: 0.7074740529060364


3102it [11:24,  4.66it/s]

Epoch : 0, Loss: 0.7048812508583069


3202it [11:46,  4.67it/s]

Epoch : 0, Loss: 0.704818069934845


3302it [12:08,  4.64it/s]

Epoch : 0, Loss: 0.7034968137741089


3402it [12:30,  4.66it/s]

Epoch : 0, Loss: 0.7075830698013306


3502it [12:52,  4.67it/s]

Epoch : 0, Loss: 0.7016879320144653


3602it [13:14,  4.62it/s]

Epoch : 0, Loss: 0.7035412788391113


3702it [13:36,  4.63it/s]

Epoch : 0, Loss: 0.7034775018692017


3802it [13:59,  4.62it/s]

Epoch : 0, Loss: 0.7036107182502747


3902it [14:21,  4.67it/s]

Epoch : 0, Loss: 0.7016074061393738


3933it [14:27,  4.53it/s]
2it [00:00,  5.96it/s]

Epoch : 1, Loss: 0.7059394121170044


102it [00:22,  4.64it/s]

Epoch : 1, Loss: 0.7035007476806641


202it [00:44,  4.65it/s]

Epoch : 1, Loss: 0.7023496031761169


302it [01:06,  4.64it/s]

Epoch : 1, Loss: 0.7030588984489441


402it [01:29,  4.67it/s]

Epoch : 1, Loss: 0.7006445527076721


502it [01:51,  4.70it/s]

Epoch : 1, Loss: 0.7011829018592834


602it [02:13,  4.61it/s]

Epoch : 1, Loss: 0.7030770182609558


702it [02:35,  4.67it/s]

Epoch : 1, Loss: 0.7006243467330933


802it [02:57,  4.64it/s]

Epoch : 1, Loss: 0.7015179991722107


902it [03:19,  4.68it/s]

Epoch : 1, Loss: 0.7017306685447693


1002it [03:41,  4.62it/s]

Epoch : 1, Loss: 0.7004213929176331


1102it [04:03,  4.61it/s]

Epoch : 1, Loss: 0.702751100063324


1202it [04:25,  4.63it/s]

Epoch : 1, Loss: 0.6985835433006287


1302it [04:48,  4.61it/s]

Epoch : 1, Loss: 0.7007578611373901


1402it [05:10,  4.65it/s]

Epoch : 1, Loss: 0.7027148008346558


1502it [05:32,  4.63it/s]

Epoch : 1, Loss: 0.6981827020645142


1602it [05:54,  4.62it/s]

Epoch : 1, Loss: 0.6997227072715759


1702it [06:16,  4.65it/s]

Epoch : 1, Loss: 0.7013256549835205


1802it [06:38,  4.65it/s]

Epoch : 1, Loss: 0.6989893317222595


1902it [07:00,  4.61it/s]

Epoch : 1, Loss: 0.6989799737930298


2002it [07:23,  4.64it/s]

Epoch : 1, Loss: 0.6970305442810059


2102it [07:45,  4.64it/s]

Epoch : 1, Loss: 0.6984114050865173


2202it [08:07,  4.65it/s]

Epoch : 1, Loss: 0.6950220465660095


2302it [08:29,  4.67it/s]

Epoch : 1, Loss: 0.6977546215057373


2402it [08:51,  4.62it/s]

Epoch : 1, Loss: 0.6958233714103699


2502it [09:14,  4.61it/s]

Epoch : 1, Loss: 0.6968614459037781


2602it [09:36,  4.68it/s]

Epoch : 1, Loss: 0.6962840557098389


2702it [09:58,  4.70it/s]

Epoch : 1, Loss: 0.6945679783821106


2802it [10:20,  4.66it/s]

Epoch : 1, Loss: 0.6963706016540527


2902it [10:42,  4.61it/s]

Epoch : 1, Loss: 0.6945245862007141


3002it [11:04,  4.61it/s]

Epoch : 1, Loss: 0.6937295198440552


3102it [11:26,  4.65it/s]

Epoch : 1, Loss: 0.6953639388084412


3202it [11:48,  4.65it/s]

Epoch : 1, Loss: 0.6950657963752747


3302it [12:10,  4.68it/s]

Epoch : 1, Loss: 0.6940190196037292


3402it [12:32,  4.72it/s]

Epoch : 1, Loss: 0.6943137645721436


3502it [12:55,  4.66it/s]

Epoch : 1, Loss: 0.6928164958953857


3602it [13:17,  4.69it/s]

Epoch : 1, Loss: 0.6940398216247559


3702it [13:39,  4.61it/s]

Epoch : 1, Loss: 0.6946626305580139


3802it [14:01,  4.70it/s]

Epoch : 1, Loss: 0.6912373900413513


3902it [14:23,  4.60it/s]

Epoch : 1, Loss: 0.6919978857040405


3933it [14:30,  4.52it/s]


## Model Evaluation and testing 

In [19]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _,data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            
            outputs = model(ids, mask, token_type_ids)

            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [20]:
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

3372it [02:12, 25.41it/s]


Accuracy Score = 0.0
F1 Score (Micro) = 0.0019372587853767782
F1 Score (Macro) = 0.0016901472174244605


3372it [02:13, 25.32it/s]


Accuracy Score = 0.0
F1 Score (Micro) = 0.0019372587853767782
F1 Score (Macro) = 0.0016901472174244605


## Test Predictions with the model 

In [22]:
# To predict, the model is looking for the ids, mask and token_type_ids. Which means whatever we input must be run through the tokenizer first. 
input='Executive Director'
output = predict(input, tokenizer, model, device)

  token_type_ids = torch.tensor(mask, dtype=torch.long)


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
output_df = pd.DataFrame(output.cpu().detach().numpy().tolist(), columns=label_df.columns)

In [None]:
output_df

In [None]:
output_df.idxmax(axis='columns')

In [None]:
output_df.idxmin(axis=1)