# PURPOSE OF THIS NOTEBOOK
This is the base code to write the functions to extract the raw text from resumes and append to our dataframe. 

TODO: Take the functions into a .py folder and use it as a script 

In [5]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from tokenizers import ByteLevelBPETokenizer
from transformers import BertTokenizer, BertModel
from pathlib import Path
from torch import cuda
import torch

from sklearn import metrics
import transformers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

In [6]:
# Setting up for GPU 

device = 'cuda' if cuda.is_available() else 'cpu'

## Model Goal

The goal is to train a custom BERT model to attempt to label each incoming job title to their proper ONET code. ONET (Occupational Information Network) is a free database that serves as a standardized taxonomy for jobs. Each job has a respective standardized name and code associated with it. This will make extracting skills quite easy if we can corrrectly translate job titles to the proper ONET code. Using BERT, we can tokenize the job titles and match them with the database of common job titles for each ONET code. For this first version I will be leaving out the actual ONET job names from the training data to compare later with an updated dataset. More information about ONET can be found here: https://www.onetonline.org/

In [7]:
# Import the train/test Data. 
test_df = pd.read_csv("../Data/TestingData.csv")
train_df = pd.read_csv("../Data/Training_Data.csv")
label_df = pd.read_csv("../Data/label_df.csv")

In [8]:
# Check that the test data incoming is correct
test_df.head()

Unnamed: 0,Reported_Jobs,Label
0,Chief Diversity Officer (CDO),"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Operations Vice President (Operations VP),"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Agricultural Services Director,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Bureau Chief,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Business Development Executive,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [9]:
# Check that the training data incoming is correct 
train_df.head()

Unnamed: 0,Reported_Jobs,Label
0,Road Commissioner,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Tax Commissioner,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Deputy Insurance Commissioner,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,School Commissioner,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Aeronautics Commission Director,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [10]:
train_df['Label'] = train_df['Label'].apply(lambda s: [float(x.strip(' []')) for x in s.split(',')])

In [11]:
test_df['Label'] = test_df['Label'].apply(lambda s: [float(x.strip(' []')) for x in s.split(',')])

## The goal of the tokenizer

The tokenizer will be tokenizing the job titles and the reported job titles, I don't know if I should do these seperately or together. In theory, I should have these pairings be tokenized together. **I SHOULD READ INTO THE TOKENIZER TO UNDERSTAND HOW I SHOULD APPROACH THIS** 

From looking at the reference code, I've learned that we need to follow these steps: 
1. Start with a train test split. **70% for the training data**, I will do the split based on the **70%** of each reported job title/ONET pairing.
    - Given the refernece notebook uses a dictionary as the input data and we are working with a dataframe instead, some major changes will be needed to be made in order for this to work. I don't think this would be difficult at all. Just need to translate the dictionary work to the dataframe. **I also need to confirm if the model input requires a list, dict, or dataframe object.**
2. Run the tokenizer on the training set 
3. Set up the model training and evaluation metrics.


In [12]:
# Load in the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [13]:
MAX_LEN = 50
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05

In [14]:
# Start by tokenizing the data.
# Will be using the class statment and slowly converting it for our needs 
class CustomDataset(Dataset):
    
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.reported_jobs = dataframe.Reported_Jobs
        self.targets = self.data.Label
        self.max_len = max_len
    
    def __len__(self):
        return len(self.reported_jobs)
    
    def __getitem__(self, index):
        # This can be done with a for loop 
        reported_job = str(self.reported_jobs[index])
        reported_job = " ".join(reported_job.split())

        inputs = self.tokenizer.encode_plus(
            reported_job,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [15]:
def predict(input, tokenizer, model, device):
    ''' GOAL OF THIS FUNCTION: 
    This function takes in any given string and converts it into a tokenized version that can be run through the model. '''

    input = " ".join(input.split())
    inputs = tokenizer.encode_plus(
        input,
        None,
        add_special_tokens=True,
        max_length=50,
        pad_to_max_length=True,
        return_token_type_ids=True
    )

    ids = inputs['input_ids']
    mask = inputs['attention_mask']
    token_type_ids = inputs['token_type_ids']

    ids = torch.tensor(ids, dtype=torch.long)
    mask = torch.tensor(mask, dtype=torch.long)
    token_type_ids = torch.tensor(mask, dtype=torch.long)

    ids = ids.to(device, dtype = torch.long)
    mask = mask.to(device, dtype = torch.long)
    token_type_ids = token_type_ids.to(device, dtype= torch.long)


    output = model(ids.unsqueeze(0), mask.unsqueeze(0), token_type_ids.unsqueeze(0))

    return output

In [16]:
# Running the tokenizer and shaping the dataframes for the model
train_set = CustomDataset(train_df, tokenizer, MAX_LEN)
test_set = CustomDataset(test_df, tokenizer, MAX_LEN)

In [17]:
# Setting the train and test parameters 
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0}

test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': True,
               'num_workers': 0}

training_loader = DataLoader(train_set, **train_params)
testing_loader = DataLoader(test_set, **test_params)

In [18]:
# Creatring the custom model

class BERTClass(torch.nn.Module):
    def __init__(self):
        # Defining the layers
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased', return_dict=False)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 1013)
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        output_2 = self.l2(output_1)
        output_3 = self.l3(output_2)
        return output_3

model = BERTClass()
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [19]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)

# Fine tuning

In [20]:
def train(epoch):
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss= loss_fn(outputs, targets)
        if _%100==0:
            print(f'Epoch : {epoch}, Loss: {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    return outputs

In [47]:
for epoch in range(5):
    output = train(epoch)

2it [00:00, 16.98it/s]

Epoch : 0, Loss: 0.008734955452382565


102it [00:09, 11.08it/s]

Epoch : 0, Loss: 0.008392097428441048


202it [00:18, 10.91it/s]

Epoch : 0, Loss: 0.008179834112524986


302it [00:27, 11.00it/s]

Epoch : 0, Loss: 0.008271665312349796


402it [00:36, 11.04it/s]

Epoch : 0, Loss: 0.008215700276196003


502it [00:45, 11.05it/s]

Epoch : 0, Loss: 0.007638862356543541


602it [00:54, 11.00it/s]

Epoch : 0, Loss: 0.007758060935884714


702it [01:03, 10.95it/s]

Epoch : 0, Loss: 0.008306981064379215


802it [01:12, 10.91it/s]

Epoch : 0, Loss: 0.007729521952569485


902it [01:21, 11.02it/s]

Epoch : 0, Loss: 0.0077315568923950195


1002it [01:30, 11.04it/s]

Epoch : 0, Loss: 0.008124046958982944


1102it [01:40, 11.06it/s]

Epoch : 0, Loss: 0.007443273440003395


1202it [01:49, 10.95it/s]

Epoch : 0, Loss: 0.007567903958261013


1302it [01:58, 11.00it/s]

Epoch : 0, Loss: 0.007020233664661646


1402it [02:07, 11.02it/s]

Epoch : 0, Loss: 0.008023226633667946


1502it [02:16, 11.02it/s]

Epoch : 0, Loss: 0.008084130473434925


1602it [02:25, 10.95it/s]

Epoch : 0, Loss: 0.008150202222168446


1702it [02:34, 11.02it/s]

Epoch : 0, Loss: 0.006130319554358721


1802it [02:43, 10.97it/s]

Epoch : 0, Loss: 0.007713750004768372


1902it [02:52, 11.00it/s]

Epoch : 0, Loss: 0.007344411686062813


2002it [03:01, 11.00it/s]

Epoch : 0, Loss: 0.007669353391975164


2102it [03:11, 11.00it/s]

Epoch : 0, Loss: 0.0077364216558635235


2202it [03:20, 10.97it/s]

Epoch : 0, Loss: 0.007892214693129063


2302it [03:29, 10.96it/s]

Epoch : 0, Loss: 0.007343204226344824


2402it [03:38, 10.90it/s]

Epoch : 0, Loss: 0.007108820136636496


2502it [03:47, 10.94it/s]

Epoch : 0, Loss: 0.00794265791773796


2602it [03:56, 11.03it/s]

Epoch : 0, Loss: 0.007603884674608707


2702it [04:05, 10.94it/s]

Epoch : 0, Loss: 0.007019030395895243


2802it [04:14, 11.03it/s]

Epoch : 0, Loss: 0.006963995285332203


2902it [04:23, 11.07it/s]

Epoch : 0, Loss: 0.006802164018154144


3002it [04:32, 11.07it/s]

Epoch : 0, Loss: 0.00738252978771925


3102it [04:41, 11.06it/s]

Epoch : 0, Loss: 0.007594530936330557


3202it [04:50, 11.07it/s]

Epoch : 0, Loss: 0.00794178806245327


3302it [04:59, 11.05it/s]

Epoch : 0, Loss: 0.008087171241641045


3402it [05:08, 10.99it/s]

Epoch : 0, Loss: 0.007123689167201519


3502it [05:18, 11.04it/s]

Epoch : 0, Loss: 0.007391843944787979


3602it [05:27, 11.06it/s]

Epoch : 0, Loss: 0.006804888602346182


3702it [05:36, 10.89it/s]

Epoch : 0, Loss: 0.007291825022548437


3802it [05:45, 11.04it/s]

Epoch : 0, Loss: 0.007290205918252468


3896it [05:53, 11.01it/s]
2it [00:00, 12.09it/s]

Epoch : 1, Loss: 0.007269499357789755


102it [00:09, 11.06it/s]

Epoch : 1, Loss: 0.007088578771799803


202it [00:18, 10.82it/s]

Epoch : 1, Loss: 0.006989582907408476


302it [00:27, 11.01it/s]

Epoch : 1, Loss: 0.007742597721517086


402it [00:36, 11.03it/s]

Epoch : 1, Loss: 0.007456704508513212


502it [00:45, 11.04it/s]

Epoch : 1, Loss: 0.0075083584524691105


602it [00:54, 11.00it/s]

Epoch : 1, Loss: 0.007857208140194416


702it [01:03, 11.02it/s]

Epoch : 1, Loss: 0.008037343621253967


802it [01:12, 11.05it/s]

Epoch : 1, Loss: 0.007177881430834532


902it [01:22, 10.93it/s]

Epoch : 1, Loss: 0.006924493703991175


1002it [01:31, 10.96it/s]

Epoch : 1, Loss: 0.007122979965060949


1102it [01:40, 10.84it/s]

Epoch : 1, Loss: 0.006787411402910948


1202it [01:49, 11.06it/s]

Epoch : 1, Loss: 0.006083558779209852


1302it [01:58, 11.07it/s]

Epoch : 1, Loss: 0.006995302625000477


1402it [02:07, 11.04it/s]

Epoch : 1, Loss: 0.006715845782309771


1502it [02:16, 11.05it/s]

Epoch : 1, Loss: 0.008007432334125042


1602it [02:25, 10.93it/s]

Epoch : 1, Loss: 0.007565306033939123


1702it [02:34, 11.02it/s]

Epoch : 1, Loss: 0.00748293986544013


1802it [02:43, 11.04it/s]

Epoch : 1, Loss: 0.007189516443759203


1902it [02:52, 11.01it/s]

Epoch : 1, Loss: 0.0076520428992807865


2002it [03:02, 11.02it/s]

Epoch : 1, Loss: 0.007251123897731304


2102it [03:11, 10.88it/s]

Epoch : 1, Loss: 0.007254051044583321


2202it [03:20, 10.84it/s]

Epoch : 1, Loss: 0.007565481122583151


2302it [03:29, 11.05it/s]

Epoch : 1, Loss: 0.007826189510524273


2402it [03:38, 10.90it/s]

Epoch : 1, Loss: 0.008069355972111225


2502it [03:47, 10.97it/s]

Epoch : 1, Loss: 0.006253544706851244


2602it [03:56, 11.07it/s]

Epoch : 1, Loss: 0.00702666724100709


2702it [04:06, 10.75it/s]

Epoch : 1, Loss: 0.007785482797771692


2802it [04:15, 10.99it/s]

Epoch : 1, Loss: 0.007513822987675667


2902it [04:24, 11.06it/s]

Epoch : 1, Loss: 0.007341225631535053


3002it [04:33, 10.96it/s]

Epoch : 1, Loss: 0.0073360418900847435


3102it [04:42, 10.98it/s]

Epoch : 1, Loss: 0.0074576265178620815


3202it [04:51, 10.91it/s]

Epoch : 1, Loss: 0.00779315922409296


3302it [05:00, 10.88it/s]

Epoch : 1, Loss: 0.007442554924637079


3402it [05:09, 11.05it/s]

Epoch : 1, Loss: 0.007596146315336227


3502it [05:18, 10.96it/s]

Epoch : 1, Loss: 0.006970682181417942


3602it [05:27, 10.92it/s]

Epoch : 1, Loss: 0.00763656385242939


3702it [05:37, 11.07it/s]

Epoch : 1, Loss: 0.007442579139024019


3802it [05:46, 11.07it/s]

Epoch : 1, Loss: 0.007493406999856234


3896it [05:54, 10.99it/s]
2it [00:00, 11.63it/s]

Epoch : 2, Loss: 0.006603078916668892


102it [00:09, 11.01it/s]

Epoch : 2, Loss: 0.0070262569934129715


202it [00:18, 10.85it/s]

Epoch : 2, Loss: 0.007036938332021236


302it [00:27, 11.02it/s]

Epoch : 2, Loss: 0.007752147037535906


402it [00:36, 10.94it/s]

Epoch : 2, Loss: 0.007848929613828659


502it [00:45, 10.90it/s]

Epoch : 2, Loss: 0.0067377714440226555


602it [00:54, 10.95it/s]

Epoch : 2, Loss: 0.007746501360088587


702it [01:03, 11.05it/s]

Epoch : 2, Loss: 0.007877840660512447


802it [01:13, 10.83it/s]

Epoch : 2, Loss: 0.0076117985881865025


902it [01:22, 11.00it/s]

Epoch : 2, Loss: 0.007752060890197754


1002it [01:31, 10.86it/s]

Epoch : 2, Loss: 0.006947260815650225


1102it [01:40, 10.86it/s]

Epoch : 2, Loss: 0.007325365673750639


1202it [01:49, 10.82it/s]

Epoch : 2, Loss: 0.006930449511855841


1302it [01:58, 10.90it/s]

Epoch : 2, Loss: 0.007371143437922001


1402it [02:07, 11.09it/s]

Epoch : 2, Loss: 0.006739831529557705


1502it [02:16, 10.99it/s]

Epoch : 2, Loss: 0.007361866999417543


1602it [02:25, 11.02it/s]

Epoch : 2, Loss: 0.0068563614040613174


1702it [02:34, 11.06it/s]

Epoch : 2, Loss: 0.007739235647022724


1802it [02:44, 10.83it/s]

Epoch : 2, Loss: 0.006834588944911957


1902it [02:53, 11.02it/s]

Epoch : 2, Loss: 0.007895134389400482


2002it [03:02, 10.95it/s]

Epoch : 2, Loss: 0.007419759873300791


2102it [03:11, 11.04it/s]

Epoch : 2, Loss: 0.007601296994835138


2202it [03:20, 11.04it/s]

Epoch : 2, Loss: 0.007764121983200312


2302it [03:29, 11.08it/s]

Epoch : 2, Loss: 0.007844640873372555


2402it [03:38, 11.09it/s]

Epoch : 2, Loss: 0.006650304887443781


2502it [03:47, 11.04it/s]

Epoch : 2, Loss: 0.007697587832808495


2602it [03:56, 11.05it/s]

Epoch : 2, Loss: 0.007463195826858282


2702it [04:05, 11.07it/s]

Epoch : 2, Loss: 0.007099718786776066


2802it [04:14, 11.03it/s]

Epoch : 2, Loss: 0.007960458286106586


2902it [04:23, 10.87it/s]

Epoch : 2, Loss: 0.0076490058563649654


3002it [04:32, 10.96it/s]

Epoch : 2, Loss: 0.007618090137839317


3102it [04:41, 11.06it/s]

Epoch : 2, Loss: 0.007247625384479761


3202it [04:50, 11.05it/s]

Epoch : 2, Loss: 0.007904710248112679


3302it [04:59, 11.04it/s]

Epoch : 2, Loss: 0.007026903796941042


3402it [05:09, 10.97it/s]

Epoch : 2, Loss: 0.007352622225880623


3502it [05:18, 10.86it/s]

Epoch : 2, Loss: 0.0073761404491961


3602it [05:27, 11.04it/s]

Epoch : 2, Loss: 0.007378048729151487


3702it [05:36, 11.06it/s]

Epoch : 2, Loss: 0.007192883174866438


3802it [05:45, 11.06it/s]

Epoch : 2, Loss: 0.0075769308023154736


3896it [05:53, 11.01it/s]
2it [00:00, 12.03it/s]

Epoch : 3, Loss: 0.0070502255111932755


102it [00:09, 11.01it/s]

Epoch : 3, Loss: 0.007448630873113871


202it [00:18, 10.95it/s]

Epoch : 3, Loss: 0.006677602417767048


302it [00:27, 10.76it/s]

Epoch : 3, Loss: 0.007529322523623705


402it [00:36, 10.98it/s]

Epoch : 3, Loss: 0.00704361405223608


502it [00:45, 10.87it/s]

Epoch : 3, Loss: 0.007821667939424515


602it [00:54, 11.03it/s]

Epoch : 3, Loss: 0.00728049548342824


702it [01:03, 10.94it/s]

Epoch : 3, Loss: 0.006493231747299433


802it [01:13, 11.01it/s]

Epoch : 3, Loss: 0.007204838562756777


902it [01:22, 10.85it/s]

Epoch : 3, Loss: 0.007706393487751484


1002it [01:31, 10.90it/s]

Epoch : 3, Loss: 0.005418006796389818


1102it [01:40, 10.85it/s]

Epoch : 3, Loss: 0.007112089544534683


1202it [01:49, 11.03it/s]

Epoch : 3, Loss: 0.00673047685995698


1302it [01:58, 11.06it/s]

Epoch : 3, Loss: 0.007684661075472832


1402it [02:07, 10.89it/s]

Epoch : 3, Loss: 0.007463829126209021


1502it [02:16, 11.05it/s]

Epoch : 3, Loss: 0.007237191777676344


1602it [02:25, 10.82it/s]

Epoch : 3, Loss: 0.007889693602919579


1702it [02:35, 10.98it/s]

Epoch : 3, Loss: 0.005292247980833054


1802it [02:44, 10.89it/s]

Epoch : 3, Loss: 0.007568619213998318


1902it [02:53, 11.03it/s]

Epoch : 3, Loss: 0.00642903707921505


2002it [03:02, 11.07it/s]

Epoch : 3, Loss: 0.007443724200129509


2102it [03:11, 11.05it/s]

Epoch : 3, Loss: 0.007088365498930216


2202it [03:20, 11.03it/s]

Epoch : 3, Loss: 0.007039432879537344


2302it [03:29, 11.02it/s]

Epoch : 3, Loss: 0.007580282166600227


2402it [03:38, 10.90it/s]

Epoch : 3, Loss: 0.006692973896861076


2502it [03:47, 11.02it/s]

Epoch : 3, Loss: 0.007407014723867178


2602it [03:56, 10.99it/s]

Epoch : 3, Loss: 0.006835448555648327


2702it [04:06, 10.85it/s]

Epoch : 3, Loss: 0.006990607362240553


2802it [04:15, 10.85it/s]

Epoch : 3, Loss: 0.006644013337790966


2902it [04:24, 11.03it/s]

Epoch : 3, Loss: 0.007468669209629297


3002it [04:33, 11.05it/s]

Epoch : 3, Loss: 0.007894167676568031


3102it [04:42, 10.84it/s]

Epoch : 3, Loss: 0.007734075654298067


3202it [04:51, 10.94it/s]

Epoch : 3, Loss: 0.006371422205120325


3302it [05:00, 11.03it/s]

Epoch : 3, Loss: 0.006457848008722067


3402it [05:09, 11.01it/s]

Epoch : 3, Loss: 0.0069913663901388645


3502it [05:18, 11.03it/s]

Epoch : 3, Loss: 0.0069442796520888805


3602it [05:27, 11.05it/s]

Epoch : 3, Loss: 0.007285879924893379


3702it [05:36, 11.02it/s]

Epoch : 3, Loss: 0.006372928619384766


3802it [05:45, 10.98it/s]

Epoch : 3, Loss: 0.007181120105087757


3896it [05:54, 10.99it/s]
2it [00:00, 11.98it/s]

Epoch : 4, Loss: 0.007151053287088871


102it [00:09, 11.06it/s]

Epoch : 4, Loss: 0.006551730912178755


202it [00:18, 11.07it/s]

Epoch : 4, Loss: 0.0073487223125994205


302it [00:27, 10.90it/s]

Epoch : 4, Loss: 0.006522167008370161


402it [00:36, 10.97it/s]

Epoch : 4, Loss: 0.005579862277954817


502it [00:45, 10.90it/s]

Epoch : 4, Loss: 0.007698907982558012


602it [00:54, 11.02it/s]

Epoch : 4, Loss: 0.006329857744276524


702it [01:03, 10.89it/s]

Epoch : 4, Loss: 0.006545398384332657


802it [01:12, 10.87it/s]

Epoch : 4, Loss: 0.006881468929350376


902it [01:21, 11.03it/s]

Epoch : 4, Loss: 0.007893446832895279


1002it [01:30, 11.05it/s]

Epoch : 4, Loss: 0.006314361933618784


1102it [01:40, 11.00it/s]

Epoch : 4, Loss: 0.007999123074114323


1202it [01:49, 10.97it/s]

Epoch : 4, Loss: 0.007014593109488487


1302it [01:58, 10.95it/s]

Epoch : 4, Loss: 0.0076602064073085785


1402it [02:07, 10.94it/s]

Epoch : 4, Loss: 0.006494924426078796


1502it [02:16, 11.04it/s]

Epoch : 4, Loss: 0.006230408791452646


1602it [02:25, 11.01it/s]

Epoch : 4, Loss: 0.007279449608176947


1702it [02:34, 10.87it/s]

Epoch : 4, Loss: 0.00669587217271328


1802it [02:43, 10.87it/s]

Epoch : 4, Loss: 0.006873197853565216


1902it [02:53, 10.91it/s]

Epoch : 4, Loss: 0.007033360190689564


2002it [03:02, 11.06it/s]

Epoch : 4, Loss: 0.006307728588581085


2102it [03:11, 11.06it/s]

Epoch : 4, Loss: 0.00729407649487257


2202it [03:20, 11.06it/s]

Epoch : 4, Loss: 0.006159199867397547


2302it [03:29, 10.91it/s]

Epoch : 4, Loss: 0.006869780831038952


2402it [03:38, 11.10it/s]

Epoch : 4, Loss: 0.00727071100845933


2502it [03:47, 10.96it/s]

Epoch : 4, Loss: 0.00681287469342351


2602it [03:56, 11.07it/s]

Epoch : 4, Loss: 0.0069485981948673725


2702it [04:05, 11.07it/s]

Epoch : 4, Loss: 0.007100400980561972


2802it [04:14, 11.01it/s]

Epoch : 4, Loss: 0.0066671064123511314


2902it [04:23, 11.08it/s]

Epoch : 4, Loss: 0.006874407641589642


3002it [04:32, 11.06it/s]

Epoch : 4, Loss: 0.007076323963701725


3102it [04:41, 11.00it/s]

Epoch : 4, Loss: 0.007058304268866777


3202it [04:50, 11.05it/s]

Epoch : 4, Loss: 0.006726714316755533


3302it [04:59, 11.05it/s]

Epoch : 4, Loss: 0.006790888030081987


3402it [05:09, 11.01it/s]

Epoch : 4, Loss: 0.004903183784335852


3502it [05:18, 10.79it/s]

Epoch : 4, Loss: 0.0073953247629106045


3602it [05:27, 10.85it/s]

Epoch : 4, Loss: 0.007822282612323761


3702it [05:36, 10.83it/s]

Epoch : 4, Loss: 0.006699944846332073


3802it [05:45, 10.88it/s]

Epoch : 4, Loss: 0.006843562237918377


3896it [05:54, 11.00it/s]


## Model Evaluation and testing 

In [48]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _,data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            
            outputs = model(ids, mask, token_type_ids)

            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [49]:
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

3345it [01:03, 53.07it/s]


Accuracy Score = 0.015546752373122056
F1 Score (Micro) = 0.030509717638430505
F1 Score (Macro) = 0.001133318803605082


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


## Test Predictions with the model 

In [60]:
# To predict, the model is looking for the ids, mask and token_type_ids. Which means whatever we input must be run through the tokenizer first. 
input='Fry Cook'
output = predict(input, tokenizer, model, device)

  token_type_ids = torch.tensor(mask, dtype=torch.long)


In [61]:
output_df = pd.DataFrame(output.cpu().detach().numpy().tolist(), columns=label_df.columns)

In [62]:
output_df

Unnamed: 0,11-1011.00,11-1011.03,11-1021.00,11-1031.00,11-2011.00,11-2021.00,11-2022.00,11-2032.00,11-2033.00,11-3012.00,...,55-2012.00,55-2013.00,55-3011.00,55-3012.00,55-3013.00,55-3014.00,55-3015.00,55-3016.00,55-3018.00,55-3019.00
0,-6.180187,-7.541752,-5.77798,-6.920471,-6.31028,-6.354553,-7.036899,-7.932647,-7.880801,-7.645444,...,-8.941263,-6.809411,-7.129673,-8.747339,-7.603312,-6.659719,-7.304749,-7.432025,-7.311785,-6.093854


In [63]:
output_df.idxmax(axis='columns')

0    41-2031.00
dtype: object

In [64]:
output_df.idxmin(axis=1)

0    19-2099.00
dtype: object