# PURPOSE OF THIS NOTEBOOK
This is the base code to write the functions to extract the raw text from resumes and append to our dataframe. 

TODO: Take the functions into a .py folder and use it as a script 

In [19]:
import pandas as pd
import numpy as np
from tqdm.autonotebook import tqdm
from tokenizers import Tokenizer
from transformers import BertTokenizer, BertModel
from pathlib import Path
from torch import cuda
import torch

from sklearn import metrics
import transformers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

In [20]:
# Setting up for GPU 

device = 'cuda' if cuda.is_available() else 'cpu'

## Model Goal

The goal is to train a custom BERT model to attempt to label each incoming job title to their proper ONET code. ONET (Occupational Information Network) is a free database that serves as a standardized taxonomy for jobs. Each job has a respective standardized name and code associated with it. This will make extracting skills quite easy if we can corrrectly translate job titles to the proper ONET code. Using BERT, we can tokenize the job titles and match them with the database of common job titles for each ONET code. For this first version I will be leaving out the actual ONET job names from the training data to compare later with an updated dataset. More information about ONET can be found here: https://www.onetonline.org/

In [21]:
# Import the train/test Data. 
test_df = pd.read_csv("../Data/TestingData.csv")
train_df = pd.read_csv("../Data/Training_Data.csv")
label_df = pd.read_csv("../Data/label_df.csv")

In [22]:
# Check that the test data incoming is correct
test_df.head()

Unnamed: 0,Reported_Jobs,Label
0,General and Operations Managers,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Marketing Managers,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Public Relations Managers,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Financial Managers,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Treasurers and Controllers,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [23]:
# Check that the training data incoming is correct 
train_df.head()

Unnamed: 0,Reported_Jobs,Label
0,"Farmers, Ranchers, and Other Agricultural Mana...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Supply Chain Planning Manager,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Sales Vice President (Sales VP),"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Special Programs Director,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Offshore Wind Operations Manager,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [24]:
train_df['Label'] = train_df['Label'].apply(lambda s: [float(x.strip(' []')) for x in s.split(',')])

In [25]:
test_df['Label'] = test_df['Label'].apply(lambda s: [float(x.strip(' []')) for x in s.split(',')])

## The goal of the tokenizer

The tokenizer will be tokenizing the job titles and the reported job titles, I don't know if I should do these seperately or together. In theory, I should have these pairings be tokenized together. **I SHOULD READ INTO THE TOKENIZER TO UNDERSTAND HOW I SHOULD APPROACH THIS** 

From looking at the reference code, I've learned that we need to follow these steps: 
1. Start with a train test split. **70% for the training data**, I will do the split based on the **70%** of each reported job title/ONET pairing.
    - Given the refernece notebook uses a dictionary as the input data and we are working with a dataframe instead, some major changes will be needed to be made in order for this to work. I don't think this would be difficult at all. Just need to translate the dictionary work to the dataframe. **I also need to confirm if the model input requires a list, dict, or dataframe object.**
2. Run the tokenizer on the training set 
3. Set up the model training and evaluation metrics.


In [26]:
# Load in the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

In [27]:
MAX_LEN = 175
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 5
LEARNING_RATE = 2e-5

In [28]:
# Start by tokenizing the data.
# Will be using the class statment and slowly converting it for our needs 
class CustomDataset(Dataset):
    
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.reported_jobs = dataframe.Reported_Jobs
        self.targets = self.data.Label
        self.max_len = max_len
    
    def __len__(self):
        return len(self.reported_jobs)
    
    def __getitem__(self, index):
        reported_job = str(self.reported_jobs[index])
        reported_job = " ".join(reported_job.split())

        inputs = self.tokenizer.encode_plus(
            reported_job,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [29]:
# Running the tokenizer and shaping the dataframes for the model
train_set = CustomDataset(train_df, tokenizer, MAX_LEN)
test_set = CustomDataset(test_df, tokenizer, MAX_LEN)

In [30]:
# Setting the train and test parameters 
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0}

test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': True,
               'num_workers': 0}

training_loader = DataLoader(train_set, **train_params)
testing_loader = DataLoader(test_set, **test_params)

In [31]:
# Creatring the custom model

class BERTClass(torch.nn.Module):
    def __init__(self):
        # Defining the layers
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased', return_dict=False)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 22)
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        output_2 = self.l2(output_1)
        output_3 = self.l3(output_2)
        return output_3

model = BERTClass()
model.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [32]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)

# Fine tuning

In [33]:
def train(epoch):
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss= loss_fn(outputs, targets)
        if _%100==0:
            print(f'Epoch : {epoch}, Loss: {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    return outputs

In [34]:
for epoch in range(EPOCHS):
    output = train(epoch)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
2it [00:00, 15.31it/s]

Epoch : 0, Loss: 0.6634927988052368


102it [00:07, 13.29it/s]

Epoch : 0, Loss: 0.19764165580272675


202it [00:15, 13.23it/s]

Epoch : 0, Loss: 0.18400567770004272


302it [00:22, 13.12it/s]

Epoch : 0, Loss: 0.14199092984199524


402it [00:30, 13.09it/s]

Epoch : 0, Loss: 0.15057986974716187


502it [00:37, 13.28it/s]

Epoch : 0, Loss: 0.16656868159770966


602it [00:45, 13.31it/s]

Epoch : 0, Loss: 0.1349906027317047


702it [00:53, 13.29it/s]

Epoch : 0, Loss: 0.1142592802643776


802it [01:00, 13.30it/s]

Epoch : 0, Loss: 0.10635222494602203


902it [01:08, 13.32it/s]

Epoch : 0, Loss: 0.150291308760643


1002it [01:15, 13.33it/s]

Epoch : 0, Loss: 0.1361927092075348


1102it [01:23, 13.13it/s]

Epoch : 0, Loss: 0.1488378793001175


1202it [01:30, 13.32it/s]

Epoch : 0, Loss: 0.06311142444610596


1302it [01:38, 13.32it/s]

Epoch : 0, Loss: 0.07951503992080688


1402it [01:45, 13.33it/s]

Epoch : 0, Loss: 0.08634477853775024


1502it [01:53, 13.33it/s]

Epoch : 0, Loss: 0.09040606021881104


1602it [02:00, 13.33it/s]

Epoch : 0, Loss: 0.11731011420488358


1702it [02:08, 13.34it/s]

Epoch : 0, Loss: 0.09628083556890488


1802it [02:15, 13.33it/s]

Epoch : 0, Loss: 0.07882272452116013


1902it [02:23, 13.30it/s]

Epoch : 0, Loss: 0.08972932398319244


2002it [02:30, 13.31it/s]

Epoch : 0, Loss: 0.10973211377859116


2102it [02:38, 13.37it/s]

Epoch : 0, Loss: 0.11038802564144135


2202it [02:45, 13.31it/s]

Epoch : 0, Loss: 0.08843209594488144


2302it [02:53, 13.21it/s]

Epoch : 0, Loss: 0.08011016994714737


2402it [03:00, 13.13it/s]

Epoch : 0, Loss: 0.0918244794011116


2502it [03:08, 13.21it/s]

Epoch : 0, Loss: 0.08836111426353455


2602it [03:16, 13.33it/s]

Epoch : 0, Loss: 0.0657690241932869


2702it [03:23, 13.34it/s]

Epoch : 0, Loss: 0.06963145732879639


2802it [03:31, 12.45it/s]

Epoch : 0, Loss: 0.07153002917766571


2902it [03:38, 13.25it/s]

Epoch : 0, Loss: 0.1447995901107788


3002it [03:46, 12.88it/s]

Epoch : 0, Loss: 0.08027133345603943


3102it [03:55, 12.61it/s]

Epoch : 0, Loss: 0.07327432930469513


3202it [04:02, 13.30it/s]

Epoch : 0, Loss: 0.08619658648967743


3302it [04:10, 13.31it/s]

Epoch : 0, Loss: 0.1048816591501236


3402it [04:17, 13.29it/s]

Epoch : 0, Loss: 0.05550092086195946


3502it [04:25, 13.33it/s]

Epoch : 0, Loss: 0.06305603682994843


3602it [04:32, 13.32it/s]

Epoch : 0, Loss: 0.09272664785385132


3702it [04:40, 13.34it/s]

Epoch : 0, Loss: 0.070523202419281


3802it [04:47, 13.26it/s]

Epoch : 0, Loss: 0.10749936103820801


3902it [04:55, 13.30it/s]

Epoch : 0, Loss: 0.06552918255329132


4002it [05:02, 13.31it/s]

Epoch : 0, Loss: 0.09755178540945053


4102it [05:10, 13.17it/s]

Epoch : 0, Loss: 0.03168101981282234


4202it [05:18, 13.29it/s]

Epoch : 0, Loss: 0.08361775428056717


4302it [05:25, 13.11it/s]

Epoch : 0, Loss: 0.06476639211177826


4402it [05:33, 13.04it/s]

Epoch : 0, Loss: 0.027673738077282906


4502it [05:40, 13.16it/s]

Epoch : 0, Loss: 0.09465958923101425


4602it [05:48, 13.15it/s]

Epoch : 0, Loss: 0.04882243275642395


4702it [05:56, 13.10it/s]

Epoch : 0, Loss: 0.0631597489118576


4802it [06:03, 13.31it/s]

Epoch : 0, Loss: 0.06587743759155273


4902it [06:11, 13.14it/s]

Epoch : 0, Loss: 0.08152876049280167


5002it [06:18, 13.29it/s]

Epoch : 0, Loss: 0.01822485588490963


5102it [06:26, 13.30it/s]

Epoch : 0, Loss: 0.04536651074886322


5202it [06:33, 13.35it/s]

Epoch : 0, Loss: 0.07106725126504898


5302it [06:41, 13.33it/s]

Epoch : 0, Loss: 0.06724775582551956


5402it [06:49, 13.28it/s]

Epoch : 0, Loss: 0.04008665680885315


5484it [06:55, 13.21it/s]
2it [00:00, 14.58it/s]

Epoch : 1, Loss: 0.04924773797392845


102it [00:07, 13.27it/s]

Epoch : 1, Loss: 0.062795490026474


202it [00:15, 13.29it/s]

Epoch : 1, Loss: 0.13304029405117035


302it [00:22, 13.32it/s]

Epoch : 1, Loss: 0.09985455870628357


402it [00:30, 13.33it/s]

Epoch : 1, Loss: 0.05130503699183464


502it [00:37, 13.28it/s]

Epoch : 1, Loss: 0.030249347910284996


602it [00:45, 13.32it/s]

Epoch : 1, Loss: 0.11532994359731674


702it [00:52, 13.04it/s]

Epoch : 1, Loss: 0.05811220034956932


802it [01:00, 13.05it/s]

Epoch : 1, Loss: 0.09115012735128403


902it [01:08, 13.03it/s]

Epoch : 1, Loss: 0.055299073457717896


1002it [01:15, 13.10it/s]

Epoch : 1, Loss: 0.04809720069169998


1102it [01:23, 13.14it/s]

Epoch : 1, Loss: 0.05343729257583618


1202it [01:30, 13.11it/s]

Epoch : 1, Loss: 0.049687765538692474


1302it [01:38, 13.26it/s]

Epoch : 1, Loss: 0.023554587736725807


1402it [01:45, 13.30it/s]

Epoch : 1, Loss: 0.06044822558760643


1502it [01:53, 13.29it/s]

Epoch : 1, Loss: 0.06681652367115021


1602it [02:01, 12.15it/s]

Epoch : 1, Loss: 0.038833051919937134


1702it [02:09, 13.10it/s]

Epoch : 1, Loss: 0.07913795858621597


1802it [02:17, 12.44it/s]

Epoch : 1, Loss: 0.0737907886505127


1902it [02:25, 12.30it/s]

Epoch : 1, Loss: 0.05287931486964226


2002it [02:33, 13.32it/s]

Epoch : 1, Loss: 0.04593005031347275


2102it [02:40, 13.30it/s]

Epoch : 1, Loss: 0.04462338984012604


2202it [02:48, 12.25it/s]

Epoch : 1, Loss: 0.06484127044677734


2302it [02:56, 12.30it/s]

Epoch : 1, Loss: 0.07709550857543945


2402it [03:04, 12.34it/s]

Epoch : 1, Loss: 0.03385820984840393


2502it [03:13, 11.73it/s]

Epoch : 1, Loss: 0.031482256948947906


2602it [03:20, 13.34it/s]

Epoch : 1, Loss: 0.01778525300323963


2702it [03:28, 13.00it/s]

Epoch : 1, Loss: 0.05516016483306885


2802it [03:35, 12.68it/s]

Epoch : 1, Loss: 0.04842488840222359


2902it [03:43, 13.05it/s]

Epoch : 1, Loss: 0.021140769124031067


3002it [03:51, 13.16it/s]

Epoch : 1, Loss: 0.05492555722594261


3102it [03:59, 13.13it/s]

Epoch : 1, Loss: 0.06622786074876785


3202it [04:06, 13.07it/s]

Epoch : 1, Loss: 0.029018234461545944


3302it [04:14, 13.09it/s]

Epoch : 1, Loss: 0.05307731032371521


3402it [04:21, 13.31it/s]

Epoch : 1, Loss: 0.06496595591306686


3502it [04:29, 13.30it/s]

Epoch : 1, Loss: 0.1029634103178978


3602it [04:36, 13.32it/s]

Epoch : 1, Loss: 0.05024607107043266


3702it [04:44, 13.30it/s]

Epoch : 1, Loss: 0.0730392336845398


3802it [04:51, 13.27it/s]

Epoch : 1, Loss: 0.060949087142944336


3902it [04:59, 13.15it/s]

Epoch : 1, Loss: 0.05339585244655609


4002it [05:07, 13.14it/s]

Epoch : 1, Loss: 0.05356923118233681


4102it [05:14, 13.34it/s]

Epoch : 1, Loss: 0.04840344935655594


4202it [05:22, 13.24it/s]

Epoch : 1, Loss: 0.10917813330888748


4302it [05:29, 13.28it/s]

Epoch : 1, Loss: 0.11762547492980957


4402it [05:37, 13.32it/s]

Epoch : 1, Loss: 0.12929020822048187


4502it [05:44, 13.05it/s]

Epoch : 1, Loss: 0.06565862894058228


4602it [05:52, 13.30it/s]

Epoch : 1, Loss: 0.027881931513547897


4702it [06:00, 13.09it/s]

Epoch : 1, Loss: 0.041904229670763016


4802it [06:07, 13.10it/s]

Epoch : 1, Loss: 0.027387211099267006


4902it [06:15, 13.27it/s]

Epoch : 1, Loss: 0.011951339431107044


5002it [06:22, 13.11it/s]

Epoch : 1, Loss: 0.07352869212627411


5102it [06:30, 13.14it/s]

Epoch : 1, Loss: 0.025423817336559296


5202it [06:37, 13.14it/s]

Epoch : 1, Loss: 0.06768292188644409


5302it [06:45, 13.29it/s]

Epoch : 1, Loss: 0.05739228054881096


5402it [06:53, 13.23it/s]

Epoch : 1, Loss: 0.06424352526664734


5484it [06:59, 13.08it/s]
2it [00:00, 14.85it/s]

Epoch : 2, Loss: 0.08193192631006241


102it [00:07, 13.25it/s]

Epoch : 2, Loss: 0.05843932181596756


202it [00:15, 13.27it/s]

Epoch : 2, Loss: 0.014417561702430248


302it [00:22, 13.28it/s]

Epoch : 2, Loss: 0.047137271612882614


402it [00:30, 12.66it/s]

Epoch : 2, Loss: 0.028776351362466812


502it [00:38, 12.40it/s]

Epoch : 2, Loss: 0.0800459161400795


602it [00:46, 12.48it/s]

Epoch : 2, Loss: 0.009435364045202732


702it [00:54, 12.47it/s]

Epoch : 2, Loss: 0.055853649973869324


802it [01:01, 12.77it/s]

Epoch : 2, Loss: 0.05130498856306076


902it [01:09, 13.00it/s]

Epoch : 2, Loss: 0.06954609602689743


1002it [01:17, 13.05it/s]

Epoch : 2, Loss: 0.0648631602525711


1102it [01:24, 13.32it/s]

Epoch : 2, Loss: 0.02471836656332016


1202it [01:32, 13.31it/s]

Epoch : 2, Loss: 0.021647831425070763


1302it [01:39, 13.28it/s]

Epoch : 2, Loss: 0.023514436557888985


1402it [01:47, 13.31it/s]

Epoch : 2, Loss: 0.011443103663623333


1502it [01:54, 13.25it/s]

Epoch : 2, Loss: 0.060272157192230225


1602it [02:02, 13.30it/s]

Epoch : 2, Loss: 0.040909767150878906


1702it [02:09, 13.08it/s]

Epoch : 2, Loss: 0.017021378502249718


1802it [02:17, 13.09it/s]

Epoch : 2, Loss: 0.049674950540065765


1902it [02:24, 13.28it/s]

Epoch : 2, Loss: 0.024856774136424065


2002it [02:32, 13.26it/s]

Epoch : 2, Loss: 0.07459217309951782


2102it [02:40, 13.22it/s]

Epoch : 2, Loss: 0.02574397623538971


2202it [02:47, 13.31it/s]

Epoch : 2, Loss: 0.03134046867489815


2302it [02:55, 13.26it/s]

Epoch : 2, Loss: 0.10928869247436523


2402it [03:02, 13.28it/s]

Epoch : 2, Loss: 0.063200443983078


2502it [03:10, 13.32it/s]

Epoch : 2, Loss: 0.06425142288208008


2602it [03:17, 13.17it/s]

Epoch : 2, Loss: 0.026841090992093086


2702it [03:25, 13.31it/s]

Epoch : 2, Loss: 0.053269386291503906


2802it [03:32, 13.31it/s]

Epoch : 2, Loss: 0.06392444670200348


2902it [03:40, 13.32it/s]

Epoch : 2, Loss: 0.017824076116085052


3002it [03:47, 13.17it/s]

Epoch : 2, Loss: 0.07409612834453583


3102it [03:55, 12.47it/s]

Epoch : 2, Loss: 0.044575996696949005


3202it [04:03, 13.28it/s]

Epoch : 2, Loss: 0.039437416940927505


3302it [04:10, 13.31it/s]

Epoch : 2, Loss: 0.015836454927921295


3402it [04:18, 13.09it/s]

Epoch : 2, Loss: 0.06989794969558716


3502it [04:25, 13.31it/s]

Epoch : 2, Loss: 0.03145461156964302


3602it [04:33, 13.14it/s]

Epoch : 2, Loss: 0.016390638425946236


3702it [04:40, 13.15it/s]

Epoch : 2, Loss: 0.015598487108945847


3802it [04:48, 13.06it/s]

Epoch : 2, Loss: 0.010547186248004436


3902it [04:56, 13.09it/s]

Epoch : 2, Loss: 0.04370013251900673


4002it [05:03, 13.08it/s]

Epoch : 2, Loss: 0.01214774139225483


4102it [05:11, 13.28it/s]

Epoch : 2, Loss: 0.03714112937450409


4202it [05:18, 13.12it/s]

Epoch : 2, Loss: 0.05890059471130371


4302it [05:26, 13.08it/s]

Epoch : 2, Loss: 0.0467732772231102


4402it [05:33, 13.30it/s]

Epoch : 2, Loss: 0.04640306159853935


4502it [05:41, 13.10it/s]

Epoch : 2, Loss: 0.05314665287733078


4602it [05:48, 13.26it/s]

Epoch : 2, Loss: 0.052726805210113525


4702it [05:56, 13.27it/s]

Epoch : 2, Loss: 0.07322269678115845


4802it [06:04, 13.28it/s]

Epoch : 2, Loss: 0.00795777328312397


4902it [06:11, 13.33it/s]

Epoch : 2, Loss: 0.07730264961719513


5002it [06:19, 13.26it/s]

Epoch : 2, Loss: 0.03518269956111908


5102it [06:26, 13.08it/s]

Epoch : 2, Loss: 0.05170164629817009


5202it [06:34, 13.30it/s]

Epoch : 2, Loss: 0.06267107278108597


5302it [06:41, 13.10it/s]

Epoch : 2, Loss: 0.013644343242049217


5402it [06:49, 13.30it/s]

Epoch : 2, Loss: 0.01927797868847847


5484it [06:55, 13.20it/s]
2it [00:00, 15.02it/s]

Epoch : 3, Loss: 0.06588268280029297


102it [00:07, 13.03it/s]

Epoch : 3, Loss: 0.026941269636154175


202it [00:15, 13.24it/s]

Epoch : 3, Loss: 0.053513627499341965


302it [00:22, 13.23it/s]

Epoch : 3, Loss: 0.006062511820346117


402it [00:30, 13.04it/s]

Epoch : 3, Loss: 0.02298521250486374


502it [00:37, 13.33it/s]

Epoch : 3, Loss: 0.05940799042582512


602it [00:45, 13.07it/s]

Epoch : 3, Loss: 0.010284987278282642


702it [00:53, 13.25it/s]

Epoch : 3, Loss: 0.01613910123705864


802it [01:00, 13.12it/s]

Epoch : 3, Loss: 0.026185808703303337


902it [01:08, 13.30it/s]

Epoch : 3, Loss: 0.07059203088283539


1002it [01:15, 13.26it/s]

Epoch : 3, Loss: 0.012322809547185898


1102it [01:23, 13.28it/s]

Epoch : 3, Loss: 0.009825209155678749


1202it [01:30, 13.30it/s]

Epoch : 3, Loss: 0.015467437915503979


1302it [01:38, 13.28it/s]

Epoch : 3, Loss: 0.052045077085494995


1402it [01:45, 13.28it/s]

Epoch : 3, Loss: 0.03669073432683945


1502it [01:53, 13.14it/s]

Epoch : 3, Loss: 0.035183385014534


1602it [02:00, 13.30it/s]

Epoch : 3, Loss: 0.08310669660568237


1702it [02:08, 13.10it/s]

Epoch : 3, Loss: 0.018235821276903152


1802it [02:16, 13.26it/s]

Epoch : 3, Loss: 0.057915493845939636


1902it [02:23, 13.02it/s]

Epoch : 3, Loss: 0.02274622954428196


2002it [02:31, 13.33it/s]

Epoch : 3, Loss: 0.019090572372078896


2102it [02:38, 13.29it/s]

Epoch : 3, Loss: 0.034289371222257614


2202it [02:46, 13.13it/s]

Epoch : 3, Loss: 0.032043635845184326


2302it [02:53, 13.31it/s]

Epoch : 3, Loss: 0.02566584199666977


2402it [03:01, 13.14it/s]

Epoch : 3, Loss: 0.03741708770394325


2502it [03:08, 13.05it/s]

Epoch : 3, Loss: 0.011112312786281109


2602it [03:16, 13.13it/s]

Epoch : 3, Loss: 0.037040241062641144


2702it [03:24, 13.11it/s]

Epoch : 3, Loss: 0.018980104476213455


2802it [03:31, 13.14it/s]

Epoch : 3, Loss: 0.0539267398416996


2902it [03:39, 13.33it/s]

Epoch : 3, Loss: 0.014118297956883907


3002it [03:46, 13.29it/s]

Epoch : 3, Loss: 0.02950870245695114


3102it [03:54, 13.06it/s]

Epoch : 3, Loss: 0.006875202991068363


3202it [04:01, 13.27it/s]

Epoch : 3, Loss: 0.05936124920845032


3302it [04:09, 13.03it/s]

Epoch : 3, Loss: 0.022002940997481346


3402it [04:17, 13.28it/s]

Epoch : 3, Loss: 0.03817286342382431


3502it [04:24, 13.01it/s]

Epoch : 3, Loss: 0.012152718380093575


3602it [04:32, 13.26it/s]

Epoch : 3, Loss: 0.016487406566739082


3702it [04:39, 12.98it/s]

Epoch : 3, Loss: 0.020735107362270355


3802it [04:47, 13.09it/s]

Epoch : 3, Loss: 0.023199550807476044


3902it [04:55, 13.08it/s]

Epoch : 3, Loss: 0.01248209923505783


4002it [05:02, 13.08it/s]

Epoch : 3, Loss: 0.01541156880557537


4102it [05:10, 13.08it/s]

Epoch : 3, Loss: 0.037182390689849854


4202it [05:17, 13.10it/s]

Epoch : 3, Loss: 0.06321673095226288


4302it [05:25, 13.10it/s]

Epoch : 3, Loss: 0.03101930394768715


4402it [05:33, 13.31it/s]

Epoch : 3, Loss: 0.02715938352048397


4502it [05:40, 13.19it/s]

Epoch : 3, Loss: 0.043397556990385056


4602it [05:48, 13.10it/s]

Epoch : 3, Loss: 0.015942217782139778


4702it [05:55, 13.06it/s]

Epoch : 3, Loss: 0.008656524121761322


4802it [06:03, 13.07it/s]

Epoch : 3, Loss: 0.039046380668878555


4902it [06:11, 13.15it/s]

Epoch : 3, Loss: 0.02085055038332939


5002it [06:18, 13.26it/s]

Epoch : 3, Loss: 0.07127229124307632


5102it [06:26, 13.10it/s]

Epoch : 3, Loss: 0.026379968971014023


5202it [06:33, 13.28it/s]

Epoch : 3, Loss: 0.05620346963405609


5302it [06:41, 13.27it/s]

Epoch : 3, Loss: 0.017510447651147842


5402it [06:48, 13.06it/s]

Epoch : 3, Loss: 0.0195065438747406


5484it [06:55, 13.21it/s]
2it [00:00, 15.25it/s]

Epoch : 4, Loss: 0.06314530223608017


102it [00:07, 13.19it/s]

Epoch : 4, Loss: 0.034150950610637665


202it [00:15, 13.08it/s]

Epoch : 4, Loss: 0.026937859132885933


302it [00:22, 13.25it/s]

Epoch : 4, Loss: 0.01976567879319191


402it [00:30, 13.07it/s]

Epoch : 4, Loss: 0.007850650697946548


502it [00:38, 13.11it/s]

Epoch : 4, Loss: 0.01869293861091137


602it [00:45, 13.29it/s]

Epoch : 4, Loss: 0.008480601944029331


702it [00:53, 13.09it/s]

Epoch : 4, Loss: 0.046954818069934845


802it [01:00, 13.05it/s]

Epoch : 4, Loss: 0.004620221443474293


902it [01:08, 13.09it/s]

Epoch : 4, Loss: 0.009699234738945961


1002it [01:16, 13.09it/s]

Epoch : 4, Loss: 0.02105383761227131


1102it [01:23, 13.21it/s]

Epoch : 4, Loss: 0.062054116278886795


1202it [01:31, 13.12it/s]

Epoch : 4, Loss: 0.008098130114376545


1302it [01:38, 13.09it/s]

Epoch : 4, Loss: 0.013215749524533749


1402it [01:46, 13.14it/s]

Epoch : 4, Loss: 0.015684137120842934


1502it [01:54, 13.08it/s]

Epoch : 4, Loss: 0.0067990850657224655


1602it [02:01, 13.26it/s]

Epoch : 4, Loss: 0.01569589599967003


1702it [02:09, 13.13it/s]

Epoch : 4, Loss: 0.005225274711847305


1802it [02:16, 13.25it/s]

Epoch : 4, Loss: 0.0033890216145664454


1902it [02:24, 13.22it/s]

Epoch : 4, Loss: 0.011778959073126316


2002it [02:32, 13.08it/s]

Epoch : 4, Loss: 0.043935950845479965


2102it [02:39, 13.11it/s]

Epoch : 4, Loss: 0.0070620812475681305


2202it [02:47, 13.10it/s]

Epoch : 4, Loss: 0.07341580092906952


2302it [02:54, 13.11it/s]

Epoch : 4, Loss: 0.007743626367300749


2402it [03:02, 13.09it/s]

Epoch : 4, Loss: 0.03962857276201248


2502it [03:10, 13.31it/s]

Epoch : 4, Loss: 0.0708255022764206


2602it [03:17, 13.20it/s]

Epoch : 4, Loss: 0.013344784267246723


2702it [03:25, 13.12it/s]

Epoch : 4, Loss: 0.02106647752225399


2802it [03:32, 13.10it/s]

Epoch : 4, Loss: 0.009864330291748047


2902it [03:40, 13.11it/s]

Epoch : 4, Loss: 0.0630195140838623


3002it [03:47, 13.12it/s]

Epoch : 4, Loss: 0.004723868798464537


3102it [03:55, 14.12it/s]

Epoch : 4, Loss: 0.044148288667201996


3202it [04:02, 14.27it/s]

Epoch : 4, Loss: 0.03824976086616516


3302it [04:09, 14.19it/s]

Epoch : 4, Loss: 0.050064314156770706


3402it [04:16, 14.21it/s]

Epoch : 4, Loss: 0.019938353449106216


3502it [04:23, 14.08it/s]

Epoch : 4, Loss: 0.028687572106719017


3602it [04:30, 14.26it/s]

Epoch : 4, Loss: 0.08491778373718262


3702it [04:37, 14.27it/s]

Epoch : 4, Loss: 0.009821506217122078


3802it [04:44, 14.12it/s]

Epoch : 4, Loss: 0.011266125366091728


3902it [04:51, 14.26it/s]

Epoch : 4, Loss: 0.06196602061390877


4002it [04:58, 14.28it/s]

Epoch : 4, Loss: 0.012053631246089935


4102it [05:05, 14.19it/s]

Epoch : 4, Loss: 0.020031943917274475


4202it [05:12, 14.12it/s]

Epoch : 4, Loss: 0.011117358691990376


4302it [05:19, 14.26it/s]

Epoch : 4, Loss: 0.023980045691132545


4402it [05:26, 14.24it/s]

Epoch : 4, Loss: 0.055968448519706726


4502it [05:33, 14.09it/s]

Epoch : 4, Loss: 0.05129055678844452


4602it [05:40, 14.29it/s]

Epoch : 4, Loss: 0.005749725736677647


4702it [05:47, 14.21it/s]

Epoch : 4, Loss: 0.04149284213781357


4802it [05:54, 14.19it/s]

Epoch : 4, Loss: 0.0043631489388644695


4902it [06:01, 14.17it/s]

Epoch : 4, Loss: 0.05338739603757858


5002it [06:08, 14.26it/s]

Epoch : 4, Loss: 0.01885504275560379


5102it [06:15, 14.26it/s]

Epoch : 4, Loss: 0.005383392795920372


5202it [06:22, 14.15it/s]

Epoch : 4, Loss: 0.048776041716337204


5302it [06:29, 14.23it/s]

Epoch : 4, Loss: 0.006958618760108948


5402it [06:36, 14.19it/s]

Epoch : 4, Loss: 0.03368891030550003


5484it [06:42, 13.62it/s]


## Model Evaluation and testing 

In [35]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _,data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            
            outputs = model(ids, mask, token_type_ids)

            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [36]:
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

4701it [01:00, 77.26it/s]


Accuracy Score = 0.7612488033187959
F1 Score (Micro) = 0.7855493684639414
F1 Score (Macro) = 0.7629568558609513


4701it [01:00, 78.28it/s]


Accuracy Score = 0.7612488033187959
F1 Score (Micro) = 0.7855493684639414
F1 Score (Macro) = 0.7629568558609513


4701it [01:04, 73.26it/s]


Accuracy Score = 0.7612488033187959
F1 Score (Micro) = 0.7855493684639414
F1 Score (Macro) = 0.7629568558609513


4701it [01:02, 74.90it/s]


Accuracy Score = 0.7612488033187959
F1 Score (Micro) = 0.7855493684639414
F1 Score (Macro) = 0.7629568558609513


4701it [01:02, 75.10it/s]


Accuracy Score = 0.7612488033187959
F1 Score (Micro) = 0.7855493684639414
F1 Score (Macro) = 0.7629568558609513


In [37]:
def predict(input, tokenizer, model, device):
    ''' GOAL OF THIS FUNCTION: 
    This function takes in any given string and converts it into a tokenized version that can be run through the model. '''

    input = " ".join(input.split())
    inputs = tokenizer.encode_plus(
        input,
        None,
        add_special_tokens=True,
        max_length=175,
        pad_to_max_length=True,
        return_token_type_ids=True
    )
    
    ids = torch.tensor(inputs['input_ids'], dtype=torch.long).to(device, dtype = torch.long)
    mask = torch.tensor(inputs['attention_mask'], dtype=torch.long).to(device, dtype= torch.long)
    token_type_ids = torch.tensor(inputs['token_type_ids'], dtype=torch.long).to(device, dtype= torch.long)

    output = model(ids.unsqueeze(0), mask.unsqueeze(0), token_type_ids.unsqueeze(0))

    return output

## Test Predictions with the model 

In [55]:
# To predict, the model is looking for the ids, mask and token_type_ids. Which means whatever we input must be run through the tokenizer first. 
input='Data Engineer'
output = predict(input, tokenizer, model, device)



In [56]:
output_df = pd.DataFrame(output.cpu().detach().numpy().tolist(), columns=label_df.columns)

In [57]:
output_df

Unnamed: 0,11,13,15,17,19,21,23,25,27,29,...,35,37,39,41,43,45,47,49,51,53
0,-6.63002,-5.559723,1.656716,-1.71724,-5.197885,-8.343866,-8.628674,-7.443535,-5.289961,-6.74632,...,-8.617933,-8.822206,-8.15898,-7.410257,-4.932382,-7.290226,-6.880899,-5.678417,-5.013235,-5.465094


In [58]:
# Checking the prediction, which is the value with the highest value. 
output_df.idxmax(axis='columns')

0    15
dtype: object

## MODEL TRAINING PART TWO
Now we train a model that will predict the specific job that this title belongs too. This will have to be 22 indidvidual models that will run based on the group that is chosen. I'm a little worried that the performance will not be as strong in comparison of the main model given that the data for each major group is much smaller in comparison to the main dataset 