# PURPOSE OF THIS NOTEBOOK 

This notebook loads the data prepared for each of the 22 individual models and trains them all. Each model predicts which job a title belongs to, once the first model has predicted the job family the title belongs to. There are specific challenges I still need to work through; in particular, I need to figure out how to benchmark these models.

In [24]:
import pandas as pd
import numpy as np
from tqdm.autonotebook import tqdm
from transformers import BertTokenizer, BertModel, BertConfig
from pathlib import Path
from torch import cuda
import torch

from CommonFunctions import CustomDataset, BERTClass
from sklearn import metrics 
import transformers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

In [25]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## Load in the tokenizer

In [26]:
# Load the pretrained tokenizer for the 'bert-large-uncased' checkpoint.
# The same tokenizer instance is handed to CustomDataset for every group below.
# NOTE(review): presumably BERTClass (CommonFunctions) wraps the same checkpoint —
# confirm, since the tokenizer vocabulary has to match the model.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

In [27]:
# Set the parameters of the models
# MAX_LEN is passed to CustomDataset; presumably the token length each title is
# padded/truncated to — TODO confirm in CommonFunctions.
MAX_LEN = 175
# Batch size used by the training DataLoader.
TRAIN_BATCH_SIZE = 8
# Batch size used by the test/validation DataLoader.
VALID_BATCH_SIZE = 4
# Number of epochs each group's model is trained for.
EPOCHS = 5
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 2e-5

# Training the models 

Just as when generating the data, I need a for loop that builds and trains one model per group.

In [28]:
# Import the training/testing data for a single group (11) plus the lookup tables.
# All paths are relative to the kernel's working directory, so these reads raise
# FileNotFoundError when the notebook is started from a different folder.
test_df = pd.read_csv("../Data/MajorGroupTrainTestSplit/ONET_11_test_df.csv")
train_df = pd.read_csv("../Data/MajorGroupTrainTestSplit/ONET_11_train_df.csv")
# NOTE(review): label_df is not referenced again in the visible notebook — confirm it is still needed.
label_df = pd.read_csv("../Data/label_df.csv")
# Its '# ONET_Group' column supplies the list of groups the training loop iterates over.
onet_group_df = pd.read_csv("../Data/ONET_Group_list.csv")

FileNotFoundError: [Errno 2] No such file or directory: '../Data/MajorGroupTrainTestSplit/ONET_11_test_df.csv'

In [None]:
# Flatten the '# ONET_Group' column into a plain Python list; each entry is later
# interpolated into the per-group train/test file names.
onet_group_list = onet_group_df.loc[:, '# ONET_Group'].tolist()

In [30]:
# Making sure that the label column is actual lists instead of strings of lists
def _parse_label(raw):
    """Turn a stringified label vector such as "[0.0, 1.0]" into a list of floats.

    Args:
        raw: Either the string form read from the CSV, or a list that has
            already been parsed.

    Returns:
        list: The label values as floats. A value that is already a list is
        returned unchanged, so re-running this cell is harmless instead of
        crashing on ``list.split``. An empty "[]" yields an empty list.

    Raises:
        ValueError: If any comma-separated piece is not a valid float.
    """
    if isinstance(raw, list):
        return raw
    inner = raw.strip(' []')
    if not inner:
        return []
    return [float(piece.strip(' []')) for piece in inner.split(',')]


train_df['Label'] = train_df['Label'].apply(_parse_label)
test_df['Label'] = test_df['Label'].apply(_parse_label)

In [None]:
# Define the parameters of the data
# Training batches are reshuffled so every epoch sees the samples in a new order.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0}

# Evaluation only collects predictions and targets for order-independent metrics,
# so shuffling adds nothing; keeping the order fixed makes runs reproducible.
test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': False,
               'num_workers': 0}

## Create the loop to train the models in bulk 

In [13]:
def loss_fn(outputs, targets):
    """Multi-label loss: binary cross-entropy computed on the model's raw outputs.

    The sigmoid is applied inside the loss, which matches ``validation`` applying
    ``torch.sigmoid`` to the same raw outputs.

    Args:
        outputs: Raw model outputs for a batch.
        targets: Float tensor of the same shape holding the 0/1 labels.

    Returns:
        Scalar loss tensor.
    """
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)


def train(epoch):
    """Run one training epoch over the global ``training_loader``.

    Relies on the module-level ``model``, ``optimizer``, ``training_loader`` and
    ``device`` that the per-group loop (re)binds before calling this function.

    Args:
        epoch: Epoch index, used only in the progress message.

    Returns:
        The model outputs for the last batch, or ``None`` when the loader
        yielded no batches.
    """
    model.train()
    outputs = None  # stays None if the loader is empty
    # Wrap the loader itself (not enumerate) so tqdm can show the total batch count.
    for step, data in enumerate(tqdm(training_loader)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if step % 100 == 0:
            print(f'Epoch : {epoch}, Loss: {loss.item()}')

        # Clear the gradients once, right before computing the new ones.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return outputs

In [14]:
def validation(epoch):
    """Score every batch of the global ``testing_loader`` with the global ``model``.

    The model is switched to eval mode and gradients are disabled for the pass.

    Args:
        epoch: Accepted for symmetry with ``train``; not used.

    Returns:
        tuple: ``(probabilities, targets)`` as nested Python lists, one row per
        sample; probabilities are the sigmoid of the model outputs.
    """
    model.eval()
    collected_probs, collected_targets = [], []

    with torch.no_grad():
        for _step, batch in tqdm(enumerate(testing_loader, 0)):
            input_ids = batch['ids'].to(device, dtype=torch.long)
            attention_mask = batch['mask'].to(device, dtype=torch.long)
            segment_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            batch_targets = batch['targets'].to(device, dtype=torch.float)

            logits = model(input_ids, attention_mask, segment_ids)

            # Move everything back to host memory as plain Python lists.
            collected_targets += batch_targets.cpu().detach().numpy().tolist()
            collected_probs += torch.sigmoid(logits).cpu().detach().numpy().tolist()

    return collected_probs, collected_targets

In [None]:
# Train, evaluate and save one model per group.
# `train` and `validation` read the module-level `model`, `optimizer`,
# `training_loader` and `testing_loader`, so those names are rebound on every pass.
for group in onet_group_list:
    # Import the data
    train_df = pd.read_csv(f"../Data/MajorGroupTrainTestSplit/ONET_{group}_train_df.csv")
    test_df = pd.read_csv(f"../Data/MajorGroupTrainTestSplit/ONET_{group}_test_df.csv")

    # Ensuring that the label columns contain lists not strings of lists
    train_df['Label'] = train_df['Label'].apply(lambda s: [float(x.strip(' []')) for x in s.split(',')])
    test_df['Label'] = test_df['Label'].apply(lambda s: [float(x.strip(' []')) for x in s.split(',')])

    # transform the training and testing data into the datasets needed for training
    train_set = CustomDataset(train_df, tokenizer, MAX_LEN)
    test_set = CustomDataset(test_df, tokenizer, MAX_LEN)

    # Run the DataLoader on the train/test set pair
    training_loader = DataLoader(train_set, **train_params)
    testing_loader = DataLoader(test_set, **test_params)

    # load in the model; the output width is the length of one label vector.
    # Positional .iloc[0] is used so this does not depend on the index labels.
    num_labels = len(train_df['Label'].iloc[0])
    model = BERTClass(len=num_labels)
    model.to(device)

    # load in the optimizer
    optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

    # Validate right after each training epoch so the metrics show how the model
    # progresses, instead of re-scoring the finished model EPOCHS times.
    for epoch in range(EPOCHS):
        train(epoch)

        outputs, targets = validation(epoch)
        outputs = np.array(outputs) >= 0.5  # threshold the sigmoid probabilities
        accuracy = metrics.accuracy_score(targets, outputs)
        f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
        f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
        print(f"Group {group} - epoch {epoch}")
        print(f"Accuracy Score = {accuracy}")
        print(f"F1 Score (Micro) = {f1_score_micro}")
        print(f"F1 Score (Macro) = {f1_score_macro}")

    # Lastly save the model for later testing.
    # The whole pickled module is saved (not just the state_dict), so loading it
    # later requires the BERTClass definition to be importable.
    Path('../Data/Models').mkdir(parents=True, exist_ok=True)
    torch.save(model, f'../Data/Models/ONET_Group_{group}_Model')

In [None]:
def predict(input, tokenizer, model, device, max_len=175):
    """Run a single raw string through ``model`` and return the model's output.

    The text is whitespace-normalised, tokenized to exactly ``max_len`` tokens
    (padded or truncated), turned into a batch of one, and passed to the model
    in eval mode with gradient tracking disabled.

    Args:
        input: The text to classify.
        tokenizer: Tokenizer exposing ``encode_plus`` (e.g. the BertTokenizer
            loaded above).
        model: Callable taking ``(ids, mask, token_type_ids)``. It is switched
            to eval mode as a side effect.
        device: Device the input tensors are created on; should be the device
            the model lives on.
        max_len: Token length to pad/truncate to. Defaults to 175, the same
            value as the notebook's ``MAX_LEN``.

    Returns:
        The model output for the one-sample batch, returned unmodified (no
        sigmoid or thresholding is applied here).
    """
    # Collapse any run of whitespace (including newlines) to a single space.
    input = " ".join(input.split())
    inputs = tokenizer.encode_plus(
        input,
        None,
        add_special_tokens=True,
        max_length=max_len,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`;
        # `truncation=True` keeps over-long inputs from exceeding `max_len`.
        padding='max_length',
        truncation=True,
        return_token_type_ids=True
    )

    def _as_batch(key):
        # Build a (1, max_len) long tensor directly on the target device.
        return torch.tensor(inputs[key], dtype=torch.long, device=device).unsqueeze(0)

    # Inference only: switch off dropout and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        output = model(
            _as_batch('input_ids'),
            _as_batch('attention_mask'),
            _as_batch('token_type_ids'),
        )

    return output

In [None]:
# Scratch cell used while developing the loop: reload the group-11 split and
# convert its stringified label vectors into real lists of floats.
test_df = pd.read_csv("../Data/MajorGroupTrainTestSplit/ONET_11_test_df.csv")
train_df = pd.read_csv("../Data/MajorGroupTrainTestSplit/ONET_11_train_df.csv")

for frame in (train_df, test_df):
    frame['Label'] = frame['Label'].apply(
        lambda raw: [float(piece.strip(' []')) for piece in raw.split(',')]
    )

