# PURPOSE OF THIS NOTEBOOK
This is the base code to write the functions to extract the raw text from resumes and append to our dataframe. 

TODO: Take the functions into a .py folder and use it as a script 

In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from tokenizers import ByteLevelBPETokenizer
from transformers import BertTokenizer, BertModel
from pathlib import Path
from torch import cuda
import torch

from sklearn import metrics
import transformers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Setting up for GPU 

device = 'cuda' if cuda.is_available() else 'cpu'

## Model Goal

The goal is to train a custom BERT model to attempt to label each incoming job title to their proper ONET code. ONET (Occupational Information Network) is a free database that serves as a standardized taxonomy for jobs. Each job has a respective standardized name and code associated with it. This will make extracting skills quite easy if we can corrrectly translate job titles to the proper ONET code. Using BERT, we can tokenize the job titles and match them with the database of common job titles for each ONET code. For this first version I will be leaving out the actual ONET job names from the training data to compare later with an updated dataset. More information about ONET can be found here: https://www.onetonline.org/

In [3]:
# Import the train/test Data. 
test_df = pd.read_csv("../Data/TestingData.csv")
train_df = pd.read_csv("../Data/Training_Data.csv")

In [4]:
# Check that the test data incoming is correct
test_df.head()

Unnamed: 0,Reported_Jobs,Label
0,Chief Diversity Officer (CDO),"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Operations Vice President (Operations VP),"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Agricultural Services Director,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Bureau Chief,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Business Development Executive,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [5]:
# Check that the training data incoming is correct 
train_df.head()

Unnamed: 0,Reported_Jobs,Label
0,Road Commissioner,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Tax Commissioner,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Deputy Insurance Commissioner,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,School Commissioner,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Aeronautics Commission Director,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [6]:
train_df['Label'] = train_df['Label'].apply(lambda s: [float(x.strip(' []')) for x in s.split(',')])

In [7]:
test_df['Label'] = test_df['Label'].apply(lambda s: [float(x.strip(' []')) for x in s.split(',')])

## The goal of the tokenizer

The tokenizer will be tokenizing the job titles and the reported job titles, I don't know if I should do these seperately or together. In theory, I should have these pairings be tokenized together. **I SHOULD READ INTO THE TOKENIZER TO UNDERSTAND HOW I SHOULD APPROACH THIS** 

From looking at the reference code, I've learned that we need to follow these steps: 
1. Start with a train test split. **70% for the training data**, I will do the split based on the **70%** of each reported job title/ONET pairing.
    - Given the refernece notebook uses a dictionary as the input data and we are working with a dataframe instead, some major changes will be needed to be made in order for this to work. I don't think this would be difficult at all. Just need to translate the dictionary work to the dataframe. **I also need to confirm if the model input requires a list, dict, or dataframe object.**
2. Run the tokenizer on the training set 
3. Set up the model training and evaluation metrics.


In [8]:
# Load in the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [9]:
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05

In [10]:
# Start by tokenizing the data.
# Will be using the class statment and slowly converting it for our needs 
class CustomDataset(Dataset):
    
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.reported_jobs = dataframe.Reported_Jobs
        self.targets = self.data.Label
        self.max_len = max_len
    
    def __len__(self):
        return len(self.reported_jobs)
    
    def __getitem__(self, index):
        # This can be done with a for loop 
        reported_job = str(self.reported_jobs[index])
        reported_job = " ".join(reported_job.split())

        inputs = self.tokenizer.encode_plus(
            reported_job,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [11]:
# Running the tokenizer and shaping the dataframes for the model
train_set = CustomDataset(train_df, tokenizer, max_len=200)
test_set = CustomDataset(test_df, tokenizer, max_len=200)

In [12]:
# Setting the train and test parameters 
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0}

test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': True,
               'num_workers': 0}

training_loader = DataLoader(train_set, **train_params)
testing_loader = DataLoader(test_set, **test_params)

In [13]:
# Creatring the custom model

class BERTClass(torch.nn.Module):
    def __init__(self):
        # Defining the layers
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 6)
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        output_2 = self.l2(output_1)
        output_3 = self.l3(output_2)
        return output_3

model = BERTClass()
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [14]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)

# Fine tuning

In [15]:
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss= loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch : {epoch}, Loss: {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [16]:
for epoch in range(EPOCHS):
    train(epoch)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


TypeError: dropout(): argument 'input' (position 1) must be Tensor, not str

In [5]:
# This is text extraction, Code will be deprecated for now until I find use for it later. 
text_data = []
file_count=0

for sample in tqdm(train_df['Reported_Jobs']):
    text_data.append(sample)
    if len(text_data) == 10_000:
        # once we git the 100K mark, save to file
        with open(f'../Data/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1

100%|██████████| 31166/31166 [00:00<00:00, 4777591.41it/s]


In [6]:
# Created the data for the tokenizer, now to 

paths = [str(x) for x in Path('../Data/').glob('**/*.txt')]

tokenizer = ByteLevelBPETokenizer()

In [7]:
# Train the tokenizer

tokenizer.train(files=paths[:5], vocab_size=30_522, min_frequency=2, special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])

# Save the tokenizer 
tokenizer.save_model(directory="../Data")






['../Data/vocab.json', '../Data/merges.txt']

In [8]:
from transformers import RobertaTokenizer

# initialize the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('../Data', max_len=512)

## The validation data. 

Since this dataset that I'm using to test does not have ONET codes attached to them, I would have to source another dataset that does. Thankfully ONET does have such data that I can use for training. For each ONET code there's common job names appended to them, I could use that data with the pretained model. That being said this creates some challenges. 
1. For obsure and odd job names that are not listed in the common job names will almost guarentee bad performance. Given that the goal is to prove that we can get generally good performance with publicly available data, this is something I'm willing to sacrifice if it means that it will work correctly at least 80% of the time. 

I have sourced a table with common names linked with each onet code. I can use this data on the pretrained model to see how it classifies each of the common names linked with ONET codes. 

## Next steps

After loading in the pretrained model, I want to load the data into this model and test the performance while holding back around 30% of the data for testing. After I complete the testing phase of the model performance, I will start the next step and pull, or train a model or a script to pull the job titles from a resume and apply into the model to classify into an appropriote ONET code. 

In [9]:
def masking(tensor):
    rand = torch.rand(tensor.shape)
    mask_arr = (rand < 0.15) * (tensor > 2)
    for i in range(tensor.shape[0]):
        selection = torch.flatten(mask_arr[i].nonzero()).tolist()
        tensor[i, selection] = 4
    return tensor

In [10]:
from tqdm.auto import tqdm 

input_ids = []
mask = []
labels = []

for path in tqdm(paths):
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    sample = tokenizer(lines, max_length=512, padding='max_length', truncation=True, return_tensors='pt')
    labels.append(sample.input_ids)
    mask.append(sample.attention_mask)
    input_ids.append(masking(sample.input_ids.detach().clone()))

100%|██████████| 4/4 [00:04<00:00,  1.24s/it]


In [11]:
input_ids = torch.cat(input_ids)
labels = torch.cat(labels)
mask = torch.cat(mask)

In [12]:
encodings = {
    'input_ids' : input_ids,
    'attention_mask' : mask,
    'labels': labels
}

In [13]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    
    def __len__(self):
        return self.encodings['input_ids'].shape[0]
    
    def __getitem__(self, i):
        return {key: tensor[i] for key, tensor in self.encodings.items()}

In [14]:
dataset = Dataset(encodings)

In [15]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

## Train the Model



In [16]:
from transformers import RobertaConfig

In [17]:
config = RobertaConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=3,
    type_vocab_size=1
)

In [18]:
from transformers import RobertaForMaskedLM

In [19]:
model = RobertaForMaskedLM(config)

In [20]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [21]:
model.to(device)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(13270, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-2): 3 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [22]:
from transformers import AdamW

In [23]:
optim = AdamW(model.parameters(), lr=1e-5)



In [26]:
epochs = 2

for epoch in range(epochs):
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        optim.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        # extract loss
        loss = outputs.loss

        loss.backward()

        optim.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())


Epoch 0: 100%|██████████| 2689/2689 [22:51<00:00,  1.96it/s, loss=0.0245]
Epoch 1: 100%|██████████| 2689/2689 [22:40<00:00,  1.98it/s, loss=0.0198] 


In [27]:
model.save_pretrained('../Data')

## File mask testing

In [28]:
from transformers import pipeline 

fill = pipeline('fill-mask', model='../Data', tokenizer='../Data')

In [38]:
fill(f'Chief {fill.tokenizer.mask_token}')

[{'score': 0.022944310680031776,
  'token': 293,
  'token_str': ' Operator',
  'sequence': 'Chief Operator'},
 {'score': 0.02019713632762432,
  'token': 331,
  'token_str': ' Technician',
  'sequence': 'Chief Technician'},
 {'score': 0.008666190318763256,
  'token': 370,
  'token_str': ' Specialist',
  'sequence': 'Chief Specialist'},
 {'score': 0.007749341428279877,
  'token': 380,
  'token_str': ' Manager',
  'sequence': 'Chief Manager'},
 {'score': 0.0073532359674572945,
  'token': 359,
  'token_str': ' Supervisor',
  'sequence': 'Chief Supervisor'}]