In [None]:
!pip install transformers

from transformers import TFBertModel,  BertConfig, BertTokenizerFast
import torch
from tqdm.notebook import tqdm
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

from keras.layers import Dropout
import re
from nltk.corpus import stopwords
from nltk import word_tokenize

from bs4 import BeautifulSoup
import plotly.graph_objs as go
import plotly as py

# And pandas for data import + sklearn because you allways need sklearn
from sklearn.model_selection import train_test_split

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 4.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 38.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 40.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.2


In [None]:
#mount my drive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
#load in my already clean dataset 
import pandas as pd
df = pd.read_csv('gdrive/MyDrive/PersonalReason.csv')

In [None]:
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

**LEBELLING THE CLASSES**

In [None]:
possible_labels = df.Reason.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{'Completed': 0,
 'Other personal reasons & dropped out': 1,
 'Other': 2,
 'Gone into employment': 3,
 'Health reasons': 4,
 'Transferred to another provider': 5,
 'Academic failure/left in bad standing/not permitted to progress': 6,
 'Financial reasons': 7,
 'Exclusion': 8}

In [None]:
df['label'] = df.Reason.replace(label_dict)

**SPLITTING AND TRAINING**

Train and Validation Split
Because the labels are imbalanced, we split the data set in a stratified fashion, using this as the class labels.





In [None]:
#splited my data into trianing and testing data, 20% was used for testing
from sklearn.model_selection import train_test_split

X_train, X_val, Y_train, y_test = train_test_split(df.index.values,df.label.values,test_size=0.20, random_state=42)

In [None]:
df['data_type'] = ['not_set']*df.shape[0]

df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

After the split, label distribution will look like this

In [None]:
#grouped data according to their labels and training type.
df.groupby(['Reason', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PersonalStatement
Reason,label,data_type,Unnamed: 3_level_1
Academic failure/left in bad standing/not permitted to progress,6,train,7
Academic failure/left in bad standing/not permitted to progress,6,val,1
Completed,0,train,3837
Completed,0,val,695
Exclusion,8,train,1
Exclusion,8,val,1
Financial reasons,7,train,5
Gone into employment,3,train,24
Gone into employment,3,val,2
Health reasons,4,train,20


**FEATURE EXTRATION AND ENCODING USING BERT**

The process of tokenization divides raw texts into tokens, which are numerical values that represent words.

**SUMARY OF THE CODING PROCESS BELOW**

Building a BERT tokenizer. according to WordPiece.

Create a BERT model configuration that has already been trained to encode our data.

We utilise a function called batch_encode_plus to encode all of the titles from text, and we process the training and validation sets of data individually.

The title text is the first parameter in the aforementioned function.

The sequences will be encoded with the special tokens specific to their model if add_special_tokens=True.

We set return_attention_mask=True while batching sequences so that it will return the attention mask in accordance with the particular tokenizer specified by the max_length attribute.

Additionally, we wish to lengthen each title to a certain limit.

Additionally, we lengthened each title to a certain limit.
In reality, we don't even need to set max length=200; we're simply being cautious.
To return PyTorch, type return tensors='pt'.
The data must then be divided into input_ids, attention_masks, and labels.
We can finally construct training data and validation data once we have the encoded data set.



To return PyTorch, type return tensors='pt'.

The data must then be divided into input ids, attention masks, and labels.

We can then do training  and validation  once we have the encoded data set.

In [None]:
#Berttokenizer and BertForMaskedLM from transformers
from transformers import BertTokenizer, BertForMaskedLM

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].PersonalStatement.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=200, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].PersonalStatement.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=200, 
    return_tensors='pt'
)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [None]:
len(dataset_train), len(dataset_val)


(4080, 721)

**MODELLING**

Pre-trained BERT Model
Since each title is treated as having a distinct sequence, each sequence will be assigned to one of the 8 labels.

A more compact pre-trained model is called bert-base-uncased.
num_labels is used to specify the quantity of output labels.
Output attentions are not really important to us.


In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Downloading pytorch_model.bin:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

**DATA LOADERS**

 A dataset and a sampler are combined by DataLoader, which then offers an iterable over the supplied dataset.
SequentialSampler is used for validation whereas RandomSampler is used for training.
I set batch size=20

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 20

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

In [None]:
#added some hyperparameters
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)



**Scheduler & Optimizer**

An iterable holding the parameters to optimise must be provided to an optimizer in order to construct one. The learning rate, epsilon, and other settings specific to the optimizer can then be specified.
For this data collection, I discovered that epochs=3 works nicely.


In [None]:
epochs = 3
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

Performance METRICS will be the f1 score and accuracy per class.

In [None]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [None]:
import numpy as np

In [None]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cpu


In [None]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [None]:
|#modelling by 3 epochs 
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    model.save_pretrained("/content/drive/MyDrive/Model-Output/")        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}') 

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/204 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.46727554994982246
Validation loss: 0.21971018048557076
F1 Score (Weighted): 0.9462395292163269


Epoch 2:   0%|          | 0/204 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.32989481915080665
Validation loss: 0.21594742605009595
F1 Score (Weighted): 0.9462395292163269


Epoch 3:   0%|          | 0/204 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.3283507509668376
Validation loss: 0.21446298928679647
F1 Score (Weighted): 0.9462395292163269


In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
model.save_pretrained("/content/drive/MyDrive/Model-Output/", map_location=torch.device('cpu'))


In [None]:
_, predictions, true_vals = evaluate(dataloader_validation)


In [None]:
accuracy_per_class(predictions, true_vals)


Class: Completed
Accuracy: 665/695

Class: Other personal reasons & dropped out
Accuracy: 0/12

Class: Other
Accuracy: 0/1

Class: Gone into employment
Accuracy: 0/2

Class: Health reasons
Accuracy: 0/3

Class: Transferred to another provider
Accuracy: 0/6

Class: Academic failure/left in bad standing/not permitted to progress
Accuracy: 0/1

Class: Exclusion
Accuracy: 0/1

