**Final Project: Training BERT classifier** </br>
The program trains the BERT Classifier to read text and categorise it as either related or unrelated to shift-dropping.</br>
The training dataset comprises email bodies obtained from jackline.w.gathoni.24@dartmouth.edu and dahlia.igiraneza.24@dartmouth.edu, appropriately labeled as either 'Related' or 'Unrelated' based on the email's relevance to shift dropping. </br>
Calendar credentials and events are from jackline.w.gathoni.24@dartmouth.edu </br>
Dartmouth College, LING48, Spring 2023  </br>
Jackline Gathoni (jackline.w.gathoni.24@dartmouth.edu) </br>
Dahlia Igiraneza (dahlia.igiraneza.24@dartmouth.edu)  </br>
Paige Nakai (paige.m.nakai.24@dartmouth.edu)  </br>
The code for obtaining calendar credentials is sourced from: https://developers.google.com/calendar/api/quickstart/python </br>





In [None]:
!pip install transformers

In [1]:
import pandas as pd
import re
import torch
import numpy as np
import transformers
from transformers import BertTokenizer
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm
import nltk
from sklearn import metrics

ModuleNotFoundError: No module named 'torch'

Load the dataset

In [None]:
# Location of the labelled e-mail CSV (a Colab "/content" upload path).
datapath = "/content/CS72 Project_Emails_training_dataset - Sheet1.csv"

# "latin1" is the codec name; the stray leading apostrophe has been removed so
# the lookup does not depend on codec-name normalisation.
df = pd.read_csv(datapath, encoding="latin1")

# Discard every row that has a missing value in any column.
df = df.dropna()

# Sanity probe for remaining nulls. The result is not the last expression of
# the cell, so it is not displayed.
df.isnull().values.any()

# Shuffle the rows (unseeded, so the order differs from run to run).
df = df.sample(frac=1)
df.head()

In [None]:
# Count the rows under each 'Category' value and draw the counts as a bar chart.
category_counts = df.groupby(['Category']).size()
category_counts.plot.bar()

In [None]:
def preprocess_text(sen):
    """Clean one e-mail body and return the cleaned string.

    The steps run in this order:
      1. strip HTML tags with ``remove_tags``;
      2. replace every character that is not an ASCII letter with a space;
      3. replace a single letter surrounded by whitespace with one space;
      4. collapse each run of whitespace into one space.
    """
    # (pattern, replacement) pairs applied one after another, in order.
    substitutions = (
        ('[^a-zA-Z]', ' '),
        (r"\s+[a-zA-Z]\s+", ' '),
        (r'\s+', ' '),
    )

    cleaned = remove_tags(sen)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [None]:
# One HTML-style tag: '<', at least one character that is not '>', then '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every match of ``TAG_RE`` deleted."""
    without_tags = re.sub(TAG_RE, '', text)
    return without_tags

In [None]:
# Run the cleaning routine over every entry of the 'Emails' column.
sentences = list(df["Emails"])
emails = [preprocess_text(sen) for sen in sentences]

In [None]:
# Tokenizer loaded for the 'bert-base-cased' checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Category name -> integer class id ('Related' is 0, 'Unrelated' is 1).
labels = {
    name: class_id
    for class_id, name in enumerate(('Related', 'Unrelated'))
}

class Dataset(torch.utils.data.Dataset):
    """Tokenized e-mails paired with their integer class ids.

    Built from a DataFrame that has 'Emails' and 'Category' columns. Uses the
    module-level ``tokenizer`` and ``labels`` mapping.
    """

    def __init__(self, df):
        # Translate every category name into its integer id.
        self.labels = [labels[category] for category in df['Category']]

        # Encode each e-mail with padding/truncation to 512 tokens and
        # PyTorch tensors as output.
        encoded_texts = []
        for email in df['Emails']:
            encoding = tokenizer(
                email,
                padding='max_length',
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )
            encoded_texts.append(encoding)
        self.texts = encoded_texts

    def classes(self):
        """Return the list of integer class ids."""
        return self.labels

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) at ``idx`` wrapped in a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the stored tokenizer output at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(tokenizer output, label array)`` pair at ``idx``."""
        encoded = self.get_batch_texts(idx)
        target = self.get_batch_labels(idx)
        return encoded, target


In [None]:
class BertClassifier(nn.Module):
    """'bert-base-cased' encoder followed by dropout, a linear head and ReLU.

    Args:
        dropout: Drop probability applied to BERT's pooled output.
        num_classes: Width of the output layer. Defaults to 5, the value that
            used to be hard-coded, so existing callers get the same output
            shape. NOTE(review): the ``labels`` mapping in this notebook has
            only two categories, so ``num_classes=2`` looks like the intended
            setting -- confirm and pass it explicitly.
    """

    def __init__(self, dropout=0.5, num_classes=5):

        super().__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        # Take the input width from the loaded checkpoint's config (768 for
        # 'bert-base-cased') instead of repeating the literal.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return one row of ``num_classes`` non-negative scores per example.

        Args:
            input_id: Token ids, passed to BERT as ``input_ids``.
            mask: Attention mask, passed to BERT as ``attention_mask``.
        """
        # With return_dict=False BERT returns a tuple; only its second
        # element (the pooled output) is used here.
        _, pooled_output = self.bert(
            input_ids=input_id, attention_mask=mask, return_dict=False
        )
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        # NOTE(review): ReLU clamps negative scores to zero before they reach
        # the loss. Kept to preserve existing behaviour -- confirm it is
        # intended.
        final_layer = self.relu(linear_output)

        return final_layer

In [None]:
np.random.seed(112)

# Shuffle once with a fixed seed, then cut at the 80 % and 90 % marks to get
# the train / validation / test frames. Positional slicing replaces np.split,
# which goes through the deprecated DataFrame.swapaxes when given a DataFrame.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(.8 * len(df))
val_end = int(.9 * len(df))

df_train = shuffled.iloc[:train_end]
df_val = shuffled.iloc[train_end:val_end]
df_test = shuffled.iloc[val_end:]

print(len(df_train), len(df_val), len(df_test))
print(type(df_train))

In [None]:

def _forward_batch(model, criterion, batch_input, batch_label, device):
    """Run one batch through ``model``; return (loss, correct count, batch size)."""
    batch_label = batch_label.to(device).long()
    # Each stored encoding carries a leading dimension of 1, so a collated
    # batch has an extra axis at position 1. Drop it from the mask as well as
    # from the ids so both reach the model with the same shape.
    mask = batch_input['attention_mask'].squeeze(1).to(device)
    input_id = batch_input['input_ids'].squeeze(1).to(device)

    output = model(input_id, mask)

    batch_loss = criterion(output, batch_label)
    n_correct = (output.argmax(dim=1) == batch_label).sum().item()
    return batch_loss, n_correct, batch_label.size(0)


def train(model, train_data, val_data, learning_rate, epochs):
    """Fine-tune ``model`` on ``train_data`` and print metrics after each epoch.

    Args:
        model: ``nn.Module`` called as ``model(input_ids, attention_mask)``
            that returns one row of class scores per example.
        train_data: DataFrame handed to ``Dataset`` for the training split.
        val_data: DataFrame handed to ``Dataset`` for the validation split.
        learning_rate: Learning rate for the Adam optimizer.
        epochs: Number of passes over ``train_data``.

    Returns:
        None. One summary line is printed per epoch. The model is trained in
        place and is left in evaluation mode after the last validation pass.
    """
    train_dataset, val_dataset = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=2)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    # Denominators for the per-example averages. max(..., 1) keeps an empty
    # split from raising ZeroDivisionError in the summary line.
    n_train = max(len(train_data), 1)
    n_val = max(len(val_data), 1)

    # Gap between the fields of the summary line (keeps the printed layout
    # this function has always produced).
    gap = " " * 17

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        # Training mode: dropout layers are active.
        model.train()

        for train_input, train_label in tqdm(train_dataloader):

            batch_loss, n_correct, batch_size = _forward_batch(
                model, criterion, train_input, train_label, device
            )

            # The criterion returns the batch mean. Weighting by batch size
            # makes the division by the example count a per-example mean.
            total_loss_train += batch_loss.item() * batch_size
            total_acc_train += n_correct

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        # Evaluation mode: dropout is switched off so the validation metrics
        # are not perturbed by random masking.
        model.eval()

        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                batch_loss, n_correct, batch_size = _forward_batch(
                    model, criterion, val_input, val_label, device
                )

                total_loss_val += batch_loss.item() * batch_size
                total_acc_val += n_correct

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / n_train: .3f}{gap}'
            f'| Train Accuracy: {total_acc_train / n_train: .3f}{gap}'
            f'| Val Loss: {total_loss_val / n_val: .3f}{gap}'
            f'| Val Accuracy: {total_acc_val / n_val: .3f}')
                  
# Settings for the fine-tuning run.
LR = 1e-6
EPOCHS = 5

# Build the classifier and train it on the train / validation frames.
model = BertClassifier()
train(
    model=model,
    train_data=df_train,
    val_data=df_val,
    learning_rate=LR,
    epochs=EPOCHS,
)


In [None]:
def evaluate(model, test_data):
    """Print the accuracy of ``model`` on ``test_data``.

    Args:
        model: ``nn.Module`` called as ``model(input_ids, attention_mask)``
            that returns one row of class scores per example.
        test_data: DataFrame handed to ``Dataset`` for the test split.

    Returns:
        None. The accuracy is printed. The model is left in evaluation mode.
    """
    test_dataset = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=2)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Evaluation mode switches dropout off, so the reported accuracy does not
    # vary with random masking.
    model.eval()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

            test_label = test_label.to(device)

            # Drop the extra axis at position 1 from the mask as well as from
            # the ids so both reach the model with the same shape.
            mask = test_input['attention_mask'].squeeze(1).to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            total_acc_test += (output.argmax(dim=1) == test_label).sum().item()

    # max(..., 1) keeps an empty test frame from raising ZeroDivisionError.
    print(f'Test Accuracy: {total_acc_test / max(len(test_data), 1): .3f}')

    # TODO: a confusion matrix (sklearn.metrics.confusion_matrix) would need
    # the per-batch label and prediction tensors flattened into plain integer
    # lists, with integer class ids as ``labels``.
    
# Report accuracy on the held-out test frame.
evaluate(model=model, test_data=df_test)

In [7]:
def preprocessToken(input_text, tokenizer):
    """Encode ``input_text`` with ``tokenizer`` and return the encoding.

    Args:
        input_text: Text to encode.
        tokenizer: Object that provides an ``encode_plus`` method taking the
            keyword options used below (e.g. the ``BertTokenizer`` this file
            loads).

    Returns:
        The value returned by ``tokenizer.encode_plus``. PyTorch tensors are
        requested (``return_tensors='pt'``), including the attention mask.
    """
    return tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=512,
        # Ask for truncation explicitly so max_length is enforced; this also
        # matches the options the training Dataset passes to the tokenizer.
        truncation=True,
        padding='longest',
        return_attention_mask=True,
        return_tensors='pt',
    )