In [None]:
#libraries and packages
#!pip install torch
import os
import torch
import pandas as pd
from tqdm.notebook import tqdm

import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

import regex as re
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import AdamW, get_linear_schedule_with_warmup

import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score

import random

In [2]:
# NLTK resources shared by every transform_text call; filled on first use.
_TRANSFORM_TEXT_CACHE = {}


def _transform_text_resources():
    """Return the cached (stopword set, punctuation set, lemmatizer) triple."""
    if not _TRANSFORM_TEXT_CACHE:
        # Build everything first so a failed corpus lookup leaves the cache empty.
        resources = {
            'stopwords': set(stopwords.words('english')),
            'punctuation': set(string.punctuation),
            'lemmatizer': WordNetLemmatizer(),
        }
        _TRANSFORM_TEXT_CACHE.update(resources)
    return (_TRANSFORM_TEXT_CACHE['stopwords'],
            _TRANSFORM_TEXT_CACHE['punctuation'],
            _TRANSFORM_TEXT_CACHE['lemmatizer'])


def transform_text(text):
    """Normalise one raw tweet into a space-separated string of lemmas.

    Steps, in order: lowercase; strip URLs, HTML entities (``&amp;``),
    hashtags, @-mentions and ``$`` tickers; drop words of one or two
    characters; collapse whitespace; tokenize with NLTK; keep alphanumeric
    tokens only; drop English stopwords and punctuation; lemmatize with
    WordNet.

    Args:
        text: the raw tweet. Any non-string value (``None``, or the float NaN
            pandas uses for a missing cell) is treated as an empty tweet.

    Returns:
        The cleaned text as a single string; ``''`` when nothing survives the
        filtering or when ``text`` is not a string.
    """
    # A missing cell arrives as float NaN and has no .lower(); treat it as empty
    # so that one bad row does not abort a whole DataFrame.apply().
    if not isinstance(text, str):
        return ''

    # Convert text to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove HTML entities, hashtags and @-mentions
    text = re.sub(r'\&\w*;|#\w*|@\w*', '', text)
    # Remove $-prefixed tickers
    text = re.sub(r'\$\w*', '', text)
    # Second URL pass: removing an entity above can splice a new "http://" together
    text = re.sub(r'https?:\/\/.*\/\w*', '', text)
    # Remove words of one or two characters, then collapse runs of whitespace
    text = re.sub(r'\b\w{1,2}\b', '', text)
    text = re.sub(r'\s\s+', ' ', text).strip()

    # Tokenize text into words
    words = nltk.word_tokenize(text)

    # The stopword list and lemmatizer are loop-invariant, so they are built once
    # and reused instead of being reloaded for every row.
    stopwords_set, punctuation_set, lemmatizer = _transform_text_resources()

    # Keep alphanumeric tokens that are neither stopwords nor punctuation
    words = [word for word in words
             if word.isalnum()
             and word not in stopwords_set
             and word not in punctuation_set]

    # Lemmatize words, join them into a string and return
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

In [3]:
#f1 score
def f1_score_func(preds, labels):
    """Compute weighted F1, precision and recall from per-class scores.

    Args:
        preds: array of scores with the class axis at position 1; the
            predicted class of each row is the argmax along that axis.
        labels: array of true class ids; it is flattened before scoring.

    Returns:
        A ``(f1, precision, recall)`` tuple, each averaged with
        ``average='weighted'``.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()

    # All three metrics share the same averaging mode.
    averaging = {'average': 'weighted'}
    return (
        f1_score(true_classes, predicted_classes, **averaging),
        precision_score(true_classes, predicted_classes, **averaging),
        recall_score(true_classes, predicted_classes, **averaging),
    )

In [4]:
def accuracy_per_class(preds, labels, label_map=None):
    """Print confusion counts, precision, recall and F1 for every true class.

    Args:
        preds: array of scores with the class axis at position 1; the
            predicted class of each row is the argmax along that axis.
        labels: array of true integer class ids; it is flattened first.
        label_map: optional mapping of class name -> integer id. When omitted
            the notebook-level ``label_dict`` is used, which is the original
            behaviour.

    Returns:
        None. One report per class id present in ``labels`` is printed.
    """
    if label_map is None:
        # Fall back to the global built by the training cell.
        label_map = label_dict
    id_to_name = {class_id: name for name, class_id in label_map.items()}

    # Convert predictions to class labels
    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        # One-vs-rest masks for the current class
        predicted_here = predicted == class_id
        actual_here = actual == class_id

        # Calculate TP, TN, FP, FN
        TP = np.sum(predicted_here & actual_here)
        TN = np.sum(~predicted_here & ~actual_here)
        FP = np.sum(predicted_here & ~actual_here)
        FN = np.sum(~predicted_here & actual_here)

        # Avoid division by zero
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        # Named class_f1 so the imported sklearn f1_score is not shadowed
        class_f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0

        # An id missing from the mapping is reported by its raw value
        class_name = id_to_name.get(class_id, class_id)

        # Print metrics for the class
        print(f'Class: {class_name}')
        print(f'TP: {TP}, TN: {TN}, FP: {FP}, FN: {FN}')
        print(f'Precision: {precision:.2f}')
        print(f'Recall: {recall:.2f}')
        print(f'F1 Score: {class_f1:.2f}\n')

In [5]:
def evaluate(dataloader_val):
    """Score the global ``model`` on every batch of a dataloader.

    The model is put in eval mode and the forward passes run under
    ``torch.no_grad()``. Each batch is read as
    ``(input_ids, attention_mask, labels)`` and moved to the global
    ``device``.

    Args:
        dataloader_val: iterable of batches; ``len()`` of it is used as the
            divisor of the average loss.

    Returns:
        ``(average_loss, logits, labels)`` where the last two are numpy
        arrays concatenated over all batches along axis 0.
    """
    #evaluation mode disables the dropout layer
    model.eval()

    #running totals across batches
    running_loss = 0
    batch_logits = []
    batch_labels = []

    for batch in tqdm(dataloader_val):
        #move every tensor of the batch onto the target device
        on_device = tuple(tensor.to(device) for tensor in batch)

        #forward pass without building the autograd graph
        with torch.no_grad():
            outputs = model(input_ids=on_device[0],
                            attention_mask=on_device[1],
                            labels=on_device[2])

        loss, logits = outputs[0], outputs[1]
        #a non-scalar loss (one value per replica) is averaged to a scalar
        if loss.dim() > 0:
            loss = loss.mean()
        running_loss += loss.item()

        #collect raw logits and true labels on the CPU
        batch_logits.append(logits.detach().cpu().numpy())
        batch_labels.append(on_device[2].cpu().numpy())

    #average loss per batch, then stitch the per-batch arrays together
    return (running_loss / len(dataloader_val),
            np.concatenate(batch_logits, axis=0),
            np.concatenate(batch_labels, axis=0))

In [None]:
# Fine-tune and test one BERT sentiment classifier per dataset folder.
# Every sub-directory of data_dir is expected to hold train.csv, val.csv and
# test.csv with the columns UserName, OriginalTweet and Sentiment.
# `model`, `device` and `label_dict` are deliberately left as notebook globals
# because evaluate() and accuracy_per_class() read them.
data_dir = "data/"


def _encode_texts(tokenizer, texts, max_length = 150):
    """Tokenize an iterable of strings into fixed-length BERT input tensors."""
    # padding='max_length' replaces the deprecated pad_to_max_length flag;
    # truncation=True keeps over-long tweets from producing ragged tensors.
    return tokenizer(list(texts),
                     add_special_tokens = True,
                     return_attention_mask = True,
                     padding = 'max_length',
                     truncation = True,
                     max_length = max_length,
                     return_tensors = 'pt')


# feature tokenizer (identical for every folder, so it is loaded once)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case = True)

# sorted() makes the processing order independent of the file system
for folder_name in sorted(os.listdir(data_dir)):
    folder_path = os.path.join(data_dir, folder_name)

    # Only directories are datasets; stray files in data_dir are ignored
    if not os.path.isdir(folder_path):
        continue

    # Define file paths
    train_file = os.path.join(folder_path, "train.csv")
    dev_file = os.path.join(folder_path, "val.csv")
    test_file = os.path.join(folder_path, "test.csv")

    # Folders without the three splits (e.g. .ipynb_checkpoints) are skipped
    if not all(os.path.isfile(path) for path in (train_file, dev_file, test_file)):
        print(f"Skipping folder {folder_name}: train.csv, val.csv or test.csv is missing")
        continue

    print("=====================================Processing folder:========================================")
    print("=====================================Processing folder:========================================")
    print("=====================================Processing folder:========================================")
    print(f"Processing folder: {folder_name}")

    # Load the data
    df_train = pd.read_csv(train_file)
    df_dev = pd.read_csv(dev_file)
    df_test = pd.read_csv(test_file)

    #Data processing
    df_train.set_index('UserName', inplace = True)
    df_dev.set_index('UserName', inplace = True)
    df_test.set_index('UserName', inplace = True)
    # Apply preprocessing to the text column
    df_train['OriginalTweet'] = df_train['OriginalTweet'].apply(transform_text)
    df_dev['OriginalTweet'] = df_dev['OriginalTweet'].apply(transform_text)
    df_test['OriginalTweet'] = df_test['OriginalTweet'].apply(transform_text)
    print(df_train.head())

    #store classes into an array
    possible_labels = df_train.Sentiment.unique()

    #convert labels into numeric values (ids follow first appearance in train)
    label_dict = {possible_label: index
                  for index, possible_label in enumerate(possible_labels)}

    for split_name, df_split in (('train', df_train), ('val', df_dev), ('test', df_test)):
        # map() turns a label that never occurs in train into NaN, so it can be
        # reported clearly instead of surviving as a string and failing later
        # inside torch.tensor()
        encoded_labels = df_split.Sentiment.map(label_dict)
        if encoded_labels.isna().any():
            unseen = df_split.loc[encoded_labels.isna(), 'Sentiment'].unique().tolist()
            raise ValueError(
                f"{folder_name}/{split_name}: Sentiment values missing from the "
                f"training split: {unseen}")
        df_split['label'] = encoded_labels.astype('int64')
        #create new column naming the split
        df_split['data_type'] = split_name

    X_train = df_train["OriginalTweet"]
    y_train = df_train["Sentiment"]

    X_dev = df_dev["OriginalTweet"]
    y_dev = df_dev["Sentiment"]

    X_test = df_test["OriginalTweet"]
    y_test = df_test["Sentiment"]

    #tokenize the three splits with identical settings
    encoded_data_train = _encode_texts(tokenizer, df_train.OriginalTweet.values)
    encoded_data_val = _encode_texts(tokenizer, df_dev.OriginalTweet.values)
    encoded_data_test = _encode_texts(tokenizer, df_test.OriginalTweet.values)

    #encode train set
    input_ids_train = encoded_data_train['input_ids']
    attention_masks_train = encoded_data_train['attention_mask']
    labels_train = torch.tensor(df_train.label.values)

    #encode val set
    input_ids_val = encoded_data_val['input_ids']
    attention_masks_val = encoded_data_val['attention_mask']
    labels_val = torch.tensor(df_dev.label.values)

    #encode test set
    input_ids_test = encoded_data_test['input_ids']
    attention_masks_test = encoded_data_test['attention_mask']
    labels_test = torch.tensor(df_test.label.values)

    #create datasets
    dataset_train = TensorDataset(input_ids_train,
                                  attention_masks_train,
                                  labels_train)

    dataset_val = TensorDataset(input_ids_val,
                                attention_masks_val,
                                labels_val)

    dataset_test = TensorDataset(input_ids_test,
                                 attention_masks_test,
                                 labels_test)

    #seed every RNG BEFORE the model is built, so the randomly initialised
    #classification head is reproducible as well as the batch shuffling
    seed_val = 17
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    #load pre-trained BERT
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                          num_labels = len(label_dict),
                                                          output_attentions = False,
                                                          output_hidden_states = False)
    batch_size = 32

    #load train set (shuffled)
    dataloader_train = DataLoader(dataset_train,
                                  sampler = RandomSampler(dataset_train),
                                  batch_size = batch_size)

    #load val and test sets; evaluation gains nothing from shuffling
    dataloader_val = DataLoader(dataset_val,
                                sampler = SequentialSampler(dataset_val),
                                batch_size = 32)

    dataloader_test = DataLoader(dataset_test,
                                 sampler = SequentialSampler(dataset_test),
                                 batch_size = 32)
    epochs = 30

    #load optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)
    #load scheduler: linear decay over every optimizer step, no warm-up
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0,
                                                num_training_steps = len(dataloader_train)*epochs)

    print("==============================GPU number:===============================================")
    print("==============================GPU number:===============================================")
    print("==============================GPU number:===============================================")
    print("GPU number: ")
    print(torch.cuda.device_count())

    # Single-process multi-GPU via DataParallel (not DistributedDataParallel)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    model.to(device)

    best_val_loss = float('inf')
    #the checkpoint path does not change between epochs
    save_path = os.path.join(folder_path, 'best_model.pth')

    for epoch in tqdm(range(1, epochs + 1)):
        #set model in train mode
        model.train()

        #tracking variable
        loss_train_total = 0

        #set up progress bar
        progress_bar = tqdm(dataloader_train,
                            desc='Epoch {:1d}'.format(epoch),
                            leave=False,
                            disable=False)

        for batch in progress_bar:
            #set gradient to 0
            model.zero_grad()

            #load into GPU
            batch = tuple(b.to(device) for b in batch)

            #define inputs
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'labels': batch[2]}

            outputs = model(**inputs)
            loss = outputs[0] #output.loss
            #DataParallel returns one loss per replica; reduce to a scalar
            if loss.dim() > 0:
                loss = loss.mean()
            loss_train_total += loss.item()

            #backward pass to get gradients
            loss.backward()

            #clip the norm of the gradients to 1.0 to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            #update optimizer
            optimizer.step()

            #update scheduler
            scheduler.step()

            #the loss is shown as-is; len(batch) is the tuple length (3), not
            #the batch size, so dividing by it displayed a meaningless number
            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

        tqdm.write(f'\nEpoch {epoch}')

        #print training result
        loss_train_avg = loss_train_total/len(dataloader_train)
        tqdm.write(f'Training loss: {loss_train_avg}')

        #evaluate
        val_loss, predictions, true_vals = evaluate(dataloader_val)

        #keep only the checkpoint with the lowest validation loss
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save({
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'epoch': epoch,
                    'best_val_loss': best_val_loss,
                }, save_path)

        #f1 score
        val_f1, val_precision, val_recall = f1_score_func(predictions, true_vals)
        tqdm.write(f'Validation loss: {val_loss}')
        tqdm.write(f'precision (weighted): {val_precision}')
        tqdm.write(f'recall (weighted): {val_recall}')
        tqdm.write(f'F1 Score (weighted): {val_f1}')

    # evaluation
    # Load the best checkpoint back into the same (possibly DataParallel-wrapped)
    # model object that saved it; map_location keeps this working when the
    # checkpoint was written on a different device
    checkpoint = torch.load(save_path, map_location = device)

    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    best_val_loss = checkpoint['best_val_loss']

    model.eval()

    #these numbers come from the held-out test split
    val_loss, predictions, true_vals = evaluate(dataloader_test)
    #f1 score
    val_f1, val_precision, val_recall = f1_score_func(predictions, true_vals)
    print(f'Test loss: {val_loss}')
    print(f'precision (weighted): {val_precision}')
    print(f'recall (weighted): {val_recall}')
    print(f'F1 Score (weighted): {val_f1}')

    #append one header-less row per folder to the shared results file
    new_data = {
            "Name": [folder_name],
            "precision": [val_precision],
            "recall": [val_recall],
            "F1": [val_f1],
        }
    new_row = pd.DataFrame(new_data)
    csv_filename = "BERT_results.csv"
    new_row.to_csv(csv_filename, mode='a', index=False, header=False)

    #per-class report, if wanted: accuracy_per_class(predictions, true_vals)