In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import gc
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import torch
from torch import nn
import torch.nn.functional as F
from transformers import get_linear_schedule_with_warmup, AdamW, BertModel, BertTokenizer
from tqdm import tqdm

In [None]:
# Load the hand-labeled stance data and the unlabeled pool that is pseudo-labeled later.
# NOTE(review): later cells read the 'text', 'stance' and 'index1' columns of these
# frames — confirm both CSV files provide the ones they need.
df_main = pd.read_csv('./labeled.csv')
df_unlabeled = pd.read_csv('./unlabeled.csv')
# Disabled: drop rows without a stance label and expose the row index as a column.
#df_main = df_main[df_main['stance'].notna()]
#df_main['index'] = df_main.index

In [None]:
# Preview the first rows of the labeled data.
df_main.head()

In [None]:
def clean_text(text):
    """Normalize one raw text string before tokenization.

    Steps, in order: drop e-mail addresses and URLs, drop ``&amp`` HTML
    entities, drop digits and non-ASCII characters, drop punctuation and
    underscores, collapse whitespace runs to a single space, trim, lowercase.

    Args:
        text: The raw text. Non-string values (for example NaN coming from an
            empty CSV cell) are treated as empty text instead of crashing.

    Returns:
        The cleaned, lowercase string; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    # Remove emails
    text = re.sub(r'\S+@\S+', '', text)

    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)

    # Remove HTML-escaped ampersands while the '&' is still present; once the
    # punctuation pass has stripped '&', only a stray "amp" would be left.
    text = re.sub(r'&amp;?', '', text)

    # Remove numbers and non-ASCII characters
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\x00-\x7F]', '', text)

    # Remove punctuation and special characters ('-' included). '\w' matches
    # '_', so underscores have to be stripped separately.
    text = re.sub(r'[^\w\s]', '', text)
    text = text.replace('_', '')

    # Collapse whitespace last so gaps opened by the removals above are
    # normalized too, then trim the ends.
    text = re.sub(r'\s+', ' ', text).strip()

    # Convert to lowercase
    return text.lower()

def clean_dataframe(df, column_name):
    """Run ``clean_text`` over every value of one DataFrame column.

    The column is overwritten on the frame that was passed in (no copy is
    made), and that same frame is returned.

    Args:
        df: DataFrame holding the text column.
        column_name: Name of the column to clean.

    Returns:
        ``df`` itself, with ``column_name`` replaced by the cleaned values.
    """
    cleaned_column = df[column_name].apply(clean_text)
    df[column_name] = cleaned_column
    return df

In [None]:
# Clean the 'text' column of both frames with clean_text; the frames are updated in place.
df_main = clean_dataframe(df_main, 'text')
df_unlabeled = clean_dataframe(df_unlabeled, 'text')

In [None]:
# Encode the stance labels as integer class ids in a new 'stance_enc' column.
# The fitted encoder is kept in `le`.
le = LabelEncoder()

df_main["stance_enc"] = le.fit_transform(df_main["stance"])
df_main.head()

#### BERT

In [None]:
# Run Python garbage collection and release PyTorch's cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Draw a small class-balanced seed set: 33 training and 11 validation rows per class.
# Validation rows are sampled only from rows NOT already picked for training, so the
# two sets cannot overlap (sampling both independently from df_main leaked training
# rows into validation). The fixed seed makes the split reproducible.
# Requires a unique index on df_main and at least 44 rows in every class.
# Both results keep df_main's original row index.
loaded_train = df_main.groupby('stance_enc').sample(n=33, random_state=42)
loaded_val = (
    df_main.drop(index=loaded_train.index)
    .groupby('stance_enc')
    .sample(n=11, random_state=42)
)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Labeled text dataset whose texts are BERT-tokenized once, at construction.

    Each item is a ``(encoding, label)`` pair: the tokenizer output for one
    text (padded/truncated to 512 tokens, PyTorch tensors) and the label
    wrapped in a NumPy array.
    """

    def __init__(self, df, labels_col, text_col, pretrained='bert-large-uncased'):
        """
        Args:
            df: DataFrame holding the texts and their labels.
            labels_col: Name of the label column.
            text_col: Name of the text column.
            pretrained: Name of the pretrained tokenizer to load.
        """
        self.labels_col = labels_col
        self.text_col = text_col
        self.labels = df[labels_col].to_list()
        self.tokenizer = BertTokenizer.from_pretrained(pretrained)

        # Tokenize text by text; every encoding is padded to the same length.
        encodings = []
        for raw_text in df[text_col]:
            encodings.append(
                self.tokenizer(
                    raw_text,
                    padding='max_length',
                    max_length=512,
                    truncation=True,
                    return_tensors='pt',
                )
            )
        self.texts = encodings

    def classes(self):
        """Return the list of labels, one entry per sample."""
        return self.labels

    def __len__(self):
        """Number of samples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) at ``idx`` as a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the stored tokenizer output at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return ``(encoding, label)`` for the sample at ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

    
class BertClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    ``forward`` returns raw, unnormalized class scores (logits).
    ``nn.CrossEntropyLoss`` applies log-softmax itself, so feeding it softmax
    probabilities (as this class used to) squashes the scores twice and
    weakens the gradients. ``argmax`` over the logits selects the same class
    as ``argmax`` over the probabilities.
    """

    def __init__(self, num_classes, dropout=0.5, pretrained='bert-base-uncased'):
        """
        Args:
            num_classes: Number of output classes.
            dropout: Dropout probability applied to the pooled BERT output.
            pretrained: Name of the pretrained BERT weights to load.
        """
        super(BertClassifier, self).__init__()
        self.num_classes = num_classes
        self.bert = BertModel.from_pretrained(pretrained)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(self.bert.config.hidden_size, self.num_classes)
        # Not applied in forward(); kept so callers can turn logits into
        # probabilities explicitly with ``model.softmax(logits)``.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_id, mask):
        """Return class logits for a batch.

        Args:
            input_id: Token ids passed to BERT as ``input_ids``.
            mask: Attention mask passed to BERT as ``attention_mask``.

        Returns:
            The linear head's output for the pooled BERT representation
            (one row of ``num_classes`` scores per sample).
        """
        outputs = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        # With return_dict=False the second element is the pooled output.
        pooled_output = outputs[1]
        dropout_output = self.dropout(pooled_output)
        return self.linear(dropout_output)
    
class Train():
    """Fine-tune a classifier, tracking loss and accuracy on train/validation data.

    After every epoch the learning curves are re-plotted and saved to
    './plots/train-val-loss-accs.png'. When training finishes, the weights
    are written to './bert_base.pth' (``retrain`` false) or
    './bert_retrained.pth' (``retrain`` true).
    """

    def __init__(self, model, train_data, train_col, labels_col, val_data, criterion, optimizer, epochs, batch_size, retrain, model_path):
        """
        Args:
            model: Module called as ``model(input_id, mask)``.
            train_data: DataFrame with the training rows.
            train_col: Name of the text column (used for both frames).
            labels_col: Name of the label column (used for both frames).
            val_data: DataFrame with the validation rows.
            criterion: Loss called as ``criterion(output, labels)``.
            optimizer: Optimizer already bound to ``model``'s parameters.
            epochs: Number of passes over the training data.
            batch_size: Batch size for both data loaders.
            retrain: If true, start from the weights stored at ``model_path``.
            model_path: Checkpoint to load when ``retrain`` is true.
        """
        self.model = model
        self.train_data = train_data
        self.train_col = train_col
        self.labels_col = labels_col
        self.val_data = val_data
        self.criterion = criterion
        self.optimizer = optimizer
        self.epochs = epochs
        self.batch_size = batch_size
        self.retrain = retrain
        self.model_path = model_path

    def train_plots(self, epochs_list, train_losses, val_losses, train_accs, val_accs):
        """Plot loss and accuracy curves side by side, save them, then show them.

        Args:
            epochs_list: Epoch numbers for the x axis.
            train_losses, val_losses: Loss per epoch.
            train_accs, val_accs: Accuracy per epoch, plotted as given.
        """
        import os

        try:
            get_ipython().run_line_magic('matplotlib', 'inline')
        except NameError:
            # Not running under IPython; use the active matplotlib backend.
            pass
        sns.set(rc={'figure.figsize':(19, 9)})
        fig, ax = plt.subplots(1,2)
        ax[0].plot(epochs_list, train_losses, label = 'Training Loss', marker='o')
        ax[0].plot(epochs_list, val_losses, label = 'Validation Loss', marker='o')
        ax[0].set_title('Loss Values')
        ax[0].set_xlabel('Epoch')
        ax[0].set_ylabel('Value')
        ax[1].plot(epochs_list, train_accs, label = 'Training Accuracy', marker='o')
        ax[1].plot(epochs_list, val_accs, label = 'Validation Accuracy', marker='o')
        ax[1].set_title('Accuracy Values')
        ax[1].set_xlabel('Epoch')
        ax[1].set_ylabel('Percent (%)')
        ax[0].legend()
        ax[1].legend()
        # savefig does not create missing directories.
        os.makedirs('./plots', exist_ok=True)
        fig.savefig('./plots/train-val-loss-accs.png')
        plt.show()
        # This runs once per epoch; close the figure so figures do not pile up.
        plt.close(fig)

    def start_train(self):
        """Run the training loop, plot the curves each epoch and save the weights."""
        train, val = Dataset(self.train_data, self.labels_col, self.train_col), Dataset(self.val_data, self.labels_col, self.train_col)
        train_dataloader = torch.utils.data.DataLoader(train, self.batch_size, shuffle = True)
        val_dataloader = torch.utils.data.DataLoader(val, self.batch_size)

        use_cuda = torch.cuda.is_available()
        print('CUDA:', use_cuda)
        device = torch.device('cuda' if use_cuda else 'cpu')

        if use_cuda:
            self.model = self.model.to(device)
            self.criterion = self.criterion.to(device)

        # The scheduler steps once per batch, so it spans batches * epochs steps.
        total_steps = len(train_dataloader)*self.epochs
        scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=0, num_training_steps=total_steps)
        if self.retrain:
            self.model.load_state_dict(torch.load(self.model_path, map_location = 'cpu'))
        train_losses = []
        val_losses = []
        train_accs = []
        val_accs = []
        epochs_list = []

        for epoch_num in range(self.epochs):
            print('\n====================== Epoch {:} / {:} =====================\n'.format(epoch_num + 1, self.epochs))
            total_loss_train = 0
            total_acc_train = 0
            self.model.train()
            for train_input, train_label in tqdm(train_dataloader, desc=f"Training Epoch {epoch_num + 1}"):
                train_label = train_label.type(torch.LongTensor)
                train_label = train_label.to(device)
                # Each sample is tokenized with return_tensors='pt', so batches
                # carry a singleton dimension at position 1; drop it from the
                # mask as well as from the ids.
                mask = train_input['attention_mask'].squeeze(1).to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = self.model(input_id, mask)

                batch_loss = self.criterion(output, train_label)
                total_loss_train += batch_loss.item()

                acc_tr = (output.argmax(dim = 1) == train_label).sum().item()
                total_acc_train += acc_tr

                self.model.zero_grad()
                batch_loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                self.optimizer.step()
                scheduler.step()

            # Validation
            total_loss_val = 0
            total_acc_val = 0

            self.model.eval()

            with torch.no_grad():
                for val_input, val_label in val_dataloader:
                    val_label = val_label.type(torch.LongTensor)
                    val_label = val_label.to(device)

                    mask = val_input['attention_mask'].squeeze(1).to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = self.model(input_id, mask)

                    batch_loss = self.criterion(output, val_label)
                    total_loss_val += batch_loss.item()

                    acc_val = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc_val

            # Losses are averaged per batch, accuracies per sample (fractions in [0, 1]).
            train_loss = total_loss_train / len(train_dataloader)
            val_loss = total_loss_val / len(val_dataloader)
            train_acc = total_acc_train / len(train)
            val_acc = total_acc_val / len(val)

            train_losses.append(train_loss)
            val_losses.append(val_loss)

            train_accs.append(train_acc)
            val_accs.append(val_acc)
            epochs_list.append(epoch_num + 1)

            # The accuracy panel is labeled 'Percent (%)', so scale the
            # fractions to percent for plotting only.
            self.train_plots(
                epochs_list,
                train_losses,
                val_losses,
                [acc * 100 for acc in train_accs],
                [acc * 100 for acc in val_accs],
            )

            print(f'Epoch {epoch_num + 1} / {self.epochs}')
            print(f'Train Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
            print(f'Val Loss: {val_loss:.3f} | Val Acc: {val_acc * 100:.2f}%')

        # Save the final model
        if not self.retrain:
            torch.save(self.model.state_dict(), './bert_base.pth')
            print('Base model has been saved!')
        else:
            torch.save(self.model.state_dict(), './bert_retrained.pth')
            print('Retrained model has been saved!')


class UnseenDataset(torch.utils.data.Dataset):
    """Unlabeled text dataset; all texts are tokenized in one call at construction.

    Each item is a dict mapping every tokenizer output key to that sample's
    row of the corresponding tensor.
    """

    def __init__(self, df, text_col, pretrained='bert-large-uncased'):
        """
        Args:
            df: DataFrame holding the texts.
            text_col: Name of the text column.
            pretrained: Name of the pretrained tokenizer to load.
        """
        self.tokenizer = BertTokenizer.from_pretrained(pretrained)
        raw_texts = df[text_col].tolist()
        self.texts = self.tokenizer(
            raw_texts,
            padding='max_length',
            max_length=512,
            truncation=True,
            return_tensors='pt',
        )

    def __len__(self):
        """Number of samples, taken from the first dimension of 'input_ids'."""
        input_ids = self.texts['input_ids']
        return input_ids.shape[0]

    def __getitem__(self, idx):
        """Return one sample as ``{key: tensor[idx]}`` over all tokenizer outputs."""
        sample = {}
        for key, tensor in self.texts.items():
            sample[key] = tensor[idx]
        return sample

class Test():
    """Evaluate a classifier on labeled data: report, confusion matrix and ROC.

    NOTE(review): a second ``Test`` class defined further down in the notebook
    replaces this one once its cell has run.
    """

    def __init__(self, model, test_data, labels_col, batch_size, is_multi_label=False, text_col='text'):
        """
        Args:
            model: Module called as ``model(input_id, mask)``.
            test_data: DataFrame with the evaluation rows.
            labels_col: Name of the label column.
            batch_size: Batch size of the evaluation loader.
            is_multi_label: True when there are more than two classes; a
                one-vs-rest ROC AUC is printed instead of drawing a ROC curve.
            text_col: Name of the text column (needed to build the dataset).
        """
        self.model = model
        self.test_data = test_data
        self.labels_col = labels_col
        self.batch_size = batch_size
        self.is_multi_label = is_multi_label
        self.text_col = text_col

    def plot_metrics(self, labels, outputs):
        """Print the classification report and show confusion matrix / ROC.

        Args:
            labels: List of per-batch NumPy label arrays.
            outputs: List of per-batch model output tensors.
        """
        import os
        # Imported here because the notebook's import cell only brings in
        # confusion_matrix and classification_report.
        from sklearn.metrics import ConfusionMatrixDisplay, auc, roc_auc_score, roc_curve

        labels = np.concatenate(labels)
        # NOTE(review): softmax is applied here, so the model is expected to
        # return raw scores — confirm against the model's forward().
        probs = F.softmax(torch.cat(outputs), dim=1).cpu().numpy()
        # Predict the highest-scoring class; thresholding column 1 at 0.5 can
        # never produce class ids above 1.
        y_pred = probs.argmax(axis=1)

        if self.is_multi_label:
            roc_auc = roc_auc_score(labels, probs, multi_class='ovr')
            print('ROC AUC Score: ', roc_auc)

        print('\nClassification Report:\n', classification_report(labels, y_pred))
        cm = confusion_matrix(labels, y_pred)

        if not self.is_multi_label:
            # roc_curve handles binary targets only; column 1 is the score of class 1.
            fpr, tpr, _ = roc_curve(labels, probs[:, 1])
            roc_auc = auc(fpr, tpr)

            plt.figure(figsize=(9, 6))
            plt.plot(fpr, tpr, 'b', label=f'AUC = {roc_auc:.2f}')
            plt.legend(loc='lower right')
            plt.plot([0, 1], [0, 1],'r--')
            plt.xlim([0, 1])
            plt.ylim([0, 1])
            plt.ylabel('True Positive Rate')
            plt.xlabel('False Positive Rate')
            plt.title('Receiver Operating Characteristic')
            # savefig does not create missing directories.
            os.makedirs('./plots', exist_ok=True)
            plt.savefig('./plots/roc-curve.png')
            plt.show()

        cm_disp = ConfusionMatrixDisplay(confusion_matrix=cm)
        cm_disp.plot()

    def start_test(self):
        """Run the model over the test data, then report metrics and accuracy."""
        test_dataset = Dataset(self.test_data, self.labels_col, self.text_col)
        test_dataloader = torch.utils.data.DataLoader(test_dataset, self.batch_size)

        use_cuda = torch.cuda.is_available()
        device = torch.device('cuda' if use_cuda else 'cpu')
        self.model = self.model.to(device)
        self.model.eval()
        total_acc_test = 0
        test_outputs = []
        test_labels = []

        with torch.no_grad():
            for test_input, test_label in test_dataloader:
                test_label = test_label.type(torch.LongTensor).to(device)
                # Drop the singleton dimension added by per-sample tokenization.
                mask = test_input['attention_mask'].squeeze(1).to(device)
                input_id = test_input['input_ids'].squeeze(1).to(device)

                output = self.model(input_id, mask)
                acc = (output.argmax(dim=1) == test_label).sum().item()
                total_acc_test += acc

                test_labels.append(test_label.cpu().numpy())
                test_outputs.append(output.cpu())

        self.plot_metrics(test_labels, test_outputs)
        print(f'Test Accuracy: {total_acc_test / len(self.test_data) * 100:.3f} %')


class Predict():
    """Label unseen texts with a saved checkpoint and write them to 'predicted.csv'."""

    def __init__(self, model, model_path, unseen_data, labels_col, text_col, original_idx, batch_size):
        """
        Args:
            model: Module called as ``model(input_id, mask)``.
            model_path: Checkpoint whose state dict is loaded into ``model``.
            unseen_data: DataFrame with the texts to label; it receives a new
                ``labels_col`` column.
            labels_col: Name of the column that stores the predicted class ids.
            text_col: Name of the text column.
            original_idx: Name of the identifier column copied to the output.
            batch_size: Batch size of the prediction loader.
        """
        self.model = model
        self.model_path = model_path
        self.unseen_data = unseen_data
        self.labels_col = labels_col
        self.text_col = text_col
        self.original_idx = original_idx
        self.batch_size = batch_size

    def start_predict(self):
        """Predict a class for every row and save id, text and prediction to CSV."""
        dataset = UnseenDataset(self.unseen_data, self.text_col)
        # shuffle=False keeps predictions aligned with the DataFrame rows.
        loader = torch.utils.data.DataLoader(dataset, self.batch_size, shuffle=False)

        cuda_available = torch.cuda.is_available()
        print('CUDA:', cuda_available)
        device = torch.device('cuda' if cuda_available else 'cpu')

        if cuda_available:
            self.model = self.model.to(device)

        state_dict = torch.load(self.model_path, map_location=device)
        self.model.load_state_dict(state_dict)
        self.model.eval()

        batch_outputs = []
        with torch.no_grad():
            for batch in loader:
                mask = batch['attention_mask'].to(device)
                input_id = batch['input_ids'].squeeze(1).to(device)
                scores = self.model(input_id, mask)
                batch_outputs.append(scores.cpu().numpy())

        all_scores = np.concatenate(batch_outputs, axis=0)
        predicted_ids = all_scores.argmax(axis=1).astype(int)
        # Written onto the caller's DataFrame, not onto a copy.
        self.unseen_data[self.labels_col] = predicted_ids

        output_columns = [self.original_idx, self.text_col, self.labels_col]
        self.unseen_data[output_columns].to_csv('predicted.csv', index=False)

In [None]:
# Base model setup: a 3-class classifier on the class's default pretrained encoder.
# NOTE(review): nn.CrossEntropyLoss applies log-softmax internally, so it expects
# the model to return raw logits — confirm against BertClassifier.forward.
model = BertClassifier(num_classes = 3)
optimizer = AdamW(model.parameters(), lr = 1e-5, eps = 1e-8) #1e-3 bad, 1e-5 better, 2e-3 bad # 2e-5  good for binary
loss_func = nn.CrossEntropyLoss()   
epochs = 5
batch_size = 16

In [None]:
# Train the base model on the seed sets. retrain=False: no checkpoint is loaded
# (the '' model path is unused) and the weights are saved to './bert_base.pth'.
print('\nTrain/Validation: \n', len(loaded_train), '/', len(loaded_val))
train = Train(model, loaded_train, 'text', 'stance_enc', loaded_val, loss_func, optimizer, epochs, batch_size, False, '') 
train.start_train()

In [None]:
# unseen_data is another name for the df_unlabeled frame (not a copy); preview it.
unseen_data = df_unlabeled
unseen_data.head()

In [None]:
# Pseudo-label the unlabeled pool with the base checkpoint. Predict writes the
# predicted class ids into a 'stance_enc' column of unseen_data (the same object as
# df_unlabeled) and saves the 'index1', 'text' and 'stance_enc' columns to
# 'predicted.csv'.
model_path = './bert_base.pth'
unseen_data = df_unlabeled

predict = Predict(model, model_path, unseen_data, 'stance_enc', 'text', 'index1', batch_size)
predict.start_predict()

In [None]:
# Reload the pseudo-labeled rows written by the prediction step.
predicted  = pd.read_csv('predicted.csv')
predicted

In [None]:
# Number of pseudo-labeled rows per predicted class.
predicted['stance_enc'].value_counts()

In [None]:
# Stack the pseudo-labeled rows on top of the hand-labeled rows (same three columns).
# NOTE(review): each frame keeps its own row index, so index labels may repeat in
# combined_data — confirm nothing downstream relies on unique labels.
combined_data = pd.concat([predicted, df_main[['index1', 'text', 'stance_enc']]], axis = 0)
combined_data.head()

In [None]:
# Number of rows per class in the combined data.
combined_data['stance_enc'].value_counts()

In [None]:
# Re-apply the text cleaning to the combined frame.
# NOTE(review): both source frames were already cleaned in an earlier cell, so this
# second pass looks redundant — confirm before removing.
combined_data = clean_dataframe(combined_data, 'text')
combined_data.head()

In [None]:
# Shuffle once with a fixed seed, then cut 80% / 10% / 10% into train / val / test.
# Plain positional slices give the same three pieces as np.split at these cut
# points, without passing a DataFrame to np.split (deprecated in recent pandas).
shuffled_data = combined_data.sample(frac=1, random_state=42)
train_end = int(.8 * len(shuffled_data))
val_end = int(.9 * len(shuffled_data))

df_train = shuffled_data.iloc[:train_end]
df_val = shuffled_data.iloc[train_end:val_end]
df_test = shuffled_data.iloc[val_end:]

print(len(df_train))
print(len(df_val))
print(len(df_test))

In [None]:
# Number of rows per class in the training split.
df_train['stance_enc'].value_counts()

In [None]:
# Second stage: point the loader names at the new splits and build a fresh model,
# optimizer and loss. model_path is the base checkpoint that retraining starts from.
loaded_train = df_train
loaded_val = df_val
loaded_test = df_test

model = BertClassifier(num_classes = 3)
optimizer = AdamW(model.parameters(), lr = 1e-5, eps = 1e-8)
loss_func = nn.CrossEntropyLoss()   
epochs = 5
batch_size =  16
model_path = './bert_base.pth'

In [None]:
# Retrain on the combined data. retrain=True: Train first loads the weights stored at
# model_path and saves the result to './bert_retrained.pth' when it finishes.
print('\nTrain/Validation: \n', len(loaded_train), '/', len(loaded_val))
train = Train(model, loaded_train, 'text', 'stance_enc', loaded_val, loss_func, optimizer, epochs, batch_size, True, model_path) 
train.start_train()

In [None]:
class Test():
    """Evaluate a classifier on labeled data: classification report, confusion
    matrix heatmap and overall accuracy."""

    def __init__(self, model, test_data, labels_col, train_col, batch_size, multi_label):
        """
        Args:
            model: Module called as ``model(input_id, mask)``.
            test_data: DataFrame with the evaluation rows.
            labels_col: Name of the label column.
            train_col: Name of the text column.
            batch_size: Batch size of the evaluation loader.
            multi_label: Stored on the instance; not read by the methods below.
        """
        self.model = model
        self.test_data = test_data
        self.batch_size = batch_size
        self.labels_col = labels_col
        self.train_col = train_col
        self.multi_label = multi_label

    @staticmethod
    def _predicted_classes(outputs):
        """Return the highest-scoring class id per row as a NumPy array."""
        return outputs.argmax(dim=1).cpu().numpy()

    def plot_metrics(self, labels, outputs):
        """Print the classification report and draw the confusion matrix.

        Args:
            labels: List of per-batch label tensors.
            outputs: List of per-batch model output tensors
                (one row per sample, one column per class).
        """
        labels = torch.cat(labels, dim = 0).cpu().numpy()
        outputs = torch.cat(outputs, dim = 0)

        # Take the arg-max over all classes. Thresholding the class-1 score at
        # 0.5 only ever yields 0 or 1, so any further class was never predicted.
        y_pred = self._predicted_classes(outputs)

        # Classification Report
        print('\nClassification Report:\n', classification_report(labels, y_pred))

        # Confusion Matrix
        cm = confusion_matrix(labels, y_pred)
        cm_df = pd.DataFrame(cm)
        sns.heatmap(cm_df, annot=True, fmt=".1f")

    def start_test(self):
        """Run the model over the test data and report metrics and accuracy."""
        test = Dataset(self.test_data, self.labels_col, self.train_col)
        test_dataloader = torch.utils.data.DataLoader(test, self.batch_size)

        use_cuda = torch.cuda.is_available()
        device = torch.device('cuda' if use_cuda else 'cpu')
        self.model = self.model.to(device)

        self.model.eval()
        total_acc_test = 0
        test_outputs = []
        test_labels = []
        with torch.no_grad():
            for test_input, test_label in test_dataloader:
                test_label = test_label.type(torch.LongTensor).to(device)
                # Drop the singleton dimension added by per-sample tokenization.
                mask = test_input['attention_mask'].squeeze(1).to(device)
                input_id = test_input['input_ids'].squeeze(1).to(device)

                output = self.model(input_id, mask)
                acc = (output.argmax(dim = 1) == test_label).sum().item()
                total_acc_test += acc

                test_labels.append(test_label.cpu())
                test_outputs.append(output.cpu())
        self.plot_metrics(test_labels, test_outputs)
        # The accuracy was accumulated above but previously never reported.
        print(f'Test Accuracy: {total_acc_test / len(test) * 100:.3f} %')

In [None]:
# Evaluate the retrained model on the held-out test split.
test = Test(model, loaded_test, 'stance_enc', 'text', batch_size, True)
test.start_test()