In [None]:
!pip install transformers
!pip install jax
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.optim import Adam
from tqdm import tqdm
from transformers import AutoTokenizer, BertModel, BertTokenizer

# Pretrained checkpoints available for fine-tuning; `checkpoint` selects the one in use.
bert_checkpoint = 'bert-base-cased'
roberta_checkpoint = 'roberta-base'

checkpoint = bert_checkpoint

# The CSV is expected to provide a 'category' column (used here) and a 'text'
# column (used when the dataset is tokenized).
datapath = "bbc_train.csv"
df = pd.read_csv(datapath)

# Bar chart of the number of rows per category.
df.groupby(['category']).size().plot.bar()

# AutoTokenizer picks the tokenizer class that belongs to `checkpoint`, so the
# tokenizer keeps matching the model when `checkpoint` is switched to
# `roberta_checkpoint` (a hard-coded BertTokenizer would not).
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Category name -> integer class id used as the training target.
labels = {'business':0,
          'sport':1,
          'politics':2
          # 'entertainment':3,
          # 'tech':4,
          }

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 14.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 54.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 77.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


FileNotFoundError: ignored

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized texts with integer class ids.

    Relies on the module-level ``labels`` mapping (category name -> id) and the
    module-level ``tokenizer``. Every text is tokenized once, up front, in
    ``__init__``.
    """

    def __init__(self, df):
        """Encode ``df['category']`` as ids and tokenize ``df['text']``.

        Each text is padded/truncated to 256 tokens and returned as PyTorch
        tensors (``return_tensors="pt"``).
        """
        # Labels are resolved first; an unknown category raises KeyError here.
        self.labels = list(map(labels.__getitem__, df['category']))

        encoded = []
        for text in df['text']:
            encoded.append(
                tokenizer(
                    text,
                    padding='max_length',
                    max_length=256,
                    truncation=True,
                    return_tensors="pt",
                )
            )
        self.texts = encoded

    def classes(self):
        """Return the list of integer class ids, one per sample."""
        return self.labels

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) stored at ``idx`` wrapped in a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(tokenized_text, label)`` pair for ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)


In [None]:
!pip install torchmetrics

In [None]:
from transformers import BertTokenizer, BertModel, AutoModelForSequenceClassification
from torch import nn
#import evaluate as metric_evaluate
import torch.nn.functional as F
from torchmetrics.classification import MulticlassF1Score, MulticlassROC, MulticlassAUROC, MulticlassRecall

class TextClassifier(nn.Module):
    """Thin wrapper around a pretrained sequence-classification model.

    Attributes:
        bert: model loaded with ``AutoModelForSequenceClassification`` for the
            given checkpoint and number of labels.
        num_of_labels: number of target classes the model was built for.
    """

    def __init__(self, checkpoint, num_of_classes, dropout=0.5):
        """Load the pretrained model.

        Args:
            checkpoint: name or path passed to ``from_pretrained``.
            num_of_classes: number of output classes (``num_labels``).
            dropout: accepted for interface compatibility only; it is not used
                by this class.
        """
        super().__init__()

        self.bert = AutoModelForSequenceClassification.from_pretrained(
            checkpoint, num_labels=num_of_classes)
        self.num_of_labels = num_of_classes

    def forward(self, input_id, mask, labels=None):
        """Run the wrapped model and return its output object unchanged.

        Args:
            input_id: token ids, forwarded as ``input_ids``.
            mask: attention mask, forwarded as ``attention_mask``.
            labels: optional targets forwarded to the wrapped model. Defaults
                to ``None`` so the classifier can also be called for pure
                inference when no targets are available.

        Returns:
            The wrapped model's output (callers in this notebook read
            ``.logits`` from it).
        """
        return self.bert(input_ids=input_id, attention_mask=mask,
                         labels=labels, return_dict=True)

def _new_epoch_metrics(num_classes, device):
    """Create fresh torchmetrics objects (recall, precision, F1, AUROC) on ``device``."""
    # Imported locally: the notebook's import cell does not bring in MulticlassPrecision.
    from torchmetrics.classification import MulticlassPrecision

    return {
        'recall': MulticlassRecall(num_classes=num_classes).to(device),
        'precision': MulticlassPrecision(num_classes=num_classes).to(device),
        'f1': MulticlassF1Score(num_classes=num_classes).to(device),
        'auc': MulticlassAUROC(num_classes=num_classes).to(device),
    }


def _run_epoch(model, dataloader, criterion, device, optimizer=None):
    """Run one pass over ``dataloader``; update weights only when ``optimizer`` is given.

    Returns a dict with the per-sample mean ``loss``, the accuracy ``acc`` and
    the epoch-level ``recall``, ``precision``, ``f1`` and ``auc``. All values
    are ``nan`` when the dataloader yields no samples.
    """
    training = optimizer is not None
    model.train(training)  # dropout on for training, off for validation
    metrics = _new_epoch_metrics(model.num_of_labels, device)

    loss_sum = 0.0
    correct = 0
    seen = 0
    batches = tqdm(dataloader) if training else dataloader

    with torch.set_grad_enabled(training):
        for batch_input, batch_label in batches:
            batch_label = batch_label.to(device).long()
            # The tokenizer output carries an extra size-1 axis per sample;
            # squeeze(1) removes it and is a no-op if it is absent.
            mask = batch_input['attention_mask'].squeeze(1).to(device)
            input_id = batch_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask, batch_label)
            logits = output.logits

            # CrossEntropyLoss applies log-softmax itself, so it must be fed
            # the raw logits, not softmax probabilities.
            loss = criterion(logits, batch_label)

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            probabilities = F.softmax(logits.detach(), dim=1)
            batch_size = batch_label.size(0)

            # .item() detaches the value so the autograd graph is not kept alive.
            loss_sum += loss.item() * batch_size
            correct += (probabilities.argmax(dim=1) == batch_label).sum().item()
            seen += batch_size

            # Metrics are accumulated over the whole epoch and computed once,
            # instead of averaging unreliable per-batch values.
            for metric in metrics.values():
                metric.update(probabilities, batch_label)

    if seen == 0:
        return dict.fromkeys(('loss', 'acc', *metrics), float('nan'))

    stats = {'loss': loss_sum / seen, 'acc': correct / seen}
    for name, metric in metrics.items():
        stats[name] = metric.compute().item()
    return stats


def _format_epoch_stats(split, stats):
    """Render one split's statistics as ``'<split> Accuracy: ... | <split> Loss: ...'``."""
    return " | ".join([
        f"{split} Accuracy: {stats['acc']: .3f}",
        f"{split} Loss: {stats['loss']: .3f}",
        f"{split} Recall: {stats['recall']: .3f}",
        f"{split} Precision: {stats['precision']: .3f}",
        f"{split} F1-Score: {stats['f1']: .3f}",
        f"{split} AUC: {stats['auc']: .3f}",
    ])


def train(model, train_data, val_data, learning_rate, epochs, train_batch_size, val_batch_size):
    """Fine-tune ``model`` and print train/validation metrics after every epoch.

    Args:
        model: classifier exposing ``num_of_labels`` whose call
            ``model(input_ids, attention_mask, labels)`` returns an object
            with a ``.logits`` attribute.
        train_data: frame passed to ``Dataset`` for training (needs the
            'category' and 'text' columns that ``Dataset`` reads).
        val_data: frame passed to ``Dataset`` for validation.
        learning_rate: learning rate for the Adam optimizer.
        epochs: number of passes over the training data.
        train_batch_size: batch size of the (shuffled) training loader.
        val_batch_size: batch size of the validation loader.

    Returns:
        None. The model is updated in place, moved to CUDA when available and
        left in eval mode after the final validation pass.
    """
    train_dataset, val_dataset = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=train_batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=val_batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    for epoch_num in range(epochs):
        train_stats = _run_epoch(model, train_dataloader, criterion, device, optimizer)
        val_stats = _run_epoch(model, val_dataloader, criterion, device)

        print(
            f"Epochs: {epoch_num + 1} | "
            f"{_format_epoch_stats('Train', train_stats)} | "
            f"{_format_epoch_stats('Val', val_stats)}"
        )
                  
def evaluate(model, test_data, test_batch_size):
    """Evaluate ``model`` on ``test_data`` and print accuracy, recall, precision, F1 and AUC.

    Metrics are accumulated over the whole test set and computed once at the
    end, rather than averaged from per-batch values.

    Args:
        model: classifier exposing ``num_of_labels`` whose call
            ``model(input_ids, attention_mask, labels)`` returns an object
            with a ``.logits`` attribute.
        test_data: frame passed to ``Dataset`` (needs the 'category' and
            'text' columns that ``Dataset`` reads).
        test_batch_size: batch size of the test loader.

    Returns:
        ``(probabilities_list, logits_list)``: two lists with one tensor per
        batch, holding the softmax probabilities and the raw logits.
    """
    # Imported locally: the notebook's import cell does not bring in MulticlassPrecision.
    from torchmetrics.classification import MulticlassPrecision

    test_dataset = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()  # make sure dropout is disabled while scoring

    num_classes = model.num_of_labels
    metrics = {
        'Recall': MulticlassRecall(num_classes=num_classes).to(device),
        'Precision': MulticlassPrecision(num_classes=num_classes).to(device),
        'F1-Score': MulticlassF1Score(num_classes=num_classes).to(device),
        'AUC': MulticlassAUROC(num_classes=num_classes).to(device),
    }

    correct = 0
    seen = 0
    probabilities_list = []
    logits_list = []

    with torch.no_grad():
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device).long()
            # The tokenizer output carries an extra size-1 axis per sample;
            # squeeze(1) removes it and is a no-op if it is absent.
            mask = test_input['attention_mask'].squeeze(1).to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask, test_label)

            logits_list.append(output.logits)
            probabilities = F.softmax(output.logits, dim=1)
            probabilities_list.append(probabilities)

            correct += (probabilities.argmax(dim=1) == test_label).sum().item()
            seen += test_label.size(0)

            # Metric objects take the per-class probabilities and the integer targets.
            for metric in metrics.values():
                metric.update(probabilities, test_label)

    if seen:
        accuracy = correct / seen
        scores = {name: metric.compute().item() for name, metric in metrics.items()}
    else:
        # Nothing was evaluated; report nan instead of dividing by zero.
        accuracy = float('nan')
        scores = dict.fromkeys(metrics, float('nan'))

    print(
        f"Test Accuracy: {accuracy: .3f} | "
        f"Test Recall: {scores['Recall']: .3f} | "
        f"Test Precision: {scores['Precision']: .3f} | "
        f"Test F1-Score: {scores['F1-Score']: .3f} | "
        f"Test AUC: {scores['AUC']: .3f}"
    )
    return probabilities_list, logits_list



In [None]:
np.random.seed(42)

# Shuffle once with a fixed seed, then cut 80% / 10% / 10% with positional
# slices. Plain iloc slicing avoids np.split on a DataFrame, which goes through
# the deprecated DataFrame.swapaxes in recent pandas releases.
shuffled_df = df.sample(frac=1, random_state=42)
train_end = int(.8 * len(df))
val_end = int(.9 * len(df))

df_train = shuffled_df.iloc[:train_end]
df_val = shuffled_df.iloc[train_end:val_end]
df_test = shuffled_df.iloc[val_end:]

print(len(df_train), len(df_val), len(df_test))

In [None]:
# Derive the class count from the label mapping so the classifier head always
# matches the categories that are actually encoded.
num_of_classes = len(labels)
model = TextClassifier(checkpoint, num_of_classes=num_of_classes)

In [None]:
# Hyperparameters for the fine-tuning run.
EPOCHS, LR = 1, 1e-6
train_batch_size = val_batch_size = 2

train(
    model,
    df_train,
    df_val,
    learning_rate=LR,
    epochs=EPOCHS,
    train_batch_size=train_batch_size,
    val_batch_size=val_batch_size,
)

In [None]:
test_batch_size = 12
# evaluate() returns a (probabilities, logits) pair; unpack both so that
# probabilities_list is not accidentally bound to the whole tuple.
probabilities_list, logits_list = evaluate(model, df_test, test_batch_size)

In [None]:
# Score the held-out split; keep both the per-batch probabilities and logits.
probabilities_list, logits_list = evaluate(
    model=model,
    test_data=df_test,
    test_batch_size=test_batch_size,
)

In [None]:
probabilities_list_file = 'probabilities_list_file.txt'
# Open for writing (the default mode is read-only) and serialise each batch
# tensor as a nested Python list; a tensor cannot be concatenated with a str.
with open(probabilities_list_file, "w", encoding="utf-8") as f:
  for elem in probabilities_list:
    f.write(str(elem.tolist()) + "\n")

In [None]:
logits_list_file = 'logits_list_file.txt'
# Open for writing (the default mode is read-only) and serialise each batch
# tensor as a nested Python list; a tensor cannot be concatenated with a str.
with open(logits_list_file, "w", encoding="utf-8") as f:
  for elem in logits_list:
    f.write(str(elem.tolist()) + "\n")