<a href="https://colab.research.google.com/github/Vic-the-Legend/Transformers-and-NER/blob/main/BERT_Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip install transformers
!pip install transformers[torch]

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/519.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m409.6/519.6 kB[0m [31m12.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-

In [None]:
from datasets import load_dataset, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments, pipeline
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import gc
import torch
from scipy import special

In [None]:
# Number of output classes for the sequence-classification head.
num_labels = 2

# Use the GPU when one is visible and fall back to the CPU otherwise, so this
# cell does not crash on a CPU-only runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pretrained DistilBERT checkpoint with a classification head sized to
# `num_labels`, then move it to the selected device.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=num_labels
).to(device)

In [None]:
# Download/load the 'sentences_50agree' configuration of the financial_phrasebank
# dataset (`load_dataset` is already imported in the notebook's import cell).
phrases = load_dataset('financial_phrasebank', 'sentences_50agree')

# Materialize the 'train' split as a pandas DataFrame. `to_pandas()` returns a
# new frame and leaves the output format of `phrases['train']` untouched, unlike
# calling `set_format("pandas")` on the split itself.
data = phrases['train'].to_pandas()

# Preview the first rows instead of displaying the whole frame.
data.head()

In [None]:
# Collapse the label column to two classes: every label >= 1 becomes 1,
# everything else becomes 0.
at_least_one = data['label'] >= 1
data["label"] = np.where(at_least_one, 1, 0)

# Bar chart of how many rows fall into each of the two classes.
data["label"].value_counts().plot.bar()

In [None]:
# Fixed seed so both stratified splits are identical on every Restart & Run All.
SEED = 42

# First hold out 20% of the rows as the test set, stratified on the label.
df_train_val, df_test = train_test_split(
    data, shuffle=True, test_size=0.2, stratify=data['label'], random_state=SEED
)

# Then carve the validation set out of the remaining 80%:
# 0.125 * 0.8 = 10% of the full data, leaving 70% for training.
df_train, df_val = train_test_split(
    df_train_val, test_size=0.125, stratify=df_train_val['label'], random_state=SEED
)

print(df_train.shape, df_test.shape, df_val.shape)

In [None]:
# Load the tokenizer that belongs to the same checkpoint the model is built from.
tokenizer_checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
# Every sentence is truncated/padded to exactly this many tokens.
MAX_LENGTH = 128


def tokenize_batch(batch):
    """Tokenize the 'sentence' column of a batch to fixed-length inputs.

    Args:
        batch: mapping with a 'sentence' entry, as passed by `Dataset.map`
            when `batched=True`.

    Returns:
        The tokenizer output for the batch, truncated and padded to MAX_LENGTH.
    """
    return tokenizer(
        batch['sentence'],
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
    )


def build_torch_dataset(frame):
    """Turn a DataFrame split into a tokenized Dataset that yields torch tensors.

    Args:
        frame: pandas DataFrame with 'sentence' and 'label' columns.

    Returns:
        A `Dataset` built from `frame`, tokenized with `tokenize_batch`, whose
        output format is restricted to 'input_ids', 'attention_mask' and 'label'
        as torch tensors.
    """
    dataset = Dataset.from_pandas(frame).map(tokenize_batch, batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    return dataset


# One helper call per split replaces three copy-pasted convert/map/format chains.
dataset_train_DistilBERT = build_torch_dataset(df_train)
dataset_val_DistilBERT = build_torch_dataset(df_val)
dataset_test_DistilBERT = build_torch_dataset(df_test)

In [None]:
# Collect unreachable Python objects first so that any GPU tensors they hold are
# handed back to PyTorch's caching allocator, and only then ask the allocator to
# release its unused cached blocks. In the opposite order, memory freed by the
# collection would remain cached.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Size the classification head from the number of distinct values in the
# (already binarized) label column.
num_labels = len(data["label"].unique())

# Use the GPU when one is visible and fall back to the CPU otherwise, so this
# cell does not crash on a CPU-only runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Rebinds `model` to a fresh pretrained DistilBERT classifier; this is the
# instance that gets fine-tuned below.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=num_labels
).to(device)

In [None]:
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: object exposing `label_ids` (the true labels) and `predictions`
            (per-class scores); the arg-max over the last axis of `predictions`
            is taken as the predicted class.

    Returns:
        dict with the keys "accuracy" and "f1".
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat, average="weighted"),
    }

batch_size = 32

# `df_train.shape[0] // batch_size` is the number of full batches in one pass
# over the training frame, so the training loss is logged about once per epoch
# (on a single device). Clamp to at least 1: with fewer training rows than
# `batch_size` the division yields 0, which is not a usable logging interval.
logging_steps = max(1, df_train.shape[0] // batch_size)

# weight_decay, save_strategy, load_best_model_at_end and metric_for_best_model
# are not set here, so they stay at the TrainingArguments defaults.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
args_DistilBERT = TrainingArguments(
    output_dir='temp/',
    num_train_epochs=5,
    learning_rate=2e-5,
    warmup_steps=0,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='epoch',  # run validation at the end of every epoch
    logging_steps=logging_steps,
)

# Everything the Trainer needs: the model to fine-tune, the run configuration,
# the training and validation datasets, and the metric callback.
trainer_kwargs = {
    "model": model,
    "args": args_DistilBERT,
    "train_dataset": dataset_train_DistilBERT,
    "eval_dataset": dataset_val_DistilBERT,
    "compute_metrics": compute_metrics,
}
trainer_DistilBERT = Trainer(**trainer_kwargs)

# Fine-tune; the call's return value is displayed as the cell output.
trainer_DistilBERT.train()

In [None]:
# Evaluate the fine-tuned model on the eval dataset given to the Trainer
# (the validation split) and display the resulting metrics.
val_metrics = trainer_DistilBERT.evaluate()
val_metrics

In [None]:
def plot_confusion_matrix(y_pred, y_true, labels):
    """Draw a confusion matrix normalized over the true labels.

    Note the argument order: predictions come first, ground truth second.

    Args:
        y_pred: predicted class indices.
        y_true: ground-truth class indices.
        labels: display names used for the matrix tick labels.
    """
    normalized_cm = confusion_matrix(y_true, y_pred, normalize="true")
    _, axis = plt.subplots(figsize=(6, 6))
    ConfusionMatrixDisplay(
        confusion_matrix=normalized_cm,
        display_labels=labels,
    ).plot(cmap="Blues", values_format=".2f", ax=axis, colorbar=False)
    axis.set_title("Normalized confusion matrix")
    plt.show()

In [None]:
# Run the fine-tuned model over the held-out test split; the result's
# `predictions` attribute is used by the confusion-matrix cell that follows.
prediction = trainer_DistilBERT.predict(test_dataset=dataset_test_DistilBERT)

In [None]:
# Predicted class per test row = index of the highest score along axis 1.
test_pred_classes = np.argmax(prediction.predictions, axis=1)

# Compare against the labels of df_test, the frame the test dataset was built from.
class_names = ["Neg", "Non-Neg"]
plot_confusion_matrix(test_pred_classes, df_test['label'], class_names)

### Save model

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory
# so both can later be restored with `from_pretrained(save_directory)`.
save_directory = "two_label/"
model.save_pretrained(save_directory)
# Last expression of the cell: the value returned by the tokenizer save is
# displayed as the cell output.
tokenizer.save_pretrained(save_directory)

### Test saved model

In [None]:
# Directory of the saved model that was fine-tuned on the financial phrase data.
save_directory = "two_label/"

# Reload model and tokenizer from disk to check that the saved artifacts work
# on their own (this rebinds `model` and `tokenizer`).
model = DistilBertForSequenceClassification.from_pretrained(save_directory)
tokenizer = DistilBertTokenizer.from_pretrained(save_directory)

# Run the pipeline on the first GPU when one is available, otherwise on the CPU
# (-1); without an explicit device the pipeline stays on the CPU.
pipeline_device = 0 if torch.cuda.is_available() else -1

# No `top_k` is passed, so each input yields only its single best label;
# pass top_k=None to get the scores of all labels instead.
trained_model = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=pipeline_device,
)

In [None]:
# Classify the raw test sentences with the reloaded pipeline; the next cell
# reads the 'label' entry of each returned prediction.
test_sentences = df_test["sentence"].tolist()
predictions = trained_model.predict(test_sentences)

In [None]:
# Translate each predicted label name back to its class index through the
# model's own label2id table. Parsing the last character of the name only
# works for single-digit "LABEL_<n>"-style names and breaks with ten or more
# classes or with custom label names.
label2id = trained_model.model.config.label2id
indices = [int(label2id[pred['label']]) for pred in predictions]

# Confusion matrix of the reloaded pipeline's predictions on the test split.
plot_confusion_matrix(indices, df_test['label'], ["Neg", "Non-Neg"])