In [None]:
from transformers import AutoTokenizer, DistilBertModel
import torchtext
import pandas as pd
import ast

In [None]:
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
# Install required packages
import nltk
nltk.download('punkt')
nltk.download('stopwords')
!pip install readability

In [None]:
# Reload imported modules automatically before each cell runs, so edits to local
# modules (e.g. fine_tune, imported further down) take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

import os
import json

import numpy as np
import gensim.downloader
from torch.utils.data import DataLoader

In [None]:
# Read the first 5000 rows of data2.csv, keeping only the 'content' and 'bias' columns.
conll2000_dataset = pd.read_csv('data2.csv', usecols=('content', 'bias'), nrows=5000)

# Replace every raw 'content' string with its list of NLTK word tokens.
conll2000_dataset["content"] = conll2000_dataset["content"].map(nltk.word_tokenize)

display(conll2000_dataset)

In [None]:
# Hugging Face checkpoint name shared by the tokenizer and the encoder below.
model_name = "distilbert-base-uncased"
# Tokenizer loaded from the same checkpoint as the encoder.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pretrained DistilBERT encoder; later wrapped by DistilBertForTokenClassification.
bert_model = DistilBertModel.from_pretrained(model_name)

In [None]:
# Take the word-token list stored in the first row of the dataset.
tokens = conll2000_dataset['content'].iloc[0]
print('Original tokens:', tokens, sep='\n')

# Look every token up in DistilBERT's vocabulary to obtain its ID.
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print('Token IDs:', token_ids, sep='\n')

# Map the IDs back to vocabulary tokens.
tokens_backtranslated = tokenizer.convert_ids_to_tokens(token_ids)
print('Backtranslated tokens:', tokens_backtranslated, sep='\n')
# Notice that it automatically does UNK replacement for us!

In [None]:
from fine_tune import POSTagDataset

# Wrap the tokenized frame in the dataset class imported from fine_tune.
dataset = POSTagDataset(conll2000_dataset, tokenizer)

# Split 80/10/10 into train/val/test. The generator is seeded so that the same
# examples land in the same split on every Restart & Run All; without it the
# held-out test set changes from run to run.
SPLIT_SEED = 42
(train, val, test) = torch.utils.data.random_split(
    dataset, [0.8, 0.1, 0.1],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

print(train[0]) # verify that this has the correct structure

In [None]:
from fine_tune import basic_collate_fn

# Build a two-example minibatch by hand and inspect the collated tensor sizes.
test_minibatch = [train[idx] for idx in range(2)]
batch_in, batch_out = basic_collate_fn(test_minibatch)
for key in ('input_ids', 'attention_mask'):
    print(batch_in[key].size())

In [None]:
def _make_loader(split, shuffle):
    """Wrap a dataset split in a DataLoader using the shared batch size and collate function."""
    return torch.utils.data.DataLoader(
        split, batch_size=64, collate_fn=basic_collate_fn, shuffle=shuffle
    )


# val_loader is shuffled as well: the smoke-test training cell further down also
# passes it to train_model as the training loader.
train_loader = _make_loader(train, shuffle=True)
val_loader = _make_loader(val, shuffle=True)
test_loader = _make_loader(test, shuffle=False)

# Draw one training minibatch and check the tensor sizes it produces.
batch_in, pos_ids = next(iter(train_loader))
for key in ('input_ids', 'attention_mask'):
    print(batch_in[key].size())

In [None]:
from fine_tune import DistilBertForTokenClassification

# Hidden size handed to the classification wrapper.
# NOTE(review): 768 presumably matches the output width of distilbert-base-uncased —
# confirm against bert_model.config if the checkpoint ever changes.
hidden_dim = 768
# Number of output classes. Hard-coded rather than derived from the dataset
# (previously: len(dataset.pos_tags.keys())); the confusion-matrix cell names the
# three classes "left", "center" and "right".
num_pos_tags = 3
# Wrap the pretrained encoder in the classifier imported from fine_tune.
model = DistilBertForTokenClassification(bert_model, hidden_dim, num_pos_tags)

In [None]:
# Sanity-check the forward pass on one minibatch. The module is called directly
# instead of via .forward() so that any registered hooks run as well.
model(**batch_in)

In [None]:
# Run the sample minibatch through the model and print the output size next to
# the number of classes for a visual comparison.
output = model(**batch_in)
print(output.size())
print(num_pos_tags)

In [None]:
from fine_tune import get_loss_fn, calculate_loss

# Build the loss function provided by fine_tune and show which one it is.
loss_fn = get_loss_fn()
print(f"Loss function: {loss_fn}")

# Test loss of randomly generated labels and scores:
# 10 random class ids in [0, num_pos_tags) and a (10, num_pos_tags) score matrix.
labels = torch.randint(0, num_pos_tags, (10,))
print(labels)
scores = torch.rand((10, num_pos_tags))
loss = calculate_loss(scores, labels, loss_fn)
print(f"Loss: {loss}")

# Test loss of padded labels
# NOTE(review): this check treats label 0 as the padding id. Whether the loss really
# ignores label 0 depends on get_loss_fn / calculate_loss in fine_tune — confirm, and
# confirm that 0 is not also a real class id for this dataset.
labels = torch.zeros((10,), dtype=torch.long) # set every label to 0 (the presumed padding id)
labels[0] = 1 # except one; set this one to an arbitrary value
print(labels)
loss = calculate_loss(scores, labels, loss_fn)
print(f"Loss of padded inputs: {loss}")
# Re-randomize the scores of rows 1..9 (the rows labelled 0) and recompute the loss,
# so the two printed losses can be compared.
scores[1:, :] = torch.rand((9, num_pos_tags)) # change the values of the rest of the tokens
print(labels)
loss = calculate_loss(scores, labels, loss_fn)
print(f"Loss of modified padded: {loss}")

In [None]:
# Train on the GPU when one is available. Forcing "cuda" unconditionally makes
# model.to(device) fail on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
from fine_tune import get_optimizer, train_model

# Run these lines to reload the model
bert_model = DistilBertModel.from_pretrained(model_name)
model = DistilBertForTokenClassification(bert_model, hidden_dim, num_pos_tags)

# Run some test optimization
# NOTE(review): val_loader is passed in both loader positions (the hyper-parameter
# search cell passes train_loader, val_loader there), so the validation numbers of
# this run are measured on the data being fit. Presumably a deliberate quick smoke
# test on the small split — confirm.
model.to(device)
optim = get_optimizer(model, lr=5e-5, weight_decay=0)
best_model, stats = train_model(model, val_loader, val_loader, optim,
                                num_epoch=25, collect_cycle=5, device=device)

In [None]:
from fine_tune import plot_loss
# Plot the statistics returned by the most recent train_model call.
plot_loss(stats)

In [None]:
from fine_tune import get_optimizer, train_model

# Run these lines to reload the model
bert_model = DistilBertModel.from_pretrained(model_name)
model = DistilBertForTokenClassification(bert_model, 768, num_pos_tags)

# Run the full optimization
model.to(device)
optim = get_optimizer(model, lr=0.0001, weight_decay=0)
# Actually train on the training split. Without this call the next cell's
# plot_loss(stats) would re-plot the statistics of the earlier smoke test.
# NOTE(review): num_epoch / collect_cycle mirror the smoke-test cell — adjust as needed.
best_model, stats = train_model(model, train_loader, val_loader, optim,
                                num_epoch=25, collect_cycle=5, device=device)

In [None]:
from fine_tune import plot_loss
# Plot whatever `stats` currently holds, i.e. the result of the most recent
# train_model call that assigned it.
plot_loss(stats)

In [None]:
import itertools
from tqdm.notebook import tqdm
from fine_tune import get_hyper_parameters, plot_loss

def search_param_utterance():
    """Grid-search the learning rate and weight decay of the classifier.

    Every (learning rate, weight decay) pair from ``get_hyper_parameters()``
    gets a freshly loaded DistilBERT wrapped in
    ``DistilBertForTokenClassification``. Each candidate is trained with
    ``train_model`` on ``train_loader`` / ``val_loader``, and the run with the
    highest ``stats["accuracy"]`` is kept. The winning settings and accuracy
    are printed and the winning run's statistics are passed to ``plot_loss``.

    Returns:
        The model returned by ``train_model`` for the best run, or ``None``
        when the hyper-parameter grid is empty.
    """
    learning_rate, weight_decay = get_hyper_parameters()
    print("learning rate from: {}\nweight_decay from: {}".format(
        learning_rate, weight_decay
    ))
    best_model, best_stats = None, None
    # Start below any attainable accuracy so the first finished run is always
    # recorded, even when it scores exactly 0.
    best_accuracy, best_lr, best_wd = float("-inf"), None, None
    for lr, wd in tqdm(itertools.product(learning_rate, weight_decay),
                       total=len(learning_rate) * len(weight_decay)):
        # Reload the pretrained weights so runs never share fine-tuned state.
        bert = DistilBertModel.from_pretrained(model_name)
        model = DistilBertForTokenClassification(bert, 768, num_pos_tags).to(device)
        optim = get_optimizer(model, lr=lr, weight_decay=wd)
        model, stats = train_model(model, train_loader, val_loader, optim,
                                   num_epoch=24, collect_cycle=20, device=device)
        if stats["accuracy"] > best_accuracy:
            best_model = model
            best_stats = stats
            best_accuracy = stats["accuracy"]
            best_lr = lr
            best_wd = wd

    if best_stats is None:
        # Nothing was trained (empty grid): there is nothing to report or plot.
        print("\n\nNo hyper-parameter combination was evaluated.")
        return None

    print("\n\nBest learning rate: {}, best weight_decay: {}".format(best_lr, best_wd))
    print("Accuracy: {:.4f}".format(best_accuracy))
    plot_loss(best_stats)
    return best_model


best_model = search_param_utterance()

In [None]:
from fine_tune import get_validation_performance

# Evaluate the selected model on the held-out test loader, using the notebook-wide
# `device` instead of a hard-coded 'cuda' so the cell also runs on CPU-only machines.
get_validation_performance(best_model, get_loss_fn(), test_loader, device)

In [None]:
from fine_tune import make_prediction

# Predict on the test loader with the notebook-wide `device` (not a hard-coded 'cuda').
y_true, y_pred, errors = make_prediction(best_model, test_loader, device)

# Show one example next to ITS OWN gold and predicted label; the original printed
# example 0 beside the labels of example 1.
# NOTE(review): assumes make_prediction returns one label per example in loader
# order (test_loader is built with shuffle=False) — confirm.
sample_idx = 0
print(test[sample_idx]['tokens'], y_true[sample_idx], y_pred[sample_idx])
print(errors)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt

# Class names in label-id order.
# NOTE(review): assumes the labels are encoded 0=left, 1=center, 2=right — confirm
# against the 'bias' column of the dataset.
labels = ["left", "center", "right"]
# Pin the label set so the matrix always has one row/column per class name, even
# when a class never occurs in y_true / y_pred for this split; otherwise the matrix
# would shrink and no longer match the three display labels.
cm = confusion_matrix(y_true, y_pred, labels=list(range(len(labels))), normalize="true")
_, ax = plt.subplots(figsize=(3, 3))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
plt.title("Normalized confusion matrix")
plt.show()

In [None]:
# Read the 'tokens' and 'pos_tags' columns of conll2000_test_unknown.csv.
conll2000_dataset_test = pd.read_csv('conll2000_test_unknown.csv', usecols=('tokens', 'pos_tags'))
# Each cell holds a Python literal serialized as text; parse it back into an object.
for column in ('tokens', 'pos_tags'):
    conll2000_dataset_test[column] = conll2000_dataset_test[column].apply(ast.literal_eval)
display(conll2000_dataset_test)