# BERT and Torch

In [10]:
%reload_ext autoreload
%autoreload 2

In [11]:
import bz2
import os
import pickle
import scipy as sp
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

sys.path.append(os.path.abspath('../src'))
from fact_classification import *

## Load datafiles

In [12]:
df, df_crowdsourced, df_ground_truth = data_loading(local=True)
df['Sentiment'] = df.Sentiment.fillna(df.Sentiment[df.Verdict == -1].mean())

## Load features
Load the features matrix that we generated in the `feature_generation.ipynb` notebook. This is a large sparse matrix so ww convert it to Compressed Sparse Row (CSR) format to avoid running out of memory when fitting our models.

In [13]:
with bz2.open('../results/df_features.bz2') as f:
    df_features = pickle.load(f)

# Convert to compressed sparse row matrix
X = sp.sparse.csr_matrix(df_features)

## Split data and generate indexes

We split the dataset according to the instructions in the assignment, where data up until and including year 2008 will be used for training, and data after 2008 will be used for testing. Here we also generate indexes for the various feature sets.

In [14]:
df_train, df_test, idx_train = test_train_split(df)

y = df['Verdict']
y_train = df_train['Verdict']
y_test = df_test['Verdict']

X_train = X[idx_train]
X_test = X[~idx_train]

# Column index for the numeric columns Sentiment and Length
col_idx_n = (df_features.columns == 'Sentiment') | (df_features.columns == 'Length')

# Column index for TF-IDF features on the raw Text column with n-grams=1
col_idx_w1 = df_features.columns.str.startswith('W1_')

# Column index for TF-IDF features on the raw Text column with n-grams=2
col_idx_w2 = df_features.columns.str.startswith('W2_')

# Column index for TF-IDF features on the stemmed text with n-grams=1
col_idx_ws = df_features.columns.str.startswith('WS_')

# Column index for POS features
col_idx_p = df_features.columns.str.startswith('P_')

# Column index for NER labels
col_idx_e = df_features.columns.str.startswith('E_')

In [15]:
# Preprocessing: Tokenize using the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_and_encode(texts):
    return tokenizer.batch_encode_plus(
        texts,
        max_length=256,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )

X_train = tokenize_and_encode(train_df["Text"].tolist())
X_test = tokenize_and_encode(test_df["Text"].tolist())

y_train = torch.tensor(train_df["Verdict"].tolist()).add(1)  # Add 1 to shift labels from [-1, 0, 1] to [0, 1, 2]
y_test = torch.tensor(test_df["Verdict"].tolist()).add(1)

# Create a PyTorch dataset
class CrowdsourcedDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CrowdsourcedDataset(X_train, y_train)
test_dataset = CrowdsourcedDataset(X_test, y_test)

# Instantiate the BERT model
model = BertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

# Define the Trainer and TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the BERT classifier
trainer.train()

# Test and evaluate the classifier
y_pred = trainer.predict(test_dataset).predictions.argmax(axis=-1)

# Shift labels back to original range [-1, 0, 1]
y_pred = y_pred - 1
y_test = y_test - 1

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

print("Classification Report:")
print(classification_report(y_test, y_pred))


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


NameError: name 'train_df' is not defined