In [1]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from matplotlib import pyplot
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report
from transformers import AutoTokenizer, BertForSequenceClassification
import evaluate
from datasets import Dataset, load_dataset

  from .autonotebook import tqdm as notebook_tqdm
2025-04-15 22:25:46.859251: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744748746.879544  241746 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744748746.885685  241746 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744748746.901631  241746 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744748746.901654  241746 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744748746.901656  241746

In [2]:
# the given loading function, modified for BERT:
def preprocess_pandas(data, columns):
    df_ = pd.DataFrame(columns=columns)
    data['Sentence'] = data['Sentence'].str.lower()
    data['Sentence'] = data['Sentence'].replace('[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', '', regex=True)                      # remove emails
    data['Sentence'] = data['Sentence'].replace('((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '', regex=True)    # remove IP address
    data['Sentence'] = data['Sentence'].str.replace('[^\w\s]','')                                                       # remove special characters
    data['Sentence'] = data['Sentence'].replace('\d', '', regex=True)                                                   # remove numbers
    for index, row in data.iterrows():
        word_tokens = word_tokenize(row['Sentence'])
        filtered_sent = [w for w in word_tokens if not w in stopwords.words('english')]
        df_.loc[len(df_)] = {
            "index": row['index'],
            "Class": row['Class'],
            "Sentence": " ".join(filtered_sent)
        }
    return data

def load_data(data_path):
    # get data, pre-process and split
    data = pd.read_csv(data_path, delimiter='\t', header=None)
    data.columns = ['Sentence', 'Class']
    data['index'] = data.index                                          # add new column index
    columns = ['index', 'Class', 'Sentence']
    data = preprocess_pandas(data, columns)                             # pre-process
    training_data, validation_data, training_labels, validation_labels = train_test_split( # split the data into training, validation, and test splits
        data['Sentence'].values.astype('U'),
        data['Class'].values.astype('int32'),
        test_size=0.10,
        random_state=0,
        shuffle=True
    )
    return training_data, validation_data, training_labels, validation_labels
    


In [3]:
sequence_classification_model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-cased-finetuned-mrpc')
sequence_classification_tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-cased-finetuned-mrpc')


Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main
Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main


# Evaluation of pretrained, not finetuned model

First, a pretrained "Bert for sequence classification" model trained on performing binary sentiment analysis on yelp reviews is performed. The creators claim an accuracy of 0.9699 on the original yelp dataset (https://huggingface.co/textattack/bert-base-uncased-yelp-polarity).

Our hypothesis is that this can be transferred to amazon reviews.

In [4]:
tokenizer = AutoTokenizer.from_pretrained("textattack/bert-base-uncased-yelp-polarity")

model = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-yelp-polarity")

training_data, validation_data, training_labels, validation_labels = load_data("amazon_cells_labelled.txt")

#### For SMALL dataset TRAINING data
encoded_inputs = tokenizer(list(training_data), padding=True, truncation=True, return_tensors="pt")
# Perform inference
with torch.no_grad():
    outputs = model(**encoded_inputs)
    logits = outputs.logits
    predicted_class_ids_training = logits.argmax(dim=1).tolist()
# evaluation
print("Training data:")
print(sum(training_labels == predicted_class_ids_training)/len(training_labels))

### For SMALL dataset VALIDATION data

encoded_inputs = tokenizer(list(validation_data), padding=True, truncation=True, return_tensors="pt")

# Perform inference
with torch.no_grad():
    outputs = model(**encoded_inputs)
    logits = outputs.logits
    predicted_class_ids_validation = logits.argmax(dim=1).tolist()
# evaluation
print("Validation data:")
print(sum(validation_labels == predicted_class_ids_validation)/len(validation_labels))

Training data:
0.92
Validation data:
0.93


As can be seen above, the pretrained but not finetuned model already performs surprisingy well with correctly identifying 92 % of the sentiments in the smaller (1000) dataset's training dataset and 93 % of the validation dataset. Some sentences that where misclassified can be found in the following:

In [5]:
print("GT\t Pred.\t Sequence")

for i in range(len(training_labels)):
    if training_labels[i] != predicted_class_ids_training[i]:
        print(training_labels[i], "\t", predicted_class_ids_training[i], "\t", training_data[i])

GT	 Pred.	 Sequence
0 	 1 	 the one big drawback of the mp player is that the buttons on the phone's front cover that let you pause and skip songs lock out after a few seconds.
1 	 0 	 nice sound.
1 	 0 	 it plays louder than any other speaker of this size; the price is so low that most would think the quality is lacking, however, it's not.
1 	 0 	 no shifting, no bubbling, no peeling, not even a scratch, nothing!i couldn't be more happier with my new one for the droid.
1 	 0 	 also its slim enough to fit into my alarm clock docking station without removing the case.
0 	 1 	 when i placed my treo into the case, not only was it not snug, but there was a lot of extra room on the sides.
0 	 1 	 battery lasts only a few hours.
1 	 0 	 gets a signal when other verizon phones won't.
0 	 1 	 this product is very high quality chinese crap!!!!!!
0 	 1 	 lasted one day and then blew up.
0 	 1 	 returned  hours later.
0 	 1 	 it's so stupid to have to keep buying new chargers, car chargers, cradl

For most of the sequences it is obvious why there is a misclassification, such as a short text length; and formulations as "the only thing that disappoints me is..." which is labelled as "negative sentiment" in the GT but implies that the whole review is mostly positive, which is picked up by Bert. Also, the term "China" seems to be associated with negative sentiments which is not true for sentences as "appears to actually outperform the original battery from china that came with my vi."

# Finetuning the model
on the small dataset

In [6]:
training_data, validation_data, training_labels, validation_labels = load_data("amazon_cells_labelled.txt")

tokenizer = AutoTokenizer.from_pretrained("textattack/bert-base-uncased-yelp-polarity")
def tokenize(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

model = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-yelp-polarity", num_labels=2)

#### For SMALL dataset TRAINING data
#encoded_training = tokenizer(list(training_data), padding=True, truncation=True, return_tensors="pt")
training_dataset = Dataset.from_dict({"text": training_data, "label": training_labels})
training_dataset = training_dataset.map(tokenize, batched=True)

#encoded_validation = tokenizer(list(validation_data), padding=True, truncation=True, return_tensors="pt")
validation_dataset = Dataset.from_dict({"text": validation_data, "label": validation_labels})
validation_dataset = validation_dataset.map(tokenize, batched=True)

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # convert the logits to their predicted class
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="amazon_review_classifier",
    eval_strategy="epoch",
    push_to_hub=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False, 
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

Map: 100%|██████████| 900/900 [00:00<00:00, 3308.81 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 3600.48 examples/s]


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.080682,0.96
2,No log,0.113173,0.96
3,No log,0.119065,0.96


TrainOutput(global_step=339, training_loss=0.11766228914964164, metrics={'train_runtime': 118.6846, 'train_samples_per_second': 22.749, 'train_steps_per_second': 2.856, 'total_flos': 710399849472000.0, 'train_loss': 0.11766228914964164, 'epoch': 3.0})

In [7]:
### For SMALL dataset VALIDATION data

encoded_inputs = tokenizer(list(validation_data), padding=True, truncation=True, return_tensors="pt")
encoded_inputs = encoded_inputs.to("cuda")
# Perform inference
with torch.no_grad():
    outputs = model(**encoded_inputs)
    logits = outputs.logits
    predicted_class_ids_validation = logits.argmax(dim=1).tolist()
# evaluation
print("Validation data:")
print(sum(validation_labels == predicted_class_ids_validation)/len(validation_labels))


Validation data:
0.96


As can be seen, even by finetuning on the small dataset, the accuracy on the validation data can be increased to 96%.

In [8]:
for i in range(len(validation_labels)):
    if validation_labels[i] != predicted_class_ids_validation[i]:
        print(validation_labels[i], "\t", predicted_class_ids_validation[i], "\t", validation_data[i])

1 	 0 	 you'll love how thin it is.
0 	 1 	 battery life still not long enough in motorola razor vi.
0 	 1 	 the real killer is the volume, and of course it breaking.
0 	 1 	 the biggest complaint i have is, the battery drains superfast.


The accuracy is really good on the validation dataset, one of the few sequences that are labeled wrong is "the real killer is the volume, and of course it breaking.", which is seen as negative in the GT but as positive by the classifier. It is probably very hard for a language model to detect sarcasm, even more without context.

But the validation dataset only contains 100 sequences, so probably we were just lucky. Let's test the model on the validation part of the big dataset (25000 lines)!

In [9]:
### For LARGE dataset VALIDATION data
training_data, validation_data, training_labels, validation_labels = load_data("amazon_cells_labelled_LARGE_25K.txt")

encoded_inputs = tokenizer(list(validation_data), padding=True, truncation=True, return_tensors="pt")
encoded_inputs = encoded_inputs.to("cuda")
# Perform inference
with torch.no_grad():
    outputs = model(**encoded_inputs)
    logits = outputs.logits
    predicted_class_ids_validation = logits.argmax(dim=1).tolist()
# evaluation
print("Validation data:")
print(sum(validation_labels == predicted_class_ids_validation)/len(validation_labels))

Validation data:
0.922


This does not look much better than the pretrained model. So let's train it on the whole training data of the 25k dataset!

In [15]:
tokenizer = AutoTokenizer.from_pretrained("textattack/bert-base-uncased-yelp-polarity")
def tokenize(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

model = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-yelp-polarity", num_labels=2)

#### For SMALL dataset TRAINING data
#encoded_training = tokenizer(list(training_data), padding=True, truncation=True, return_tensors="pt")
training_dataset = Dataset.from_dict({"text": training_data, "label": training_labels})
training_dataset = training_dataset.map(tokenize, batched=True)

#encoded_validation = tokenizer(list(validation_data), padding=True, truncation=True, return_tensors="pt")
validation_dataset = Dataset.from_dict({"text": validation_data, "label": validation_labels})
validation_dataset = validation_dataset.map(tokenize, batched=True)

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # convert the logits to their predicted class
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="amazon_review_classifier",
    eval_strategy="epoch",
    push_to_hub=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False, 
    save_strategy="epoch",
)
model = model.to("cuda")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

Map: 100%|██████████| 22500/22500 [00:06<00:00, 3537.89 examples/s]
Map: 100%|██████████| 2500/2500 [00:00<00:00, 3579.86 examples/s]


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3218,0.241744,0.9304
2,0.1759,0.249191,0.9364
3,0.1424,0.297812,0.9348


TrainOutput(global_step=8439, training_loss=0.20060898769182933, metrics={'train_runtime': 2568.5533, 'train_samples_per_second': 26.279, 'train_steps_per_second': 3.286, 'total_flos': 1.77599962368e+16, 'train_loss': 0.20060898769182933, 'epoch': 3.0})

In [16]:
### For LARGE dataset VALIDATION data
training_data, validation_data, training_labels, validation_labels = load_data("amazon_cells_labelled_LARGE_25K.txt")

encoded_inputs = tokenizer(list(validation_data), padding=True, truncation=True, return_tensors="pt")
encoded_inputs = encoded_inputs.to("cuda")
# Perform inference
with torch.no_grad():
    outputs = model(**encoded_inputs)
    logits = outputs.logits
    predicted_class_ids_validation = logits.argmax(dim=1).tolist()
# evaluation
print("Validation data:")
print(sum(validation_labels == predicted_class_ids_validation)/len(validation_labels))

Validation data:
0.9304


In [2]:
def preprocess_pandas(data, columns):
    df_ = pd.DataFrame(columns=columns)
    data['Sentence'] = data['Sentence'].str.lower()
    data['Sentence'] = data['Sentence'].replace('[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', '', regex=True)                      # remove emails
    data['Sentence'] = data['Sentence'].replace('((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '', regex=True)    # remove IP address
    data['Sentence'] = data['Sentence'].str.replace('[^\w\s]','')                                                       # remove special characters
    data['Sentence'] = data['Sentence'].replace('\d', '', regex=True)                                                   # remove numbers
    for index, row in data.iterrows():
        word_tokens = word_tokenize(row['Sentence'])
        filtered_sent = [w for w in word_tokens if not w in stopwords.words('english')]
        df_.loc[len(df_)] = {
            "index": row['index'],
            "Class": row['Class'],
            "Sentence": " ".join(filtered_sent)
        }
    return data

# If this is the primary file that is executed (ie not an import of another file)
def load_data_EXTRALARGE(data_path, max_lines = None, val_size = 0.1):
    if max_lines == None:
        training_xlarge = pd.read_csv(data_path, delimiter=',', header=None)        
    else:
        training_xlarge = pd.read_csv(data_path, delimiter=',', header=None).sample(max_lines, random_state = 0)            # choose a sample of size 200000
    #test_xlarge = pd.read_csv("test.csv", delimiter=',', header=None)
    training_xlarge.columns = [ 'Class', 'Title', 'Sentence']
    training_xlarge['index'] = training_xlarge.index                                          # add new column index
    columns = ['index', 'Class', 'Sentence']
    training_xlarge = preprocess_pandas(training_xlarge, columns)                             # pre-process
    test_size = val_size
    training_data_xl, validation_data_xl, training_labels_xl, validation_labels_xl = train_test_split( # split the data into training, validation, and test splits
        training_xlarge['Sentence'].values.astype('U'),
        training_xlarge['Class'].values.astype('int32'),
        test_size=0.10,
        random_state=0,
        shuffle=True
    )
    training_labels_xl -= 1                       # mapping from {1, 2} to {0, 1}
    validation_labels_xl -= 1
    return training_data_xl, validation_data_xl, training_labels_xl, validation_labels_xl


In [21]:
training_data, validation_data, training_labels, validation_labels = load_data_EXTRALARGE("test.csv", 200000)

tokenizer = AutoTokenizer.from_pretrained("textattack/bert-base-uncased-yelp-polarity")
def tokenize(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

model = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-yelp-polarity", num_labels=2)

#### For SMALL dataset TRAINING data
#encoded_training = tokenizer(list(training_data), padding=True, truncation=True, return_tensors="pt")
training_dataset = Dataset.from_dict({"text": training_data, "label": training_labels})
training_dataset = training_dataset.map(tokenize, batched=True)

#encoded_validation = tokenizer(list(validation_data), padding=True, truncation=True, return_tensors="pt")
validation_dataset = Dataset.from_dict({"text": validation_data, "label": validation_labels})
validation_dataset = validation_dataset.map(tokenize, batched=True)

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # convert the logits to their predicted class
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="amazon_review_classifier",
    eval_strategy="epoch",
    push_to_hub=False,
)
model = model.to("cuda")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

Map: 100%|██████████| 180000/180000 [01:04<00:00, 2785.85 examples/s]
Map: 100%|██████████| 20000/20000 [00:06<00:00, 2909.50 examples/s]


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3024,0.280662,0.91845
2,0.2366,0.303261,0.9247
3,0.1748,0.288119,0.9341


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



TrainOutput(global_step=67500, training_loss=0.26074552216706454, metrics={'train_runtime': 20868.6396, 'train_samples_per_second': 25.876, 'train_steps_per_second': 3.235, 'total_flos': 1.420799698944e+17, 'train_loss': 0.26074552216706454, 'epoch': 3.0})

In [28]:
model.save_pretrained("model_trained_on_EXTRALARGE_dataset.pt")

In [5]:
### For LARGE dataset VALIDATION data
import time
start = time.time()
training_data, validation_data, training_labels, validation_labels = load_data_EXTRALARGE("test.csv", max_lines = 10001, val_size = 10000)
tokenizer = AutoTokenizer.from_pretrained("textattack/bert-base-uncased-yelp-polarity")
def tokenize(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

model = BertForSequenceClassification.from_pretrained("model_trained_on_EXTRALARGE_dataset").to("cuda")
encoded_inputs = tokenizer(list(validation_data), padding=True, truncation=True, return_tensors="pt")
encoded_inputs = encoded_inputs.to("cuda")
# Perform inference
with torch.no_grad():
    outputs = model(**encoded_inputs)
    logits = outputs.logits
    predicted_class_ids_validation = logits.argmax(dim=1).tolist()
# evaluation
end = time.time()
print("Distribution of the sentiments in the test dataset:")
print("positive: ", sum(validation_labels == 1)/len(validation_labels), "\t negative: ", 1-sum(validation_labels == 1)/len(validation_labels))
print("Accuracy:")
print(sum(validation_labels == predicted_class_ids_validation)/len(validation_labels))
TP = sum( (validation_labels == predicted_class_ids_validation) & (validation_labels == 1) )
FP = sum( (validation_labels != predicted_class_ids_validation) & (validation_labels == 1) )
FN = sum( (validation_labels != predicted_class_ids_validation) & (validation_labels == 0) )
F1 = 2*TP / (2*TP + FP + FN)

print("Precision: \t", TP/(TP+FP))
print("Recall: \t", TP/(TP+FN))
print("F1 Score: \t", F1)

print("with regard to positive sentiments.")
print("Prediction runtime: \t", end - start)

Distribution of the sentiments in the test dataset:
positive:  0.5234765234765235 	 negative:  0.4765234765234765
Accuracy:
0.957042957042957
Precision: 	 0.9656488549618321
Recall: 	 0.9529190207156308
F1 Score: 	 0.9592417061611375
with regard to positive sentiments.
Prediction runtime: 	 87.33383631706238


# But is it really necessary to use an already finetuned model?

Let's try a blank BERT model, and to save time, we will only train on 100,000 lines. 

In [8]:
training_data, validation_data, training_labels, validation_labels = load_data_EXTRALARGE("test.csv", 100000)

In [9]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
def tokenize(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)
    
#### For SMALL dataset TRAINING data
#encoded_training = tokenizer(list(training_data), padding=True, truncation=True, return_tensors="pt")
training_dataset = Dataset.from_dict({"text": training_data, "label": training_labels})
training_dataset = training_dataset.map(tokenize, batched=True)

#encoded_validation = tokenizer(list(validation_data), padding=True, truncation=True, return_tensors="pt")
validation_dataset = Dataset.from_dict({"text": validation_data, "label": validation_labels})
validation_dataset = validation_dataset.map(tokenize, batched=True)

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # convert the logits to their predicted class
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="amazon_review_classifier",
    eval_strategy="epoch",
    push_to_hub=False,
)
model = model.to("cuda")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()
model.save_pretrained("model_trained_on_EXTRALARGE_dataset_not_pretrained")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 90000/90000 [00:33<00:00, 2722.33 examples/s]
Map: 100%|██████████| 10000/10000 [00:03<00:00, 2828.63 examples/s]


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2578,0.261475,0.9287
2,0.1817,0.369631,0.924
3,0.1071,0.299778,0.9384


In [3]:
### For LARGE dataset VALIDATION data
import time
start = time.time()
training_data, validation_data, training_labels, validation_labels = load_data_EXTRALARGE("test.csv", max_lines = 10001, val_size = 10000)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def tokenize(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

model = BertForSequenceClassification.from_pretrained("model_trained_on_EXTRALARGE_dataset_not_pretrained").to("cuda")
encoded_inputs = tokenizer(list(validation_data), padding=True, truncation=True, return_tensors="pt")
encoded_inputs = encoded_inputs.to("cuda")
# Perform inference
with torch.no_grad():
    outputs = model(**encoded_inputs)
    logits = outputs.logits
    predicted_class_ids_validation = logits.argmax(dim=1).tolist()
# evaluation
end = time.time()
print("Distribution of the sentiments in the test dataset:")
print("positive: ", sum(validation_labels == 1)/len(validation_labels), "\t negative: ", 1-sum(validation_labels == 1)/len(validation_labels))
print("Accuracy:")
print(sum(validation_labels == predicted_class_ids_validation)/len(validation_labels))
TP = sum( (validation_labels == predicted_class_ids_validation) & (validation_labels == 1) )
FP = sum( (validation_labels != predicted_class_ids_validation) & (validation_labels == 1) )
FN = sum( (validation_labels != predicted_class_ids_validation) & (validation_labels == 0) )
F1 = 2*TP / (2*TP + FP + FN)

print("Precision: \t", TP/(TP+FP))
print("Recall: \t", TP/(TP+FN))
print("F1 Score: \t", F1)

print("with regard to positive sentiments.")
print("Prediction runtime: \t", end - start)

Distribution of the sentiments in the test dataset:
positive:  0.5234765234765235 	 negative:  0.4765234765234765
Accuracy:
0.987012987012987
Precision: 	 0.9923664122137404
Recall: 	 0.9829867674858223
F1 Score: 	 0.9876543209876543
with regard to positive sentiments.
Prediction runtime: 	 79.30403590202332


Turns out that this is **even better**, an accuracy of 99.2 % is reached compared to the 95.7 % of the model that was finetuned on yelp reviews before. It shows that it can be a good idea to check whether a blank top layer performs better.

# Task 3: Comparison:
As can be seen above, tranformer models offer a much higher precision in Language Processing than normal feed forward neural networks. They runtime of the models is also mostly the same.