# This code was run on Google Colab; on a standard CPU the execution time will be extremely long.

In [None]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# Upload files into the Colab runtime; `uploaded` is later indexed by file name
# ('data_trustpilot.csv') to obtain that file's raw bytes.
from google.colab import files
uploaded = files.upload()

In [None]:
# Display whether PyTorch can see a CUDA device in this runtime
torch.cuda.is_available()

In [None]:
# Prefer the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Smoke test: build a tiny tensor and move it onto the selected device.
tensor = torch.tensor([1, 2, 3])
tensor_on_gpu = tensor.to(device=device)

In [None]:
import io

# Wrap the uploaded file's bytes in an in-memory buffer so pandas can parse it as CSV.
csv_buffer = io.BytesIO(uploaded['data_trustpilot.csv'])
df = pd.read_csv(csv_buffer)

# Preview the first rows
df.head()

In [None]:
# Display how many reviews there are per rating value (class distribution before balancing)
df["rating"].value_counts()

In [None]:
def _downsample_to_smallest_class(frame, label_column, random_state=42):
    """Return ``frame`` with every class reduced to the size of the smallest one.

    Args:
        frame: DataFrame holding one row per sample.
        label_column: Name of the column containing the class label.
        random_state: Seed passed to ``DataFrame.sample`` for each class.

    Returns:
        A DataFrame (original index preserved) in which every class found in
        ``label_column`` has exactly as many rows as the smallest class.
        Downsampled classes come first (in ascending label order), followed by
        the classes that were already at the minimum size and are kept as-is.
    """
    if frame.empty:
        return frame.copy()

    target_size = frame[label_column].value_counts().min()

    downsampled, already_minimal = [], []
    for _, class_rows in frame.groupby(label_column, sort=True):
        if len(class_rows) > target_size:
            downsampled.append(
                class_rows.sample(n=target_size, random_state=random_state)
            )
        else:
            already_minimal.append(class_rows)

    # Sampled classes first, untouched classes last: when rating 2 is the only
    # smallest class this yields the same row order as the previous
    # hand-written version (1, 3, 4, 5, then 2), so the seeded shuffle below
    # gives the same result.
    return pd.concat(downsampled + already_minimal)


# Find the size of the smallest class
smallest_class_size = df['rating'].value_counts().min()

# Downsample every class that is larger than the smallest one. No assumption
# is made about which rating is the minority class.
df_balanced = _downsample_to_smallest_class(df, 'rating', random_state=42)

# Shuffle the dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the new class distribution
print(df_balanced['rating'].value_counts())

In [None]:
# Shift the ratings down by one so the labels start at 0 instead of 1
# (the classifier below is created with num_labels=5; presumably it expects
# class ids 0..4 — confirm against the model documentation).
# NOTE: this cell is not idempotent — re-running it subtracts 1 again.
df_balanced['rating'] = df_balanced['rating'] - 1

In [None]:
# Name of the DataFrame column whose text is fed to the tokenizer
input_column = 'text'


# Tokenization and Dataset Preparation
class ReviewDataset(Dataset):
    """Dataset that tokenizes one review per item and pairs it with its rating.

    The text is read from the column named by the module-level
    ``input_column`` and the label from the ``'rating'`` column.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        """Store the data source and tokenization settings.

        Args:
            dataframe: DataFrame with the text column and a ``'rating'`` column.
            tokenizer: Callable accepting ``(text, max_length=..., truncation=...,
                padding=..., return_tensors=...)`` and returning a mapping with
                ``'input_ids'`` and ``'attention_mask'`` tensors.
            max_length: Length every sample is truncated/padded to.
        """
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Tokenize the row at position ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (batch axis
            removed) and ``'labels'`` as a ``torch.long`` scalar tensor.
        """
        row = self.dataframe.iloc[idx]
        encoded = self.tokenizer(
            row[input_column],
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )
        # The tokenizer is called on a single text, so its tensors are expected
        # to carry a leading batch axis of size 1. Remove only that axis: a
        # bare squeeze() would also collapse the sequence axis when it has
        # length 1 and hand back a 0-d tensor.
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': torch.tensor(row['rating'], dtype=torch.long),
        }

# Initialize tokenizer and model from the same pretrained checkpoint name.
# num_labels=5 gives the classification head one output per rating class.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=5)
# Move the model to the device selected earlier (GPU when available)
model.to(device)

# Tokenization length shared by every dataset below
max_length = 128

# Dataset over the complete balanced frame
dataset = ReviewDataset(df_balanced, tokenizer, max_length)

# Hold out 20% of the rows for testing (seeded for reproducibility)
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df_balanced, random_state=42, test_size=0.2)

# Wrap both splits with the same tokenizer settings
train_dataset, test_dataset = (
    ReviewDataset(split_df, tokenizer, max_length)
    for split_df in (train_df, test_df)
)

# Training Arguments
# Hyperparameters and checkpoint/logging policy handed to the Trainer below.
training_args = TrainingArguments(
    output_dir="./results",  # directory for checkpoints and trainer output
    evaluation_strategy="epoch",  # evaluate once per epoch
    save_strategy="epoch",  # checkpoint once per epoch (same cadence as evaluation)
    logging_strategy="steps",
    logging_steps=10,  # log every 10 steps
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_total_limit=1,  # cap on the number of checkpoints kept
    load_best_model_at_end=True,  # reload the best checkpoint when training finishes
    metric_for_best_model="accuracy",  # matches the key returned by compute_metrics
    report_to="none"  # no external experiment-tracking integration
)

# Define the evaluation metric, then initialize the Trainer
def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` evaluation pair.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``. ``logits`` is
            an array-like of shape (n_samples, n_classes); if it is a tuple of
            model outputs, its first element is used. ``labels`` is an
            array-like of integer class ids.

    Returns:
        dict with a single key ``"accuracy"``: the fraction of samples whose
        highest-scoring class equals the label, as a Python float.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra outputs alongside the logits.
        logits = logits[0]
    predictions = np.argmax(np.asarray(logits), axis=1)
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}

# Wire the model, data splits, tokenizer and metric function into a Trainer.
# NOTE(review): eval_dataset is the held-out test split, so the per-epoch
# evaluation and the best-checkpoint selection configured in training_args
# (load_best_model_at_end / metric_for_best_model) use the same data that is
# later reported as test results — consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model on the Trainer's eval_dataset (the test split) and print the metrics
results = trainer.evaluate()
print(f"Test Results: {results}")

# Save the model to a directory
# NOTE(review): both paths live under /content/drive, but no drive.mount call
# is visible in this notebook — confirm Google Drive is mounted before running,
# otherwise the files may end up on the runtime's local disk.
model_save_path = "/content/drive/MyDrive/finetuned_BERT_unpr_model"
trainer.save_model(model_save_path)

# Save the tokenizer as well (to its own directory)
model_save_path_2 = "/content/drive/MyDrive/finetuned_BERT_unpr_tokenizer"
tokenizer.save_pretrained(model_save_path_2)

In [None]:
# Classification report and confusion matrix for the test split
from sklearn.metrics import classification_report, confusion_matrix

predictions = trainer.predict(test_dataset)

# Reference labels come from the prediction output; the predicted class is the
# index of the highest score in each row.
true_labels = predictions.label_ids
predicted_labels = predictions.predictions.argmax(axis=1)

for build_table in (classification_report, confusion_matrix):
    print(build_table(true_labels, predicted_labels))