In [1]:
import torch

# Report whether CUDA is usable and which cuDNN build PyTorch sees.
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"cuDNN Enabled: {torch.backends.cudnn.enabled}")
print(f"cuDNN Version: {torch.backends.cudnn.version()}")

CUDA Available: True
cuDNN Enabled: True
cuDNN Version: 90100


In [2]:
import pandas as pd
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import datasets
from datasets import Dataset
import torch
from sklearn.model_selection import train_test_split
from peft import LoraConfig, get_peft_model, TaskType
import numpy as np
import evaluate

In [3]:
import os

# Request CUDA device-side assertions through the environment.
# NOTE(review): torch has already been imported by the time this cell runs;
# confirm the variable is still honoured when set this late.
os.environ.update({"PYTORCH_USE_CUDA_DSA": "1"})

In [4]:
# Use the first GPU when CUDA is usable, otherwise fall back to the CPU so the
# notebook still runs on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [5]:
# Display the selected torch device.
device

device(type='cuda', index=0)

In [None]:
# To run on the CPU instead, uncomment the next line:
# device = torch.device("cpu")

## Read the training data ##

In [6]:
# Load the raw reviews file. low_memory=False makes pandas infer each column's
# dtype from the whole column instead of chunk by chunk.
# NOTE(review): the recorded run of this cell emitted a warning pointing at this
# call, presumably the mixed-type DtypeWarning that this option addresses - confirm.
df1 = pd.read_csv('1429_1.csv', delimiter=',', low_memory=False)

  df1 = pd.read_csv('1429_1.csv', delimiter=',')


In [7]:
# Keep only the label column (rating) and the input column (review text).
df1 = df1.loc[:, ['reviews.rating', 'reviews.text']]

Separate the data into training and dev sets. Note that the variable named `test_df` holds the training split.

In [8]:
# 70/30 split with a fixed seed. Despite its name, `test_df` is the 70% portion
# that is later tokenized into the training set; `dev_df` is the 30% portion
# used for evaluation.
test_df, dev_df = train_test_split(
    df1,
    random_state=42,
    test_size=0.3,
)

Use the BERT model for classification

In [9]:
# Checkpoint name shared by the tokenizer here and the classification model later.
model_name = "bert-base-uncased"
# Load the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)



Define a tokenize function that converts the review text into model inputs and attaches the rating as the label

In [10]:
def tokenize_function(examples):
    """Tokenize a batch of reviews and attach their ratings as labels.

    Args:
        examples: A batch mapping with a "reviews.text" entry (the texts to
            tokenize) and a "reviews.rating" entry (the target values).

    Returns:
        The tokenizer output for the texts, padded to the maximum length and
        truncated, with an extra "labels" entry holding the ratings unchanged.
    """
    encoded = tokenizer(
        examples["reviews.text"],
        truncation=True,
        padding="max_length",
    )
    encoded["labels"] = examples["reviews.rating"]
    return encoded


Tokenize the data for training

In [11]:
# Count the rows in the split that have no rating, then show the split's dimensions.
print(test_df['reviews.rating'].isnull().sum())
test_df.shape

20


(24262, 2)

In [12]:
# Drop every row that has a missing value in any column (rating or text).
test_df = test_df.dropna(axis=0, how='any')

In [13]:
# Show the dimensions again after the rows with missing values were dropped.
test_df.shape

(24242, 2)

In [14]:
# Cast ratings to integers so they can be used as class labels.
test_df = test_df.astype({'reviews.rating': int})
# Put the text column first, convert to a datasets.Dataset, and tokenize in batches.
t_df = test_df.loc[:, ["reviews.text", "reviews.rating"]]
dataset = Dataset.from_pandas(t_df)
train_df = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/24242 [00:00<?, ? examples/s]

In [15]:
# Same preparation as the training split: drop incomplete rows and cast ratings
# to integers, then convert to a datasets.Dataset and tokenize in batches.
# NOTE(review): `dev_df` is rebound from a DataFrame to the tokenized Dataset at
# the end, so this cell cannot be re-run without re-creating the split first.
dev_df = dev_df.dropna().astype({'reviews.rating': int})
d_df = dev_df.loc[:, ["reviews.text", "reviews.rating"]]
dataset = Dataset.from_pandas(d_df)
dev_df = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/10384 [00:00<?, ? examples/s]

Configure LoRA and wrap the model with it

In [16]:

# Load BERT with a 6-way classification head. The raw integer ratings are used
# directly as class indices (presumably 1-5, leaving index 0 unused - confirm).
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)
# Rank-1 LoRA adapter for sequence classification.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=1,
    lora_alpha=1,
    lora_dropout=0.1,
)
model = get_peft_model(model, lora_config)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


We want to track accuracy during evaluation, so define a function that computes the accuracy of the predictions

In [17]:
# Load the accuracy metric; compute_metrics uses it to score predictions.
metric = evaluate.load("accuracy")

In [18]:
def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the loaded accuracy metric.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class for each
            example is the argmax of the logits over the last axis.

    Returns:
        Whatever ``metric.compute`` returns for the predicted classes against
        the reference labels.
    """
    scores, references = eval_pred
    return metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=references,
    )

Make the arguments for training - more epochs will take more time to compute on home computers

In [19]:
# Train for 10 epochs, evaluating at the end of each one; Trainer output goes
# to the "test_trainer" directory.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=10,
    evaluation_strategy="epoch",
)



In [20]:
# Show the dimensions of the tokenized training dataset.
train_df.shape

(24242, 7)

In [21]:
# Show the dimensions of the tokenized dev dataset.
dev_df.shape

(10384, 7)

In [None]:
# Display the tokenized training dataset object.
train_df


In [None]:
# Display the column names and types of the tokenized training dataset.
train_df.features

In [22]:
# Build the Trainer for fine-tuning: the LoRA-wrapped model, the training
# arguments, the tokenized train/dev datasets, and the accuracy callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df,
    eval_dataset=dev_df,
    compute_metrics=compute_metrics,
)

Train the model and save the weights

In [None]:
# Run the fine-tuning loop configured above (10 epochs, evaluation each epoch).
trainer.train()

In [None]:
# Persist the fine-tuned model and the tokenizer to local directories.
# NOTE(review): `model` is the PEFT-wrapped model here, so this presumably writes
# the adapter weights rather than a full BERT checkpoint - confirm.
model.save_pretrained("amazon_reviews-10-epoch")
tokenizer.save_pretrained("amazon_reviews_tokenizer-10-epoch")

# Precision, recall, F1 score, and confusion matrix #

In [23]:
# Reload the saved model and tokenizer from their local directories for evaluation.
# NOTE(review): the model directory was written by save_pretrained on the
# PEFT-wrapped model, and the recorded output of this cell reports the classifier
# head as "newly initialized" from bert-base-uncased. Confirm the trained head and
# adapter are actually restored here; loading the adapter explicitly with peft's
# PeftModel.from_pretrained on top of the base model is the unambiguous route.
model = BertForSequenceClassification.from_pretrained("amazon_reviews-10-epoch", num_labels=6)
tokenizer = AutoTokenizer.from_pretrained("amazon_reviews_tokenizer-10-epoch")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Rebuild the Trainer around the reloaded model; it is used only for prediction
# below, with the same arguments, datasets, and accuracy callback as before.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df,
    eval_dataset=dev_df,
    compute_metrics=compute_metrics,
)

In [25]:
# Run the reloaded model over the dev set and pull out the raw logits and the
# reference labels from the prediction output.
eval_results = trainer.predict(test_dataset=dev_df)
logits, labels = eval_results.predictions, eval_results.label_ids

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [26]:
# Predicted class per example: index of the largest logit along the last axis.
predictions = np.argmax(logits, axis=-1)

In [27]:
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

# Support-weighted precision, recall, and F1 across all rating classes.
# zero_division=0 scores a class that is never predicted as 0 explicitly. The
# recorded confusion matrix has an all-zero predicted column, which otherwise
# triggers an undefined-metric warning while producing the same numbers.
precision, recall, f1, _ = precision_recall_fscore_support(
    labels,
    predictions,
    average='weighted',
    zero_division=0,
)

# Rows are true classes and columns are predicted classes.
conf_matrix = confusion_matrix(labels, predictions)

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print('Confusion Matrix:')
print(conf_matrix)

Precision: 0.6776
Recall: 0.7256
F1 Score: 0.6918
Confusion Matrix:
[[  16    0   34   55   18]
 [  11    0   35   57   11]
 [   7    0   61  264   95]
 [   7    0   51  799 1677]
 [   2    0   17  508 6659]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
