In [None]:
import torch

print("CUDA Available:", torch.cuda.is_available())
print("cuDNN Enabled:", torch.backends.cudnn.enabled)
print("cuDNN Version:", torch.backends.cudnn.version())

In [8]:
import pandas as pd
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import datasets
from datasets import Dataset
import torch
from sklearn.model_selection import train_test_split
from peft import LoraConfig, get_peft_model, TaskType
import numpy as np
import evaluate

In [12]:
device = torch.device("cuda:0")

## Read the training data ##

In [13]:
df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1', names=["sentiment", "tweet_id", "date", "query", "user", "tweet"])

Seperate the data into the training, test and dev st

In [14]:
train_df, temp_df = train_test_split(df, test_size=0.875, random_state=42)

In [15]:
train_df, temp_df = train_test_split(train_df, test_size=0.3, random_state=42)

In [16]:
dev_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

Use the bert model for classification

In [17]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)



Tokenize function to properly tokenize the dataset so that the model can understand

In [18]:
def tokenize_function(examples):
    tokenized_data = tokenizer(examples["tweet"], padding="max_length", truncation=True)
    tokenized_data["labels"] = examples["sentiment"]
    return tokenized_data


Tokenize the data for training

In [19]:
t_df = test_df[['tweet', 'sentiment']]
dataset = Dataset.from_pandas(t_df)
train_df = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [20]:
d_df = dev_df[['tweet', 'sentiment']]
dataset = Dataset.from_pandas(d_df)
dev_df = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Configure LoRa and hand it to the model

In [38]:

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, r=1, lora_alpha=1, lora_dropout=0.1
)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=5)
model = get_peft_model(model, lora_config)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


We want to update based on accuracy, so make a function to compute the accuracy of predictions

In [39]:
metric = evaluate.load("accuracy")

In [40]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Make the arguments for training - more epochs will take more time to compute on home computers

In [41]:
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch",
                                 num_train_epochs=10,)



In [44]:
model = BertForSequenceClassification.from_pretrained('5-epoch-model', num_labels=5)
tokenizer = AutoTokenizer.from_pretrained('5-epoch-model-tokenizer')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df,
    eval_dataset=dev_df,
    compute_metrics=compute_metrics,
)

Train the model and save the weights

In [None]:
trainer.train()

In [None]:
model.save_pretrained(".")
tokenizer.save_pretrained(".")

In [None]:
model.save_pretrained("5-epoch-model")
tokenizer.save_pretrained("5-epoch-model-tokenizer")

# precision and recall and f1 scores and confusion matrix #

In [46]:
eval_results = trainer.predict(test_dataset=dev_df)
logits = eval_results.predictions
labels = eval_results.label_ids

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [47]:
predictions = np.argmax(logits, axis=-1)

In [48]:
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')

conf_matrix = confusion_matrix(labels, predictions)

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print('Confusion Matrix:')
print(conf_matrix)

Precision: 0.7981
Recall: 0.7981
F1 Score: 0.7981
Confusion Matrix:
[[11854  3050]
 [ 3008 12088]]
