In [None]:
pip install accelerate -U

In [None]:
pip install transformers[torch]

In [None]:
import torch, os
import pandas as pd
from transformers import pipeline, BertForSequenceClassification, BertTokenizerFast
from torch.utils.data import Dataset

In [None]:
from torch import cuda

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [None]:
import pandas as pd
# Location of the merged emotions + depression CSV read into `reduced_df`.
csv_path = "/kaggle/input/xyz-data/merge_emotions_with_depression.csv"
reduced_df = pd.read_csv(csv_path)
reduced_df

In [None]:
df = pd.DataFrame(columns=reduced_df.columns)

In [None]:
categories_to_reduce = ['sadness', 'neutral', 'love', 'gratitude', 'disapproval', 'amusement', 'disappointment', 'admiration', 'realization', 'annoyance', 'confusion', 'optimism', 'curiosity', 'excitement', 'caring', 'disgust', 'remorse', 'joy', 'approval', 'embarrassment', 'surprise', 'anger', 'grief', 'pride', 'desire', 'relief', 'fear', 'nervousness', 'depressed', 'not_depressed']

In [None]:
rows_per_category = 5000

# For every label column keep at most `rows_per_category` rows flagged with 1.
# A row flagged in several columns is picked once per column (no de-duplication).
category_frames = [
    reduced_df[reduced_df[category] == 1].head(rows_per_category)
    for category in categories_to_reduce
]

# Concatenate once instead of growing `df` inside a loop: this avoids repeated
# copying and concatenating onto an empty frame. Because `df` is rebuilt from
# `reduced_df`, re-running this cell no longer appends duplicate rows.
df = pd.concat(category_frames, ignore_index=True)

In [None]:
def get_category(row):
    """Return the name of the first label column that equals 1 in ``row``.

    Columns are scanned in the order of the global ``df``, skipping its first
    column (expected to be ``'text'``). ``'unknown'`` is returned when no
    scanned column equals 1.
    """
    label_columns = df.columns[1:]
    return next((name for name in label_columns if row[name] == 1), 'unknown')

# Create the 'category' column by calling get_category once per row (axis=1);
# each row receives the first label column that is set to 1, or 'unknown'.
df['category'] = df.apply(get_category, axis=1)

In [None]:
# Keep only the text and its category. Take an explicit copy so that the
# `labels` column assigned to `sd` later is written to an independent frame
# and does not raise pandas' SettingWithCopyWarning.
sd = df[['text', 'category']].copy()
sd

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
# Integer-encode the categories in order of first appearance in `df`.
distinct_categories = df['category'].unique()
labels = distinct_categories.tolist()
label2id = dict(zip(labels, range(len(labels))))

# Look up each row's (whitespace-stripped) category name in the mapping.
df["labels"] = df["category"].map(lambda name: label2id[name.strip()])

In [None]:
# Print one label per line; the position is not needed, so iterate the list directly.
for label in labels:
    print(label)

In [None]:
NUM_LABELS = len(labels)

# Index -> label name, following the order of `labels`.
id2label = dict(enumerate(labels))

# Label name -> index, the inverse of `id2label`.
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Attach the integer class id for each row's (whitespace-stripped) category name.
sd["labels"] = sd["category"].map(lambda name: label2id[name.strip()])
sd.head()

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=NUM_LABELS, id2label=id2label, label2id=label2id)
model.to(device)

In [None]:
SIZE= sd.shape[0]
test_labels=  list(sd.labels[(3*SIZE)//4:])
train_texts = list(sd.text[:SIZE * 2 // 3])
val_texts = list(sd.text[SIZE * 2 // 3:SIZE * 3 // 4])
test_texts = list(sd.text[SIZE * 3 // 4:])

train_labels = list(sd.labels[:SIZE * 2 // 3])
val_labels = list(sd.labels[SIZE * 2 // 3:SIZE * 3 // 4])
test_labels = list(sd.labels[SIZE * 3 // 4:])

In [None]:
len(train_texts), len(val_texts), len(test_texts)

In [None]:
# Tokenize each split with identical settings (truncate over-long inputs and
# pad within the split), in train / validation / test order.
train_encodings, val_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts, test_texts)
)

In [None]:
class DataLoader(Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    Despite its name this class subclasses ``torch.utils.data.Dataset``: it
    serves one example per index and performs no batching itself.
    """

    def __init__(self, encodings, labels):
        # Mapping of field name -> per-example values, plus one label per example.
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoded field of example `idx` to a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The target is stored under the 'labels' key.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


In [None]:
# Wrap each split's encodings and labels in the dataset class defined above
# (these objects are datasets, even though two of the names say "dataloader").
train_dataloader, val_dataloader, test_dataset = (
    DataLoader(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores, one row per example).

    Returns:
        dict with the keys ``'Accuracy'``, ``'F1'``, ``'Precision'`` and
        ``'Recall'``.
    """
    # True class ids supplied by the caller.
    labels = pred.label_ids

    # Predicted class = column with the highest score in each row.
    preds = pred.predictions.argmax(axis=1)

    # Macro average: every class contributes equally regardless of its frequency.
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')

    # Plain fraction of correct predictions. The previous version multiplied
    # this by 2, which reported accuracy on a 0-2 scale.
    acc = accuracy_score(labels, preds)

    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }


In [None]:
import wandb

In [None]:
wandb.init()

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    do_train=True,
    do_eval=True,
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.1, 
    logging_strategy='steps',  # Change to logging by steps
    logging_steps=100,  # Log every 100 steps
    logging_dir='./multi-class-logs',
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Change to saving checkpoints by epoch
    load_best_model_at_end=True
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataloader,
    eval_dataset=val_dataloader,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast

In [None]:
def predict(text):
    """Classify one piece of text with the module-level ``model`` and ``tokenizer``.

    Args:
        text: The text to classify. The predicted index is taken over the
            flattened probability tensor, so pass one text at a time.

    Returns:
        A ``(probs, pred_label_idx, pred_label)`` tuple: the softmax of the
        model's first output along dimension 1, a tensor holding the index of
        the largest probability, and the label name looked up in
        ``model.config.id2label``.
    """
    # Inference only: put the model in evaluation mode.
    model.eval()

    # Tokenize and move the tensors to whichever device the model lives on,
    # rather than hard-coding "cuda" (which fails on CPU-only machines and when
    # the model has been reloaded on the CPU).
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(model.device)

    # No gradients are needed for prediction.
    with torch.no_grad():
        outputs = model(**inputs)

    # The first output holds the logits; softmax turns them into probabilities.
    probs = outputs[0].softmax(1)

    # argmax() without a dimension returns the index of the maximum value in
    # the flattened tensor, which for a single input is the class index.
    pred_label_idx = probs.argmax()

    # .item() extracts the index as a Python scalar for the id2label lookup.
    pred_label = model.config.id2label[pred_label_idx.item()]

    return probs, pred_label_idx, pred_label


In [None]:
# Test with an example text: the cell displays the tuple returned by predict()
# (probabilities, predicted index, predicted label name).
text = "ooooooooooooh my headddd uncle johnny i never should have agreed to work the town election for you when i got that drunk last night ow."
predict(text)


In [None]:
# Save the trained model and the tokenizer into the same directory so both
# can be loaded again from `model_path`.
model_path = "/kaggle/working/text-classification-model"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
# Directory the model and tokenizer were saved to.
model_path = "/kaggle/working/text-classification-model"


# Reload from disk. Note that this rebinds the global `model` and `tokenizer`
# names, which predict() also reads.
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer= BertTokenizerFast.from_pretrained(model_path)
# Wrap the reloaded model and tokenizer in a pipeline for direct text input.
nlp= pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

In [None]:
# Hand-picked sentences used to smoke-test the reloaded pipeline. Keeping them
# in one list replaces ten copy-pasted cells; add new probes here.
sample_texts = [
    "a long flight is made even longer by a seat that won t recline off the red eye and grumpy",
    'hello',
    'i am want to die',
    "Best side quest ever!",
    "Oh whoops, I misread the original comment",
    'Dear OP: Proper punctuation please/thank you.',
    'Weird how the cops can pick and choose which laws to enforce and when, but I guess that explains a lot.',
    'Grammar jokes be my favourite',
    'What a rollercoaster',
    "Copyrights on the individual track designs? Maybe but I'd like to see them",
]

# The pipeline accepts a list of texts and returns one prediction per input,
# in order; pair each sentence with its prediction for display.
list(zip(sample_texts, nlp(sample_texts)))