In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

import nbformat as nbf
from pathlib import Path
from copy import deepcopy

# ✅ Update path for your new notebook
in_path = Path("/content/drive/MyDrive/Colab Notebooks/FineTuning LLM Emotion Detection From Text.ipynb")
out_path = in_path.with_name(in_path.stem + "_clean.ipynb")

# Load the source notebook, normalised to nbformat version 4.
with in_path.open(encoding="utf-8") as f:
    nb = nbf.read(f, as_version=4)

# Step 1: discard the notebook-level "widgets" metadata entry when present.
nb.metadata.pop("widgets", None)


def _has_widget_view(output):
    """Return True when an output's data bundle carries the widget-view MIME type."""
    bundle = output.get("data", {})
    return isinstance(bundle, dict) and "application/vnd.jupyter.widget-view+json" in bundle


# Step 2: remove widget-view outputs (they break GitHub rendering) and the
# per-cell "execution" metadata; all other outputs are kept in order.
for cell in nb.cells:
    existing_outputs = cell.get("outputs")
    if existing_outputs:
        cell["outputs"] = [out for out in existing_outputs if not _has_widget_view(out)]
    cell.get("metadata", {}).pop("execution", None)  # optional cleanup

# Write the cleaned copy next to the original.
nbf.write(nb, out_path)
print("✅ Cleaned notebook saved to:", out_path)


# Imports

In [None]:
!pip install transformers datasets sklearn

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Uploading Data

In [None]:
# All three split files are read the same way: no header row,
# ';' as the delimiter, and columns named text / label.
read_options = dict(sep=';', header=None, names=['text', 'label'])

train_df = pd.read_csv("train.txt", **read_options)
test_df = pd.read_csv("test.txt", **read_options)
val_df = pd.read_csv("val.txt", **read_options)


Collect the unique label names into a list and assign each one an integer id (0, 1, 2, 3, …).

In [None]:
# Distinct label names found in the training split.
labels = train_df['label'].unique().tolist()

# Forward map (name -> id) and reverse map (id -> name); ids are list positions.
label2id = dict(zip(labels, range(len(labels))))
id2label = dict(enumerate(labels))


Apply those ids to the label column of the training, validation, and test sets.

In [None]:
# Encode the string labels of every split as integer ids.
# The dtype guard keeps this cell safe to re-run: mapping an already-encoded
# column through label2id a second time would turn every value into NaN.
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    if pd.api.types.is_integer_dtype(split_df["label"]):
        continue  # already encoded by an earlier run of this cell
    encoded = split_df["label"].map(label2id)
    if encoded.isna().any():
        # label2id is built from the training split only, so another split
        # may contain a label it does not know; fail loudly instead of
        # silently producing NaN labels.
        unknown = sorted(split_df.loc[encoded.isna(), "label"].astype(str).unique())
        raise ValueError(f"{split_name} split has labels missing from label2id: {unknown}")
    split_df["label"] = encoded.astype("int64")

In [None]:
# Show the training frame as this cell's output.
train_df

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [None]:
# Print the label-name -> integer-id mapping.
print(label2id)

{'sadness': 0, 'anger': 1, 'love': 2, 'surprise': 3, 'fear': 4, 'joy': 5}


In [None]:
# Wrap each pandas split in a Hugging Face Dataset (train, val, test order).
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

In [None]:
# Preview the first rows; leaving the frame as the last expression lets the
# notebook render it as a table instead of printing plain text.
train_df.head()

                                                text  label
0                            i didnt feel humiliated      0
1  i can go from feeling so hopeless to so damned...      0
2   im grabbing a minute to post i feel greedy wrong      1
3  i am ever feeling nostalgic about the fireplac...      2
4                               i am feeling grouchy      1


# Tokenization

🧠 What is Tokenization?
LLMs like BERT don’t understand plain text like:


"I feel happy today"


They need it converted into numbers (tokens) that represent words or subwords. This process is called tokenization.

We use a special tool for that: a tokenizer — and Hugging Face provides one ready for BERT.



In [None]:
# Identifier of the pretrained checkpoint; reused below when loading the model.
model_name = "bert-base-uncased"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Load Pretrained BERT Model

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
# Load the pretrained checkpoint for sequence classification with one output
# per label, and hand it both label maps so ids and names stay linked.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    label2id=label2id,
    id2label=id2label,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**🔍 What’s Happening Here?**



🔹 AutoModelForSequenceClassification:
This loads a BERT model with a final layer designed for classification tasks (like emotion detection).

It returns one raw score (logit) per emotion (joy, anger, etc.); applying softmax to these scores turns them into probabilities.

🔹 num_labels=len(labels):
If you have 6 emotions (joy, sadness, fear, etc.), it tells the model: “Output 6 scores, one per emotion.”



# Define Evaluation Metric (Accuracy)

In [None]:
def compute_matrix(p):
    """Compute accuracy for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the reference class ids).

    Returns:
        dict: ``{"accuracy": <score>}`` where the score comes from
        ``accuracy_score`` applied to the reference ids and the arg-max
        class of each row.
    """
    scores, reference_ids = p.predictions, p.label_ids
    # The predicted class is the column with the highest score in each row.
    predicted_ids = np.argmax(scores, axis=1)
    accuracy = accuracy_score(reference_ids, predicted_ids)
    return {"accuracy": accuracy}

#  Set Up Training Arguments

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # --- Optimisation ---
    learning_rate=2e-5,                 # step size (2e-5 = 0.00002)
    weight_decay=0.01,                  # regularisation strength
    num_train_epochs=3,                 # full passes over the training set
    per_device_train_batch_size=16,     # examples per training batch
    per_device_eval_batch_size=16,      # examples per evaluation batch

    # --- Evaluation and checkpointing (both once per epoch) ---
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,        # reload the best checkpoint when training ends
    metric_for_best_model="accuracy",   # key returned by compute_matrix

    # --- Output locations and logging ---
    output_dir="./results",             # model and checkpoints
    logging_dir="./logs",               # training logs
    logging_steps=10,                   # log every 10 steps
)

#  Train Your BERT Model!

In [None]:
from transformers import Trainer

def tokenize_function(examples):
    """Tokenize a batch of examples for the model.

    Args:
        examples: Mapping with a ``"text"`` entry holding the batch's strings
            (the form ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        The tokenizer's output for the batch, truncated to the model's
        maximum length and left unpadded.
    """
    # No padding here: padding="max_length" would stretch every short
    # sentence to the model's maximum length, making each batch far more
    # expensive than needed. The Trainer in this notebook is given the
    # tokenizer, so batches are padded to their own longest sequence at
    # collation time instead.
    return tokenizer(examples["text"], truncation=True)

# Tokenize every split in batches (train, val, test order).
tokenized_train_dataset, tokenized_val_dataset, tokenized_test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)


# Assemble the Trainer: the model and its run configuration first,
# then the data it trains and validates on, then the metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,    # evaluated per training_args.eval_strategy
    compute_metrics=compute_matrix,        # supplies the "accuracy" metric
)

# Start fine-tuning.
trainer.train()

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mreiki2607[0m ([33mreiki2607-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

# Evaluate Your Model on Test Data

In [None]:
# Evaluate on the *tokenized* test split: the raw `test_dataset` only has
# text/label columns, so it carries none of the tokenizer outputs the model needs.
results = trainer.evaluate(tokenized_test_dataset)
print("Test Accuracy:", results["eval_accuracy"])


# See a Detailed Classification Report

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Predict on the *tokenized* test split; the raw `test_dataset` has no
# tokenizer outputs for the model to consume.
predictions = trainer.predict(tokenized_test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

# `labels` (the list of label names) is indexed by class id, so pin the
# report to ids 0..len(labels)-1. This keeps target_names aligned even if
# some class never appears in the test split or in the predictions.
print(
    classification_report(
        predictions.label_ids,
        pred_labels,
        labels=list(range(len(labels))),
        target_names=labels,
    )
)
