# 1. Setup Data Path & Imports
Here we point to our CSV, then import all the libraries we’ll need:
- `datasets` for easy HuggingFace Dataset handling  
- `transformers` for model/tokenizer/training APIs  
- `sklearn` metrics for evaluation  
- standard tools (NumPy, pandas, pathlib)

In [None]:
!pip install -q --upgrade transformers datasets evaluate scikit-learn pandas torch


In [1]:
# data path
data_path = '/kaggle/input/sentim/processed_sentiment_data.csv'

# Imports
import inspect
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import classification_report, accuracy_score
from datasets import Dataset, ClassLabel
from transformers import (
    GPT2TokenizerFast, GPT2ForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding,
    set_seed, pipeline,
)


2025-04-17 03:53:49.195444: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744862029.401296      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744862029.460879      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# 2. Hyper‑parameters & Random Seed
Specify model name, output directory, training epochs, batch size, learning rate,
random seed for reproducibility, and maximum sequence length.

In [2]:
model_name   = "gpt2"
output_dir   = "./sentiment-gpt2"
epochs       = 4
batch_size   = 8
lr           = 2e-5
seed         = 42
max_len      = 128

# set global seed
set_seed(seed)


# 3. Load & Prepare Dataset
- Read CSV and ensure it has `text` & `sentiment` columns  
- Map sentiment strings → integer labels  
- Wrap into a HuggingFace `Dataset` and split into train/test  
- Cast labels to `ClassLabel` for proper handling


In [3]:
def load_dataset(csv_path: Path):
    df = pd.read_csv(csv_path)
    if {"text", "sentiment"} - set(df.columns):
        raise ValueError("CSV needs ‘text’ and ‘sentiment’ cols")
    df["text"] = df["text"].fillna("")
    labels   = sorted(df["sentiment"].unique())
    label2id = {lbl: i for i, lbl in enumerate(labels)}
    id2label = {i: lbl for lbl, i in label2id.items()}
    df["label"] = df["sentiment"].map(label2id)
    return Dataset.from_pandas(df[["text", "label"]]), label2id, id2label

# load
dataset, label2id, id2label = load_dataset(Path(data_path))

# create ClassLabel and cast
class_label = ClassLabel(num_classes=len(label2id), names=list(label2id.keys()))
dataset     = dataset.cast_column("label", class_label)

# train/test split
train_ds, test_ds = dataset.train_test_split(
    test_size=0.2, stratify_by_column="label"
).values()


Casting the dataset:   0%|          | 0/823 [00:00<?, ? examples/s]

# 4. Tokenizer & Model
- Load GPT‐2 tokenizer, set padding token to EOS  
- Load GPT‑2 classification head with correct `num_labels`  
- Resize embeddings and set `pad_token_id`


In [4]:
# tokenizer
tok = GPT2TokenizerFast.from_pretrained(model_name)
tok.pad_token = tok.eos_token

# model
model = GPT2ForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)
model.resize_token_embeddings(len(tok))
model.config.pad_token_id = tok.pad_token_id


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 5. Tokenization Helper
Define a function to tokenize the `text` field, truncating/padding to `max_len`.


In [5]:
def tokenize(ds):
    return ds.map(
        lambda x: tok(
            x["text"], 
            truncation=True, 
            padding="max_length", 
            max_length=max_len
        ),
        batched=True,
        remove_columns=["text"]
    )

train_tok = tokenize(train_ds)
test_tok  = tokenize(test_ds)


Map:   0%|          | 0/658 [00:00<?, ? examples/s]

Map:   0%|          | 0/165 [00:00<?, ? examples/s]

# 6. Trainer Setup
- Build `TrainingArguments`
- Define simple accuracy metric  
- Instantiate `Trainer`


In [6]:
# base args
ta_kwargs = dict(
    output_dir=output_dir,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=lr,
    num_train_epochs=epochs,
    logging_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
    seed=seed,
)

# check for eval arg name
sig = inspect.signature(TrainingArguments.__init__)
if "evaluation_strategy" in sig.parameters:
    ta_kwargs["evaluation_strategy"] = "epoch"
else:
    ta_kwargs["eval_strategy"] = "epoch"

training_args = TrainingArguments(**ta_kwargs)

# metrics
def metrics(pred):
    logits, labels = pred
    return {"accuracy": accuracy_score(labels, np.argmax(logits, -1))}

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
    tokenizer=tok,
    data_collator=DataCollatorWithPadding(tok),
    compute_metrics=metrics,
)


  trainer = Trainer(


# 7. Train & Evaluate
Fit the model, then print overall eval metrics and a detailed classification report.


In [7]:
# train
trainer.train()

# evaluation
print("\n*** Evaluation Metrics ***\n", trainer.evaluate())

# detailed report
preds = np.argmax(trainer.predict(test_tok).predictions, -1)
print("\nClassification Report:\n",
      classification_report(test_tok["label"], preds,
                            target_names=class_label.names))




Epoch,Training Loss,Validation Loss,Accuracy
1,0.9657,0.502568,0.878788
2,0.532,0.489597,0.878788
3,0.4493,0.501029,0.884848
4,0.4091,0.523528,0.878788





*** Evaluation Metrics ***
 {'eval_loss': 0.5010291934013367, 'eval_accuracy': 0.8848484848484849, 'eval_runtime': 1.3151, 'eval_samples_per_second': 125.468, 'eval_steps_per_second': 8.365, 'epoch': 4.0}





Classification Report:
               precision    recall  f1-score   support

     LABEL_0       0.88      1.00      0.94       139
     LABEL_1       1.00      0.47      0.64        15
     LABEL_2       0.00      0.00      0.00        11

    accuracy                           0.88       165
   macro avg       0.63      0.49      0.52       165
weighted avg       0.83      0.88      0.85       165



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 8. Save model & Demo
- Save model + tokenizer to `output_dir`  
- Run a quick pipeline demo on two sample sentences


In [8]:
# save
trainer.save_model(output_dir)
tok.save_pretrained(output_dir)
print(f"\nModel & tokenizer saved to {output_dir}")

# inference demo
clf = pipeline("sentiment-analysis", model=output_dir, tokenizer=output_dir, top_k=1)
for txt in [
    "I absolutely loved this product!",
    "This is the worst experience I’ve ever had."
]:
    print(f"» {txt!r} → {clf(txt)}")



Model & tokenizer saved to ./sentiment-gpt2


Device set to use cuda:0


» 'I absolutely loved this product!' → [[{'label': 'LABEL_0', 'score': 0.9773532152175903}]]
» 'This is the worst experience I’ve ever had.' → [[{'label': 'LABEL_0', 'score': 0.9559566974639893}]]
