In [None]:
#Libraries

import datasets
from typing import Dict, Any, List
import numpy as np
datasets.logging.set_verbosity_error()
import pandas as pd
from transformers import AutoTokenizer
import torch
from transformers import AutoModelForSequenceClassification
from datasets import load_dataset 
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import Trainer
import evaluate 

# BERT tokenizer (WordPiece): splits text into sub-word tokens that exist in its vocabulary.
#   If it sees a word it doesn't know (e.g. "VRAM"), it breaks the word into known pieces such as "v", "##ra", "##m";
#   the double-hash prefix "##" marks a piece that continues the previous token.

# Hyperparameters are external configuration chosen before training (e.g. batch size, learning rate);
# parameters (e.g. the weights) are learned during training.


# --- Load pre-trained checkpoint and tokenizer ---
# Hub identifier of the checkpoint; the tokenizer below is loaded from it.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# AutoTokenizer selects the tokenizer class that matches the checkpoint and loads its vocabulary.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



# --- Load dataset ---
# split="train" returns only the training split as a single dataset (not a dict of splits).
# To load a portion directly, the split string accepts slices such as "train[:80%]".
raw_datasets = load_dataset("hugginglearners/amazon-reviews-sentiment-analysis", split = "train")


#--- Data Cleaning
# Read the text column once so the three checks below share a single pass over the data
# instead of each pulling the whole column out of the dataset again.
review_texts = raw_datasets["reviewText"]

# Number of missing reviews.
sum_nulls = sum(r is None for r in review_texts)
#print(sum_nulls)

# Number of entries that are not strings (None counts too, since None is not a str).
sum_notstrings = sum(not isinstance(r, str) for r in review_texts)
#print(sum_notstrings)

# Row positions of unusable reviews. A separate `r is None` test is unnecessary:
# `not isinstance(r, str)` is already True for None.
bad_indices = [i for i, r in enumerate(review_texts) if not isinstance(r, str)]
#print("First few bad rows:", bad_indices[:10])

# Print a few bad examples to understand what they look like.
for i in bad_indices[:3]:
    print(i, raw_datasets[i])


#print(raw_datasets.features)

#--- Tokenization example ---
# Tokenization turns each text into a list of integer token ids (input_ids).
# padding=True pads every sequence in this call to the longest sequence in the call.
# truncation=True cuts off anything beyond the maximum length.
sample_texts = raw_datasets["reviewText"][:10]  # first 10 reviews
variable = tokenizer(sample_texts, truncation=True, padding=True)

#print(variable["input_ids"])      -- input ids are the integer ids of the tokens, e.g. "I am cool" -> [101, 23, 34]
#print(variable["attention_mask"]) -- marks which positions the model should attend to: 1 = real token, 0 = padding, e.g. [1,1,1,0,0,0]



# --- Tokenize whole dataset ---
#to efficiently preprocess data, we use .map()
def tokenize_function(batch: Dict[str, List[Any]]):
    """Tokenize the ``reviewText`` column of one batch (for ``Dataset.map(batched=True)``).

    Args:
        batch: Mapping of column name to the list of values for the rows in
            this batch. Only ``batch["reviewText"]`` is read.

    Returns:
        The tokenizer output for the batch: truncated, but not padded.
    """
    # None becomes "" and any other non-string value is stringified, so the
    # tokenizer only ever receives str inputs.
    texts = [str(t) if t is not None else "" for t in batch["reviewText"]]
    # Deliberately no padding here. padding="max_length" would store every row
    # padded to the model maximum and make per-batch (dynamic) padding pointless;
    # padding is left to the batch-time collator (DataCollatorWithPadding).
    return tokenizer(texts, truncation=True)

# batched=True hands tokenize_function whole batches of rows at once, which is faster
# than tokenizing row by row.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
)
#print(tokenized_datasets)

# --- Dynamic Padding ---
# The collator pads per batch, not dataset-wide: a batch with lengths [46, 34, 56]
# is padded to 56, while a batch with lengths [23, 45, 67] is padded to 67.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


# Example
# Tokenize 8 reviews without padding or truncation to show that their lengths differ.
samples = raw_datasets['reviewText'][:8]
encoded = tokenizer(samples, padding=False, truncation=False)
lengths = list(map(len, encoded["input_ids"]))  # token count per review
print("Token lengths before padding:", lengths)

# Run the same 8 encodings through the collator: every tensor in the resulting batch
# is padded to the longest sequence among these 8 rows.
features = [
    dict(input_ids=ids, attention_mask=[1] * len(ids))
    for ids in encoded["input_ids"]
]
batch = data_collator(features)
print("Batch tensor shapes after padding:", {name: tensor.shape for name, tensor in batch.items()})



# --- Training configuration ---
# TrainingArguments collects the hyperparameters and bookkeeping options used by the Trainer.
# NOTE(review): newer transformers releases name `evaluation_strategy` as `eval_strategy` —
# confirm against the installed version.
training_args = TrainingArguments(
    output_dir="test-trainer",          # where checkpoints are written
    num_train_epochs=5,                 # train for 5 epochs
    learning_rate=2e-5,                 # common starting point for BERT-style fine-tuning
    per_device_train_batch_size=8,      # training batch size per GPU/CPU
    per_device_eval_batch_size=8,       # evaluation batch size per GPU/CPU
    logging_steps=100,                  # log every 100 steps
    fp16=torch.cuda.is_available(),     # mixed precision whenever a CUDA device is available
    evaluation_strategy="steps",        # evaluate on a step schedule (see eval_steps)
    eval_steps=500,                     # evaluate every 500 steps
    save_strategy="steps",              # checkpoint on a step schedule, matching evaluation
    save_steps=500,                     # save a checkpoint every 500 steps
    logging_dir="./logs",               # folder for TensorBoard logs
    load_best_model_at_end=True         # reload the best checkpoint when training finishes
)



# --- Load Model ---
# Load the pretrained DistilBERT sequence-classification checkpoint.
# num_labels=2 sizes the classification head for two classes (positive / negative).
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

#--- Split data into train / test
# Hold out 20% of the rows for evaluation. The fixed seed makes the shuffle, and therefore
# the held-out rows, identical on every Restart & Run All so metrics are comparable across runs.
train = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
#print(train)

#logits return the raw outputs from the model before softmax
#labels return the target labels (1,0)
# Metric objects are loaded lazily and kept here so that evaluate.load runs once per
# metric instead of on every evaluation pass.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the cached metric called ``name``, loading it on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_preds):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_preds: A ``(logits, labels)`` pair. ``logits`` holds the raw model
            scores (before softmax) and ``labels`` the target class ids.

    Returns:
        A dict with keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_preds
    # The predicted class is the index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)

    return {
        "accuracy": _get_metric("accuracy").compute(predictions=predictions, references=labels)["accuracy"],
        "f1": _get_metric("f1").compute(predictions=predictions, references=labels)["f1"],
    }

# --- Define a trainer class ---
# Trainer wraps model, data, and config, similar to an sklearn pipeline.
# NOTE(review): Trainer needs a label column in the datasets to compute a loss —
# confirm the dataset's target column is named `label`/`labels`.
trainer = Trainer(
    model,
    args=training_args,
    train_dataset=train['train'],      # 80% portion produced by train_test_split
    eval_dataset=train['test'],        # 20% held-out portion
    data_collator=data_collator,       # pass the dynamic-padding collator built earlier explicitly
    tokenizer=tokenizer,               # tells the trainer which tokenizer to use (and save with checkpoints)
    compute_metrics=compute_metrics,
)

trainer.train()  # uses a GPU automatically when one is available

# --- Evaluate Model ---
# tokenized_datasets is a single Dataset (it was loaded with split="train"), so
# tokenized_datasets["train"] would be a column lookup and fail. Predict on the held-out
# portion of the split instead, so the model is also not scored on rows it was trained on.
predictions = trainer.predict(train["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)







In [None]:
# Read the text column once so all three checks share a single pass over the data.
review_texts = raw_datasets["reviewText"]

# Number of missing reviews in raw_datasets.
sum_nulls = sum(r is None for r in review_texts)
print(sum_nulls)

# Number of entries that are not strings (None counts too, since None is not a str).
sum_notstrings = sum(not isinstance(r, str) for r in review_texts)
print(sum_notstrings)

# Row positions of unusable reviews; `not isinstance(r, str)` already covers None.
bad_indices = [i for i, r in enumerate(review_texts) if not isinstance(r, str)]
print("First few bad rows:", bad_indices[:10])

# Show the first few bad rows in full.
for i in bad_indices[:3]:
    print(i, raw_datasets[i])


    #convert IDs back to tokens 
# Map each example's input ids back to their token strings and print the tokens.
# (Previously the ids `v` were printed and the converted tokens were discarded.)
for k, v in enumerate(variable['input_ids']):
    tokens = tokenizer.convert_ids_to_tokens(v)
    print(f"{k}: {tokens}\n")