<h1 style='text-align: center;'> Part B - Tweets Fine-tuning, model training and compression with comparison </h1>
<h3 style='text-align: center;'> Group T, IDs: 316398387 ,318481447</h3>

Based on the preprocessing done in the previous part, we will tackle the task with two models: an **encoder-only** model and a **decoder-only** model.
<h6 style='text-align: left;'>Imports (similar to the previous part):</h6>




In [41]:
%%capture 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig, AutoTokenizer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

import seaborn as sns
from decouple import Config, RepositoryEnv 
import emoji
from langdetect import detect, DetectorFactory
from ftfy import fix_text
import re
import nltk
from nltk.corpus import stopwords
import wandb
from wordcloud import WordCloud
from collections import Counter
import re
import warnings
# Notebook-wide setup: silence warnings, show full-width text columns, log in to W&B.
warnings.filterwarnings(action='ignore')
pd.options.display.max_colwidth = None
# The W&B API key is read from the local .env file; comment out the login below if W&B is not needed.
_env_config = Config(RepositoryEnv("./.env"))
wandb.login(key=_env_config.get('wandb_api_key'));

In [20]:
# Load the tweets written out by the preprocessing part.
df = pd.read_csv("preprocessed_tweets.csv")
df.head();

In [22]:
# Sentiment class names; a class's id is its position in this list, so order matters.
CLASS_TEXTS = [
    'extremely negative',
    'negative',
    'neutral',
    'positive',
    'extremely positive',
]
# id -> name, and the inverse name -> id lookup.
ID2LABEL = dict(enumerate(CLASS_TEXTS))
LABEL2ID = {name: idx for idx, name in ID2LABEL.items()}

# Hold out 30% of the data, then split that hold-out 60/40 into validation and test.
# Both splits are stratified on the sentiment label and use a fixed seed.
train_df, tmp = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df['Sentiment'],
)
val_df, test_df = train_test_split(
    tmp,
    test_size=0.4,
    random_state=42,
    stratify=tmp['Sentiment'],
)
tuple(part.shape for part in (train_df, val_df, test_df))

((28609, 2), (7356, 2), (4905, 2))

### Decoder-only: TinyLlama model

We start with the decoder-only model, whose samples must be formatted as instruction-style prompts.
We train the decoder generatively on examples of the form
"Tweet: {text}\nSentiment: {label_text}", masking the prompt tokens so that the loss is computed only on the label tokens.

In [25]:
from transformers import AutoTokenizer

# Checkpoint used for the decoder-only experiments (swap to Mistral/Llama-3 if you have VRAM).
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
)
# Pad on the right so the prompt/label tokens stay at the start of every row.
tokenizer.padding_side = "right"
# Reuse the EOS token for padding when the checkpoint defines no pad token.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token

def build_example(text, label_text, max_in=160, max_label=4):
    """Tokenise one tweet/label pair into a causal-LM training example.

    The prompt ``"Tweet: {text}\\nSentiment:"`` and the target
    ``" {label_text}"`` are tokenised separately with the module-level
    ``tokenizer`` (no special tokens added), each truncated to its own token
    budget, and then concatenated.

    Args:
        text: Tweet text placed in the prompt.
        label_text: Sentiment label placed after ``"Sentiment:"``.
        max_in: Maximum number of prompt tokens kept.
        max_label: Maximum number of label tokens kept.

    Returns:
        A dict with ``input_ids`` (prompt ids followed by label ids),
        ``attention_mask`` (all ones, same length) and ``labels`` (``-100``
        for every prompt position, then the label ids), so that only the
        label tokens carry a training target.
    """
    prompt_ids = tokenizer(
        f"Tweet: {text}\nSentiment:",
        add_special_tokens=False,
        truncation=True,
        max_length=max_in,
    )['input_ids']
    target_ids = tokenizer(
        f" {label_text}",
        add_special_tokens=False,
        truncation=True,
        max_length=max_label,
    )['input_ids']

    sequence = prompt_ids + target_ids
    return {
        'input_ids': sequence,
        'attention_mask': [1] * len(sequence),
        # Mask the prompt part; keep the label token ids as targets.
        'labels': [-100] * len(prompt_ids) + target_ids,
    }

import datasets

def df_to_hf(df_):
    """Convert a tweets DataFrame into a Hugging Face ``Dataset``.

    Only the ``clean_text`` and ``Sentiment`` columns are kept.

    Args:
        df_: DataFrame containing at least ``clean_text`` and ``Sentiment``.

    Returns:
        A ``datasets.Dataset`` with exactly those two columns.
    """
    # The splits produced by train_test_split carry a shuffled, non-range index.
    # Without preserve_index=False that index is stored as an extra
    # `__index_level_0__` column, which then survives the later
    # `remove_columns=['clean_text', 'Sentiment']` call.
    return datasets.Dataset.from_pandas(
        df_[['clean_text', 'Sentiment']],
        preserve_index=False,
    )

# Tokenise every split with build_example and drop the raw text/label columns,
# leaving only the model inputs produced by build_example.
hf_train, hf_val, hf_test = (
    df_to_hf(split_df).map(
        lambda ex: build_example(ex['clean_text'], ex['Sentiment']),
        remove_columns=['clean_text', 'Sentiment'],
    )
    for split_df in (train_df, val_df, test_df)
)

def data_collator(batch, pad_token_id=None, label_pad_id=-100):
    """Right-pad a list of tokenised examples into a batch of tensors.

    ``tokenizer.pad`` pads ``input_ids`` and ``attention_mask`` but leaves
    ``labels`` untouched, so examples of different lengths cannot be stacked
    into a tensor. This collator pads all three keys itself.

    Args:
        batch: Non-empty list of dicts holding equal-length lists under
            ``input_ids`` and, when present, ``attention_mask`` and ``labels``.
            Other keys are ignored.
        pad_token_id: Id used to fill ``input_ids``. Defaults to the
            module-level ``tokenizer``'s pad token id.
        label_pad_id: Value used to fill ``labels``; ``-100`` matches the
            prompt masking value used when the examples are built.

    Returns:
        Dict mapping each padded key to a ``torch.long`` tensor of shape
        ``(len(batch), longest_example)``.

    Raises:
        ValueError: If ``batch`` is empty.
    """
    if not batch:
        raise ValueError("data_collator received an empty batch")
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id

    width = max(len(example['input_ids']) for example in batch)
    fill_values = {
        'input_ids': pad_token_id,
        'attention_mask': 0,
        'labels': label_pad_id,
    }
    padded = {}
    for key, fill in fill_values.items():
        if key not in batch[0]:
            continue  # e.g. label-free batches at inference time
        rows = [
            list(example[key]) + [fill] * (width - len(example[key]))
            for example in batch
        ]
        padded[key] = torch.tensor(rows, dtype=torch.long)
    return padded


Map:   0%|          | 0/28609 [00:00<?, ? examples/s]

Map:   0%|          | 0/7356 [00:00<?, ? examples/s]

Map:   0%|          | 0/4905 [00:00<?, ? examples/s]

In [32]:
@torch.no_grad()  # decorator form of `with torch.no_grad():` — no autograd graph is built while scoring
def score_labels_batch(model, tokenizer, texts, candidate_texts=CLASS_TEXTS, max_in=160, device="cuda"):
    """Classify tweets by scoring each candidate label under the language model.

    For every candidate label the sequence ``prompt + " " + label`` is run
    through ``model`` and the log-probabilities of the label tokens are
    summed; the highest-scoring candidate is returned for each tweet.

    Args:
        model: Causal LM whose forward output exposes ``.logits``.
        tokenizer: Tokenizer matching ``model``.
        texts: Iterable of tweet strings.
        candidate_texts: Label strings to score.
        max_in: Maximum number of prompt tokens kept per tweet.
        device: Device on which the model inputs are created.

    Returns:
        A list with one string from ``candidate_texts`` per input text
        (empty if ``texts`` is empty).
    """
    texts = list(texts)
    if not texts:
        return []
    candidate_texts = list(candidate_texts)

    # Tokenise the prompts the same way build_example does for training:
    # no special tokens and no padding yet.
    prompt_ids = tokenizer(
        [f"Tweet: {t}\nSentiment:" for t in texts],
        add_special_tokens=False,
        truncation=True,
        max_length=max_in,
    )['input_ids']
    batch_size = len(prompt_ids)
    prompt_lens = torch.tensor([len(ids) for ids in prompt_ids])
    # Padded positions are masked out, so the fill value itself is irrelevant.
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

    scores = []
    for lab in candidate_texts:
        lab_ids = tokenizer(" " + lab, add_special_tokens=False)['input_ids']
        n_lab = len(lab_ids)

        # Append the label directly after each prompt and pad on the right.
        # Padding first and appending the label afterwards would put pad tokens
        # between prompt and label for every prompt shorter than the longest.
        rows = [ids + lab_ids for ids in prompt_ids]
        width = max(len(row) for row in rows)
        input_ids = torch.tensor(
            [row + [pad_id] * (width - len(row)) for row in rows],
            dtype=torch.long, device=device,
        )
        attn_mask = torch.tensor(
            [[1] * len(row) + [0] * (width - len(row)) for row in rows],
            dtype=torch.long, device=device,
        )
        logits = model(input_ids=input_ids, attention_mask=attn_mask).logits

        # The logit at position p predicts token p + 1, so label token j of a
        # row is predicted at position prompt_len - 1 + j.
        row_idx = torch.arange(batch_size, device=logits.device).unsqueeze(1)
        positions = (prompt_lens.to(logits.device) - 1).unsqueeze(1) + torch.arange(n_lab, device=logits.device)
        label_logits = logits[row_idx, positions]  # [batch, n_lab, vocab]
        logp = F.log_softmax(label_logits.float(), dim=-1)
        targets = torch.tensor(lab_ids, device=logits.device).expand(batch_size, -1)
        scores.append(logp.gather(-1, targets.unsqueeze(-1)).squeeze(-1).sum(dim=1))

    scores = torch.stack(scores, dim=1)  # [batch, num_labels]
    preds = scores.argmax(dim=1).cpu().tolist()
    # Index into the candidates that were actually scored, so a custom
    # candidate_texts list maps back to the right strings.
    return [candidate_texts[i] for i in preds]


In [40]:
# 4-bit quantisation settings passed to from_pretrained when loading the base model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

class TweetGenClsDS(torch.utils.data.Dataset):
    """Map-style dataset of pre-tokenised tweet/label examples.

    Every row of the given frame is converted once, at construction time,
    with ``build_example``; indexing returns the stored example dict.
    """

    def __init__(self, df):
        # Pair each clean_text value with its Sentiment value, row by row.
        self.items = list(map(build_example, df['clean_text'], df['Sentiment']))

    def __len__(self):
        return len(self.items)

    def __getitem__(self, i):
        return self.items[i]

# The notebook's import cell does not bring in the scheduler helper, so import it here.
from transformers import get_cosine_schedule_with_warmup

NUM_EPOCHS = 2  # used for both the scheduler length and the training loop

train_ds = TweetGenClsDS(train_df)
val_ds = TweetGenClsDS(val_df)
collate = data_collator

# 4-bit loading goes through bitsandbytes, which needs a CUDA-enabled setup.
base = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else None
)
model = prepare_model_for_kbit_training(base)
# Attach LoRA adapters to the attention projections.
model = get_peft_model(model, LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, target_modules=["q_proj","k_proj","v_proj","o_proj"], bias="none", task_type="CAUSAL_LM"))

train_loader = torch.utils.data.DataLoader(train_ds, batch_size=8, shuffle=True, collate_fn=collate)
val_loader = torch.utils.data.DataLoader(val_ds, batch_size=8, shuffle=False, collate_fn=collate)

opt = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=0.1)
num_steps = len(train_loader) * NUM_EPOCHS
# Cosine decay with the first 6% of the steps used for warm-up.
sched = get_cosine_schedule_with_warmup(opt, int(0.06*num_steps), num_steps)

wandb.init(project="tweets-sentiment", name="decoder_raw_loop")
model.train()
for epoch in range(NUM_EPOCHS):
    for step, batch in enumerate(train_loader, 1):
        batch = {k:v.to(model.device) for k,v in batch.items()}
        out = model(**batch)
        loss = out.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step(); sched.step(); opt.zero_grad(set_to_none=True)
        if step % 50 == 0:
            wandb.log({"train/loss": loss.item(), "epoch": epoch + step/len(train_loader)})
    # quick val loss: mean of the per-batch losses
    model.eval(); val_loss = 0.0; n=0
    with torch.no_grad():
        for batch in val_loader:
            batch = {k:v.to(model.device) for k,v in batch.items()}
            val_loss += model(**batch).loss.item(); n+=1
    wandb.log({"val/loss": val_loss/max(n,1), "epoch": epoch+1})
    model.train()

# test evaluation
model.eval()
device = model.device  # was never defined; score on the device the model lives on
preds = []
# The frame only has `clean_text` and `Sentiment`; there is no `label_text` column.
# NOTE(review): assumes the Sentiment values are spelled exactly like CLASS_TEXTS — confirm.
truth = test_df['Sentiment'].tolist()
texts = test_df['clean_text'].tolist()
for i in range(0, len(texts), 64):
    preds += score_labels_batch(model, tokenizer, texts[i:i+64], device=device)

print(classification_report(truth, preds, digits=3, labels=CLASS_TEXTS))
wandb.finish()

def objective_hf(trial):
    """Optuna objective: fine-tune one LoRA configuration and score it.

    Samples a learning rate, LoRA rank and LoRA dropout from ``trial``, trains
    a freshly loaded 4-bit base model for one epoch with the Hugging Face
    ``Trainer``, then classifies the validation tweets with
    ``score_labels_batch``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Macro-averaged F1 on the validation split.
    """
    lr   = trial.suggest_float("lr", 5e-5, 2e-4, log=True)
    r    = trial.suggest_categorical("lora_r", [8,16,32])
    drop = trial.suggest_float("lora_dropout", 0.0, 0.1)

    base = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto", quantization_config=bnb_config, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else None)
    model = prepare_model_for_kbit_training(base)
    # lora_alpha is tied to the rank (alpha = 2 * r).
    peft_cfg = LoraConfig(r=r, lora_alpha=2*r, lora_dropout=drop, target_modules=["q_proj","k_proj","v_proj","o_proj"], bias="none", task_type="CAUSAL_LM")
    model = get_peft_model(model, peft_cfg)

    args = TrainingArguments(
        output_dir=f"./runs/optuna_{trial.number}",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=2,
        learning_rate=lr,
        num_train_epochs=1,
        lr_scheduler_type="cosine",
        warmup_ratio=0.06,
        logging_steps=100,
        evaluation_strategy="epoch",
        save_strategy="no",
        bf16=torch.cuda.is_available(),
        report_to=["wandb"],
        run_name=f"decoder_hf_trial_{trial.number}"
    )
    wandb.init(project="tweets-sentiment", name=f"decoder_hf_trial_{trial.number}", group="optuna_hf", reinit=True)

    trainer = Trainer(model=model, args=args, train_dataset=hf_train, eval_dataset=hf_val, data_collator=data_collator)
    trainer.train()

    # macro-F1 on val via label scoring
    model.eval()  # make sure dropout is off while scoring
    # The frame only has `clean_text` and `Sentiment`; there is no `label_text` column.
    # NOTE(review): assumes the Sentiment values are spelled exactly like CLASS_TEXTS — confirm.
    truth = val_df['Sentiment'].tolist()
    texts = val_df['clean_text'].tolist()
    preds = []
    for i in range(0, len(texts), 64):
        # `device` was never defined at module level; use the model's own device.
        preds += score_labels_batch(model, tokenizer, texts[i:i+64], device=model.device)
    macro_f1 = f1_score(truth, preds, average="macro")
    wandb.run.summary["val_macro_f1"] = macro_f1
    wandb.finish()
    return macro_f1

# optuna is used below but is not imported anywhere in the notebook's import cell.
import optuna

# Search for the hyper-parameters that maximise validation macro-F1.
study = optuna.create_study(direction="maximize")
study.optimize(objective_hf, n_trials=10)
print("Best:", study.best_value, study.best_params)

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

ImportError: The installed version of bitsandbytes (<0.43.1) requires CUDA, but CUDA is not available. You may need to install PyTorch with CUDA support or upgrade bitsandbytes to >=0.43.1.