In [109]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re
import evaluate
import os
import random

import numpy as np


import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix,
    precision_recall_fscore_support,
    ConfusionMatrixDisplay
)

from transformers import (
    AutoModelForSequenceClassification,
    AutoModel,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)

from lightning.pytorch.utilities.combined_loader import CombinedLoader

In [117]:
ldf = pd.read_csv("../week5/week5_qbias_dataset.csv")
ldf.dropna(subset=['raw'], inplace=True)
ldf = ldf[ldf['raw'].str.len() > 0]


In [75]:
RANDOM_SEED = 42
TOKENIZERS_PARALLELISM = False

In [111]:
class sk_multiclass_dataset(Dataset):
    def __init__(self, values, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(
            values.tolist(),
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )
        
        if labels.dtype == object or not np.issubdtype(labels.dtype, np.integer):
            label_encoder = LabelEncoder()
            torch_lables = torch.tensor(self.label_encoder.fit_transform(labels)).long()
            self.label_mapping = {idx: label for idx, label in enumerate(label_encoder.classes_)}
            self.num_classes = len(label_mapping)
        else:
            label_encoder = None
            torch_lables = torch.tensor(labels.values).long()
            self.label_mapping = None
            self.num_classes = len(torch.unique(torch_lables))

        self.X = values
        self.y = torch_lables
        assert self.X.shape[0] == self.y.shape[0]
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):    
        # X = torch.from_numpy(self.X[idx].astype(np.int8).todense()).float().squeeze()
        # y = self.y[idx]
        # return X, y

        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.y[idx]
        return item


In [112]:
ldf = ldf.sample(4)
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", device_map="auto")

train_ldf, test_ldf = train_test_split(ldf, test_size=0.2, random_state=42, shuffle=True)

t_l_ldf = sk_multiclass_dataset(train_ldf['raw'], train_ldf['label_left'], tokenizer)
t_r_ldf = sk_multiclass_dataset(train_ldf['raw'], train_ldf['label_right'], tokenizer)
t_c_ldf = sk_multiclass_dataset(train_ldf['raw'], train_ldf['label_center'], tokenizer)

# need to have:
trains = {
    "l": DataLoader(t_l_ldf, batch_size=4, shuffle=True),
    "r": DataLoader(t_r_ldf, batch_size=4, shuffle=True),
    "c": DataLoader(t_c_ldf, batch_size=4, shuffle=True)
}

task_keys = list(trains.keys())

combined_loader = CombinedLoader(trains, 'sequential')
# _ = iter(combined_loader)

# for batch, batch_idx, dataloader_idx in combined_loader:
#     print(f"{batch}, {batch_idx=}, {dataloader_idx=}")

In [113]:
class MultiTask_Network(nn.Module):
    def __init__(self, 
                 input_dim, 
                 tasks,
                 hidden_dim : int = 200):
        
        super().__init__()

        self.input_dim = input_dim
        # self.output_dim_l = output_dim_l
        # self.output_dim_c = output_dim_c
        # self.output_dim_r = output_dim_r
        self.tasks = tasks
        self.hidden_dim = hidden_dim
        
        self.llama = AutoModel.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
        self.hidden = nn.Linear(self.llama.config.hidden_size, self.hidden_dim)
        self.final_r = nn.Linear(self.hidden_dim, self.tasks[1]['output_size'])
        self.final_l = nn.Linear(self.hidden_dim, self.tasks[2]['output_size'])
        self.final_c = nn.Linear(self.hidden_dim, self.tasks[0]['output_size'])
        
        # for task in self.tasks:
        #     if tasks[1]['pretrained']:
        #         self.final_r.load_state_dict(torch.load(classifier_weights["final_r"]))
        #     if task == "l" and tasks[task]['pretrained']:
        #         self.final_l.load_state_dict(torch.load(classifier_weights["final_l"]))
        #     if task == "c" and tasks[task]['pretrained']:
        #         self.final_c.load_state_dict(torch.load(classifier_weights["final_c"]))


        #freeze llama?
        # for param in self.llama.parameters():
        #     param.requires_grad = False

    
    def forward(self, x, task_name : str):

        outputs = self.llama(
            input_ids=x["input_ids"],
            attention_mask=x["attention_mask"]
        )
        iid = x["input_ids"]
        iid2 = x["attention_mask"]
        

        # pooled = outputs.last_hidden_state[:, 0]  #was giving the same tokenization everytime - ig llama doesnt use the cls token?

        last_hidden = outputs.last_hidden_state  # (B, T, H)

        mask = x["attention_mask"].unsqueeze(-1)  # (B, T, 1)
        pooled = (last_hidden * mask).sum(dim=1) / mask.sum(dim=1)

        # if (random.randint(0,10) < 5):
        # decoded = tokenizer.batch_decode(x["input_ids"])
        # print(f"{decoded} got {pooled}") 



        
        x = self.hidden(pooled)
        # print(f"task name is {task_name}")
        # print(x)
        
        #sigmoid? his example uses this but not llama
        # x = torch.sigmoid(x)  
        #think relu is better, although why need activation

        laye = "none"
        if task_name == 'r':
            x = self.final_r(x)
            # x = torch.sigmoid(self.final_r(x))
            laye = "final_r"
        elif task_name == 'l':
            x = self.final_l(x)
            # x = torch.sigmoid(self.final_l(x))
        elif task_name == 'c':
            x = self.final_c(x)
            # x = torch.sigmoid(self.final_c(x))
        else:
            assert False, 'Bad Task ID passed'


        # print(f"Mean: {x.mean().item():.4f}, Min: {x.min().item():.4f}, Max: {x.max().item():.4f}")

           # and got mask {mask}
           #  \n

        # print(f"""
        #     __________________
        #     printing for {task_name} with input_ids {iid} and attn {iid2}
        #     \n
        #     has last hidden as {last_hidden}
        #     \n
 
        #     and got pooled as {pooled}
        #     \n
        #     for a final of {x}
        #     \n 
        #     layer used was {laye}
        #     _______________________
        #     """)

        

        return x

In [107]:
binary_loss = nn.BCEWithLogitsLoss()
multiclass_loss = nn.CrossEntropyLoss()

tasks = [
    # {"name": "c", "output_size" : 2, "loss_func" : nn.CrossEntropyLoss(), "classifier_weights_loc" : None, "pretrained": False },
    {"name": "c", "output_size" : 1, "loss_func" : nn.BCEWithLogitsLoss(), "classifier_weights_loc" : None, "pretrained": False },
    {"name": "r", "output_size" : 1, "loss_func" : nn.BCEWithLogitsLoss(), "classifier_weights_loc" : None, "pretrained": False },
    {"name": "l", "output_size" : 1, "loss_func" : nn.BCEWithLogitsLoss(), "classifier_weights_loc" : None, "pretrained": False }
]

model = MultiTask_Network(128, tasks)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-5)

In [108]:



for i in range(10): #epochs
    total_loss = 0
    
    for batch, batch_idx, dataloader_idx in combined_loader:
        # print(f"{batch}, {batch_idx=}, {dataloader_idx=}")
        
        preds = model(batch, task_name = tasks[dataloader_idx]['name'])
        curr_loss_func = tasks[dataloader_idx]['loss_func']
        loss = curr_loss_func(preds, batch['labels'].float().unsqueeze(1))
        k = 0
        for p, l in zip(preds.squeeze().tolist(), batch['labels'].float().unsqueeze(1).tolist()):
            print(f"Pred: {p:.4f} | Label: {l}")
            k+=1
            if k > 20:
                break
        print (loss)
        
        
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        
    print(f"Epoch {i+1}, Loss: {total_loss/len(batch)}")



# save_dir = "./multitask_01_sunday_8pm_Fexamples_multiEpoch"
# os.makedirs(save_dir, exist_ok=True)

# torch.save(model.state_dict(), os.path.join(save_dir, "model.pth"))

# # Save tokenizer
# tokenizer.save_pretrained(save_dir)

# print("Model saved successfully!")

Pred: -0.3905 | Label: [0.0]
Pred: 0.1137 | Label: [1.0]
Pred: -0.1498 | Label: [0.0]
tensor(0.5919, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
Pred: 0.1383 | Label: [0.0]
Pred: 0.4855 | Label: [0.0]
Pred: -0.0009 | Label: [0.0]
tensor(0.8075, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
Pred: 0.1384 | Label: [1.0]
Pred: 0.1757 | Label: [0.0]
Pred: 0.2807 | Label: [1.0]
tensor(0.6579, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
Epoch 1, Loss: 0.685784121354421
Pred: -2.4067 | Label: [0.0]
Pred: -2.6486 | Label: [0.0]
Pred: 1.2392 | Label: [1.0]
tensor(0.1363, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
Pred: -1.9904 | Label: [0.0]
Pred: -2.2620 | Label: [0.0]
Pred: -3.0193 | Label: [0.0]
tensor(0.0916, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
Pred: 1.5636 | Label: [1.0]
Pred: 1.6370 | Label: [1.0]
Pred: -2.0091 | Label: [0.0]
tensor(0.1646, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
Epoch 2, Loss: 0.13083790491024652
Pred: -3.0664 | Label: [0.0]
Pred: 

In [118]:
# if not model:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = MultiTask_Network(128, tasks)
save_dir = "./multitask_01_sunday_8pm_Fexamples_multiEpoch"
model.load_state_dict(torch.load(os.path.join(save_dir, "model.pth"), map_location=device))
test_ldf = ldf.sample(30).copy()
# train_ldf, test_ldf = train_test_split(ldf, test_size=0.2, random_state=42, shuffle=True)

e_l_ldf = sk_multiclass_dataset(test_ldf['raw'], test_ldf['label_left'], tokenizer)
e_r_ldf = sk_multiclass_dataset(test_ldf['raw'], test_ldf['label_right'], tokenizer)
e_c_ldf = sk_multiclass_dataset(test_ldf['raw'], test_ldf['label_center'], tokenizer)

# need to have:
evals = {
    "l": DataLoader(e_l_ldf, batch_size=4, shuffle=True),
    "r": DataLoader(e_r_ldf, batch_size=4, shuffle=True),
    "c": DataLoader(e_c_ldf, batch_size=4, shuffle=True)
}

task_keys = list(trains.keys())

combined_eval_loader = CombinedLoader(evals, 'sequential')


In [65]:
len(test_ldf.iloc[0, 0])

660

In [120]:

results = []

for batch, batch_idx, dataloader_idx in combined_eval_loader:

    task_name = tasks[dataloader_idx]['name']
    preds = model(batch, task_name)
    preds_np = preds.detach().cpu().numpy().flatten()
    ypreds = torch.sigmoid(torch.tensor(preds_np)).numpy()
    
    labels_np = batch['labels'].detach().cpu().numpy().flatten()

    # pred_classes = (preds_np > 0.5).astype(int)
    probs = torch.sigmoid(torch.tensor(preds_np))
    pred_classes = (probs > 0.5).int().numpy()

    for y_p, y_t, y_c in zip(preds_np, labels_np, pred_classes):
        results.append({
            "task": task_name,
            "y_pred": float(y_p),
            "y_true": int(y_t),
            "pred_class": int(y_c)
        })

df_results = pd.DataFrame(results)
print("test performance")
print(classification_report(df_results['y_true'], df_results['pred_class']))

for t in range(3):
    task_name = task_keys[t]
    subdf = df_results[df_results['task'] == task_name].copy()
    subdf.head()
    print(f"test performance on task {tasks[t]['name']}")
    print(classification_report(subdf['y_true'], subdf['pred_class']))

test performance
              precision    recall  f1-score   support

           0       0.67      1.00      0.80        60
           1       0.00      0.00      0.00        30

    accuracy                           0.67        90
   macro avg       0.33      0.50      0.40        90
weighted avg       0.44      0.67      0.53        90

test performance on task c
              precision    recall  f1-score   support

           0       0.80      1.00      0.89        24
           1       0.00      0.00      0.00         6

    accuracy                           0.80        30
   macro avg       0.40      0.50      0.44        30
weighted avg       0.64      0.80      0.71        30

test performance on task r
              precision    recall  f1-score   support

           0       0.67      1.00      0.80        20
           1       0.00      0.00      0.00        10

    accuracy                           0.67        30
   macro avg       0.33      0.50      0.40        30
wei

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [150]:
subdf = df_results[df_results['task'] == 'l'].copy()
subdf['y_pred'].value_counts()

y_pred
-1.579535    1
-2.522428    1
-2.382221    1
-1.403132    1
-2.432225    1
            ..
-1.794623    1
-1.904278    1
-2.264257    1
-1.843960    1
-1.675348    1
Name: count, Length: 160, dtype: int64

In [195]:
# print(model.final_l.weight)
# preds

for name, param in model.named_parameters():
    # print(name)
    if param.grad is not None:
        print(f"{name} grad norm: {param.grad.norm().item()}")

# model.forward(e_l_ldf[0], 'l')
# outputs = model.llama(
#             input_ids=e_l_ldf[0]["input_ids"],
#             attention_mask=e_l_ldf[0]["attention_mask"]
#         )
# outputs.last_hidden_state[:, 0].std(dim=0)
# # e_l_ldf[0]["input_ids"]

hidden.weight grad norm: 0.49672582745552063
hidden.bias grad norm: 0.014233889058232307
final_l.weight grad norm: 12.833967208862305
final_l.bias grad norm: 0.30220913887023926


In [155]:
df_results['task'].value_counts()

task
c    160
r    160
l    160
Name: count, dtype: int64