# Imports

In [1]:
import math
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import tiktoken
from transformers import GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset, DatasetDict
from torchsummaryX import summary
import wandb
from dataclasses import dataclass
from tqdm import tqdm
import re
from multiprocessing import cpu_count
import random
import gc
import pickle

In [2]:
# set seeds
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.cuda.manual_seed_all(42)
cudnn.deterministic = True
cudnn.benchmark = False
random.seed(42)

# Load Data

In [3]:
datasets_train = load_dataset("Shannnh/hw5-changed", split = 'train')
datasets_val = load_dataset("Shannnh/hw5-changed", split = 'validation')
datasets_test = load_dataset("Shannnh/hw5-changed", split = 'test_ds')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
print(datasets_train[0].keys())
print(len(datasets_train))
print(len(datasets_val))
print(len(datasets_test))

dict_keys(['Classifier', 'Prompt', 'Messages', 'PromptId'])
392632
27664
15434


In [27]:
datasets_train = datasets_train.shuffle(seed=42)
datasets_val = datasets_val.shuffle(seed=42)
datasets_test = datasets_test.shuffle(seed=42)

# Hyperparameters

In [4]:
@dataclass
class IDeaLGPTConfig:

    # General
    batch_size: int = 8 # 16
    gradient_accumulation_steps: int = 4
    num_iters: int = 10000
    eval_iters: int = 3
    eval_interval: int = 1000
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
    # device: str = 'cpu'

    # Model
    sequence_length: int = 256
    vocab_size: int = 50257 # gpt2 vocab
    num_blocks: int = 8
    num_heads: int = 8
    embed_dim: int = 512
    dropout: float = 0.1
    bias: bool = False

    # Data
    num_workers: int = 8
    train_test_split: float = 0.8
    SUBSET_PERCENTAGE: float =0.01 # % of OWT to train on, between 0 and 1

    # LR scheduler
    lr: float = 2e-3
    lr_decay: bool = True
    warmup_iters: int = 1000
    min_lr: float = 6e-6

    # optimizer
    weight_decay: float = 1e-1
    grad_clip: float = 1.0


config = IDeaLGPTConfig()
device = config.device
config

IDeaLGPTConfig(batch_size=8, gradient_accumulation_steps=4, num_iters=10000, eval_iters=3, eval_interval=1000, device='cuda', sequence_length=256, vocab_size=50257, num_blocks=8, num_heads=8, embed_dim=512, dropout=0.1, bias=False, num_workers=8, train_test_split=0.8, SUBSET_PERCENTAGE=0.01, lr=0.002, lr_decay=True, warmup_iters=1000, min_lr=6e-06, weight_decay=0.1, grad_clip=1.0)

In [None]:
print(f'Effective batch size = {config.batch_size * config.gradient_accumulation_steps}')

Effective batch size = 32


## Tokenizer - OpenAI tiktoken (changed to GPT2Tokenizer)

In [5]:
#tokenizer = tiktoken.get_encoding("cl100k_base") # gpt4 tokenizer - NOTE: need to change vocab_size in config if used
#tokenizer = tiktoken.encoding_for_model('gpt-2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.encode('hello world')


[31373, 995]

In [6]:
tokenizer.model_max_length = config.sequence_length

In [7]:
# set pad_token_id equal to the eos_token_id if not set
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
print(tokenizer.eos_token_id)

50256


# Model

In [9]:
class Head(nn.Module):
    # def __init__(self, embed_dim, head_size, sequence_length, dropout):
    def __init__(self, config, interim_head_size):
        super().__init__()
        self.embed_dim = config.embed_dim
        self.interim_head_size = interim_head_size # say embed_dim = 32 -> broken into say 4 heads, so this will be 8, to be concated back to 32
        self.key = nn.Linear(config.embed_dim, interim_head_size, bias=config.bias)
        self.query = nn.Linear(config.embed_dim, interim_head_size, bias=config.bias)
        self.value = nn.Linear(config.embed_dim, interim_head_size, bias=config.bias)
        self.register_buffer('tril', torch.tril(torch.ones((config.sequence_length, config.sequence_length))))

        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x) # (b,t,c) -> (b,t,h)
        q = self.query(x) # (b,t,c) -> (b,t,h)
        v = self.value(x) # (b,t,c) -> (b,t,h)
        wei = k @ q.transpose(-2, -1) * self.embed_dim**(-0.5) # (b,t,h) @ (b,h,t) -> (b,t,t)

        wei = wei.masked_fill((self.tril[:T, :T] == 0.), -torch.inf) # type: ignore
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        xbow = wei @ v # (b,t,t) @ (b,t,h) -> (b,t,h)
        return xbow

class MultiHeadAttention(nn.Module):
    # def __init__(self, num_heads, embed_dim, head_size, sequence_length, dropout):
    def __init__(self, config, interim_head_size):
        super().__init__()
        self.head_list = nn.ModuleList([Head(config, interim_head_size) for _ in range(config.num_heads)])
        self.proj = nn.Linear(config.embed_dim, config.embed_dim)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.head_list], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()

        self.layers = nn.Sequential(
            nn.Linear(config.embed_dim, 4*config.embed_dim),
            nn.GELU(),
            nn.Linear(4*config.embed_dim, config.embed_dim),
            nn.Dropout(config.dropout)
        )

    def forward(self, x):
        return self.layers(x)

class Block(nn.Module):
    # def __init__(self, num_heads, embed_dim, sequence_length, dropout):
    def __init__(self, config):
        super().__init__()
        self.interim_head_size = config.embed_dim // config.num_heads
        self.sa = MultiHeadAttention(config, self.interim_head_size)
        self.ff = FeedForward(config)
        self.ln1 = nn.LayerNorm(config.embed_dim)
        self.ln2 = nn.LayerNorm(config.embed_dim)

    def forward(self, x):
        x = x + self.sa(self.ln1(x)) # communication
        x = x + self.ff(self.ln2(x)) # computation
        return x


class Transformer(torch.nn.Module):
    # def __init__(self, embed_dim, vocab_size, sequence_length, num_heads, num_blocks, dropout):
    def __init__(self, config):
        super().__init__()
        self.sequence_length = config.sequence_length
        self.token_embeddings = torch.nn.Embedding(config.vocab_size, config.embed_dim)
        self.position_embeddings = nn.Embedding(config.sequence_length, config.embed_dim)
        self.block_list = nn.Sequential(*[Block(config)
                                          for _ in range(config.num_blocks)])
        self.final_ln = nn.LayerNorm(config.embed_dim)
        self.lm_head = nn.Linear(config.embed_dim, config.vocab_size)

    def forward(self, ixs, targets=None):
        # ixs: (b,t)
        # targets: (b,t)
        B, T = ixs.shape
        x = self.token_embeddings(ixs) # (b,t,c=embed_dim)
        pos_embeds = self.position_embeddings(torch.arange(T, device=device)) # (t,c=embed_dim)
        x += pos_embeds
        x = self.block_list(x)
        x = self.final_ln(x)
        logits = self.lm_head(x) # (b,t,c=vocab_size)
        if targets is None:
            loss = None
        else:
            mask = (ixs != tokenizer.pad_token_id)  # (b,t), True where not a pad token
            logits = logits.permute(0, 2, 1)  # (b,c,t)

            # Use the mask to filter out loss on padding positions
            # logits are now (b, c, t), targets are (b, t), mask is (b, t)
            # Utilizing .masked_fill to turn pad positions to a very large negative value to ignore them in softmax
            loss = F.cross_entropy(logits, targets, reduction='none')  # (b, t) get loss per token
            loss = (loss * mask).sum() / mask.sum()  # average loss only over non-pad tokens
        return logits, loss

    def generate(self, ixs, max_len):
        """
        ixs: (b,t) - input sequence to start generating from
        max_len: int - maximum length of the generated sequence
        """
        b, t = ixs.shape
        for _ in range(max_len):
            # generation (b, ) next tokens in parallel
            ixs_cond = ixs[:, -self.sequence_length:] # consider only the last sequence_length tokens
            logits, loss = self.forward(ixs_cond) # logits=(b,t,c), loss is ignored
            # get juse the final timestep
            last_logits = logits[:, -1, :] # (b,c)
            # normalize
            last_probs = F.softmax(last_logits, dim=-1) # across c
            next_tokens = torch.multinomial(last_probs, 1) # (b,c) -> (b)
            if (next_tokens == tokenizer.eos_token_id).any():
              break
            ixs = torch.cat((ixs, next_tokens), dim=1) # across t so (b,t) -> (b, t+1)
        return ixs


In [10]:
# model = Transformer(embed_dim, vocab_size, sequence_length, num_heads, num_blocks, dropout).to(device)
model = Transformer(config).to(device)

In [11]:
pretrain_model = Transformer(config).to(device)

In [12]:
optimizer = torch.optim.AdamW(model.parameters(), lr=config.lr, weight_decay=config.weight_decay)
scaler = torch.cuda.amp.GradScaler(enabled=True)


# for generation
start_ix = torch.zeros((1,1), dtype=torch.long, device=device) # (newline character in a single batch)

# Load models

In [14]:
CKPT_PATH = '/content/hw5/best_fine_tune_model.pth'
ckpt = torch.load(CKPT_PATH)
model.load_state_dict(ckpt)

<All keys matched successfully>

In [15]:
CKPT_PATH = '/content/hw5/best_model.pth'
ckpt = torch.load(CKPT_PATH)
pretrain_model.load_state_dict(ckpt)

<All keys matched successfully>

In [16]:
def generate_text(prompt, max_seq):
    prompt = prompt.strip() # remove leading and ending white spaces - leads to weird things
    # Encode the prompt using the tokenizer

    # chat_template = f"<|system|>\n<|endoftext|>\n<|user|>\n{' '.join(prompt.split()[:100])}<|endoftext|>\n<|assistant|>"
    chat_template = f"<|user|>\n{' '.join(prompt.split()[:100])}<|endoftext|>\n"
    prompt_tokens = tokenizer.encode(chat_template, return_tensors='pt').to(device)

    # Generate text using the model
    generated_tokens = pretrain_model.generate(prompt_tokens, max_seq)

    # Decode the tokens back to text
    generated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)  # Remove batch dimension

    return generated_text

# Evaluation Metrics

## Sentiment analysis

In [17]:
from transformers import pipeline
judge = pipeline('sentiment-analysis', model="finiteautomata/bertweet-base-sentiment-analysis")

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [None]:
judge('negative')

[{'label': 'NEG', 'score': 0.5685986280441284}]

In [None]:
torch.cuda.empty_cache()
gc.collect()

66

In [18]:
sentiment_rows = datasets_test.filter(lambda example: example['Classifier'] == 'SentimentAnalysis')

In [None]:
len(sentiment_rows)

491

In [None]:
sentiment_rows[1]['Messages'][2]['content']

'positive'

In [24]:
words_count = 0
nlu_count = 0
ii = 0
a = '<|assistant|>'
for i in sentiment_rows:
  target = i['Messages'][2]['content']
  q = i['Messages'][1]['content']
  if target == 'positive':
    target1 = 'POS'
  elif target == 'negative':
    target1 = 'NEG'
  else:
    target1 = 'NEU'
  new_sentence = generate_text(q, 100)
  #generate_sentence = new_sentence.split(a)[-1]#for fine-tuned model
  generate_sentence = new_sentence[len(q)+5:len(q)+5+50] # for pre-trained model
  print(f'Prompt: {q}')
  print(f'Generated sentence: {generate_sentence}')
  print(f'Target: {target}')
  print('------------')
  #print(judge(generate_sentence))
  #print(ii)
  nlu_gen = judge(generate_sentence)[0]['label']

  if target1 == nlu_gen:
    nlu_count += 1
  if target in generate_sentence:
    words_count += 1
  ii += 1
  if ii == 2:
    break
acc_with_words = words_count / len(sentiment_rows)
acc_with_nlu = nlu_count / len(sentiment_rows)



Prompt: What's the sentiment of the sentence:
Clothing retail chain Sepp+ñl+ñ 's sales increased by 8 % to EUR 155.2 mn , and operating profit rose to EUR 31.1 mn from EUR 17.1 mn in 2004 .
Generated sentence: .
The president told guest Luke Rowley that a pote
Target: positive
------------
Prompt: What's the sentiment of the sentence:
HELSINKI ( AFX ) - Shares closed higher , led by Nokia after it announced plans to team up with Sanyo to manufacture 3G handsets , and by Nokian Tyres after its fourth-quarter earnings report beat analysts ' expectations , dealers said .
Generated sentence: oming in with a television special this week—and m
Target: positive
------------


In [None]:
#to see if there's exactly 'positive','negative' or 'neutral' in generated sentence
acc_with_words


0.4378818737270876

In [None]:
# use Natural Language Understanding tool to analyze the sentiment of generated sentence
acc_with_nlu

0.5641547861507128

In [None]:
#to see if there's exactly 'positive','negative' or 'neutral' in generated sentence
acc_with_words

0.0

In [None]:
# use Natural Language Understanding tool to analyze the sentiment of generated sentence
acc_with_nlu

0.5254582484725051

## NER

In [None]:
# f-1 score for NER
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

In [None]:
ner_rows = datasets_test.filter(lambda example: example['Classifier'] == 'NamedEntity')

In [None]:
print(ner_rows[0])

{'Classifier': 'NamedEntity', 'Prompt': 'Recognize the named entities from the sentence:\nSOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .', 'Messages': [{'content': '', 'role': 'system'}, {'content': 'Recognize the named entities from the sentence:\nSOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .', 'role': 'user'}, {'content': 'JAPAN:B-LOC,CHINA:B-PER', 'role': 'assistant'}], 'PromptId': 'NamedEntity/0'}


In [None]:
ner_rows = ner_rows[:1000]

In [None]:
a=ner_rows[0]['Messages'][2]['content']
entities = a.split(',')#
#keyword, category = entities.split(':')
print(entities)

['JAPAN:B-LOC', 'CHINA:B-PER']


In [None]:
true_key_cat = []
predicted_key_cat = []
a = '<|assistant|>'
ii=0
for i in ner_rows:
  target = i['Messages'][2]['content']
  q = i['Messages'][1]['content']
  entities = target.split(',')
  true_key_cat.append(entities)
  '''
  for entity in entities:
    if ':' in entity:
    #print(ii)
    #print(entity)
      keyword, category = entity.split(':')
      k.append(keyword)
      c.append(category)
      '''
  generate_sentence = generate_text(q, 256).split(a)[-1]#[len(q)+7:]
  #generate_sentence = generate_text(q, 90)[len(q)+7:len(q)+7+50]
  generated = generate_sentence.split(',')
  predicted_key_cat.append(generated)
  '''
  for gen in generated:
    if ':' in gen:
      key, cate = entity.split(':')
      k1.append(key)
      c1.append(cate)
      '''
  print(f'Prompt: {q}')
  print(f'Generated sentence: {generate_sentence}')
  print(f'Target: {target}')
  print('----------------')
  ii += 1
  if ii == 500:
    break
  if ii == 2:
    break



Prompt: Recognize the named entities from the sentence:
SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .
Generated sentence: 
India:B-LOC,LUCKY:B-LOC,CHINA:B-LOC
Target: JAPAN:B-LOC,CHINA:B-PER
----------------
Prompt: Recognize the named entities from the sentence:
Nadim Ladki
Generated sentence: 
Nadim:B-PER
Target: Nadim:B-PER,Ladki:I-PER
----------------


In [None]:
mlb = MultiLabelBinarizer()
mlb.fit(true_key_cat + predicted_key_cat)
true_binary = mlb.transform(true_key_cat)
predicted_binary = mlb.transform(predicted_key_cat)

In [None]:
f1_micro = f1_score(true_binary, predicted_binary, average='micro')


In [None]:
f1_micro

0.06988487174308222

In [None]:
f1_micro

0.0009523809523809524