## STAT8021 / STAT8307
### Assignment 3: Language Modeling with Transformer Basics
### DUE: April 18, 2025, Friday, 11:59 PM

#### Transformer Basics

In [1]:
! pip install transformers datasets evaluate



In [1]:
from datasets import load_dataset, DatasetDict

# Fetch the AG News corpus from the Hugging Face hub (or the local cache).
ag_news_dataset = load_dataset(path="ag_news")

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset parquet (C:/Users/23629/.cache/huggingface/datasets/parquet/ag_news-9af2a5926861d22a/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 250.01it/s]


In [2]:
# Display the DatasetDict summary: available splits, their features and row counts.
ag_news_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [3]:
# Keep only the first few words of each article for speed when running on CPU.
def truncate(example, max_words=100):
    """Shorten an example's text to its first ``max_words`` words.

    "Words" are whitespace-separated chunks (``str.split()``), not model
    tokens; runs of whitespace are collapsed to single spaces when the kept
    words are re-joined.

    Args:
        example: Mapping with a ``'text'`` string and a ``'label'`` value.
        max_words: Maximum number of words to keep. Defaults to 100.

    Returns:
        A new dict with the shortened ``'text'`` and the unchanged ``'label'``.
    """
    words = example['text'].split()
    return {
        'text': " ".join(words[:max_words]),
        'label': example['label'],
    }

# Build a small working set: 1024 shuffled training examples and 128 shuffled
# examples from the original test split (exposed here as "val"), each truncated.
small_ag_news_dataset = DatasetDict({
    split_name: (
        ag_news_dataset[source_split]
        .shuffle(seed=1111)
        .select(range(num_examples))
        .map(truncate)
    )
    for split_name, source_split, num_examples in (
        ("train", "train", 1024),
        ("val", "test", 128),
    )
})

Loading cached shuffled indices for dataset at C:\Users\23629\.cache\huggingface\datasets\parquet\ag_news-9af2a5926861d22a\0.0.0\14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7\cache-63a729b30640687c.arrow
Loading cached processed dataset at C:\Users\23629\.cache\huggingface\datasets\parquet\ag_news-9af2a5926861d22a\0.0.0\14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7\cache-4626180bb011e782.arrow
Loading cached shuffled indices for dataset at C:\Users\23629\.cache\huggingface\datasets\parquet\ag_news-9af2a5926861d22a\0.0.0\14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7\cache-387c2fe96c3db8c6.arrow
Loading cached processed dataset at C:\Users\23629\.cache\huggingface\datasets\parquet\ag_news-9af2a5926861d22a\0.0.0\14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7\cache-25bdc38418fb4a58.arrow


In [4]:
# Peek at the first example of the reduced training split.
small_ag_news_dataset['train'][0]

{'text': 'India and Pakistan balk at bold Kashmir peace plan Pakistani President Pervez Musharraf this week urged steps to end the bitter dispute.',
 'label': 0}

In [5]:
# Peek at the first example of the reduced validation split.
small_ag_news_dataset['val'][0]

{'text': 'Nortel warns of lower Q3 revenue TORONTO - Nortel Networks warned Thursday its third-quarter revenue will be below the \\$2.6 billion US preliminary unaudited revenues it reported for the second quarter.',
 'label': 2}

In [6]:
# Integer class id -> human-readable category name (ids are the list positions).
id2label = dict(enumerate(("World", "Sports", "Business", "Sci/Tech")))

##### Q1 (a)

In [7]:
from transformers import DistilBertTokenizerFast

# ------------------------------------------------------------------------------------------------------------------------------
# Write your code here
# Load the pretrained uncased DistilBERT fast tokenizer.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def preprocess_function(token):
    """Tokenize the ``"text"`` field of the given example(s).

    Used with ``map(..., batched=True)`` below, so ``token["text"]`` is
    presumably a list of strings rather than a single string. Sequences are
    truncated and padded with ``padding="max_length"``.
    """
    batch_texts = token["text"]
    return tokenizer(batch_texts, truncation=True, padding="max_length")


# Tokenize every split, then expose only the model inputs and the label as
# torch tensors.
small_tokenized_dataset = small_ag_news_dataset.map(
    preprocess_function,
    batched=True,
)
small_tokenized_dataset.set_format(
    columns=["input_ids", "attention_mask", "label"],
    type="torch",
)

# Show the first 3 processed samples.
small_tokenized_dataset['train'][:3]
# ------------------------------------------------------------------------------------------------------------------------------

Loading cached processed dataset at C:\Users\23629\.cache\huggingface\datasets\parquet\ag_news-9af2a5926861d22a\0.0.0\14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7\cache-660febad63e88154.arrow
Loading cached processed dataset at C:\Users\23629\.cache\huggingface\datasets\parquet\ag_news-9af2a5926861d22a\0.0.0\14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7\cache-d66dcfb19e8d85d1.arrow


{'label': tensor([0, 3, 1]),
 'input_ids': tensor([[ 101, 2634, 1998,  ...,    0,    0,    0],
         [ 101, 3042, 2194,  ...,    0,    0,    0],
         [ 101, 2148, 4420,  ...,    0,    0,    0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

##### Q1 (b)

In [9]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
from tqdm import tqdm
import os 
from transformers import DistilBertForSequenceClassification
import torch
import evaluate
from torch.utils.data import DataLoader

# ------------------------------------------------------------------------------------------------------------------------------
# Write your code here

# Hyper-parameters for fine-tuning.
seed = 1111
num_epochs = 3
bsz = 8
lr = 5e-5

# Seed torch before the model and the data loaders are created, so that the
# newly initialised classifier head, the batch shuffling and dropout are the
# same on every fresh run. (Some GPU kernels may still be nondeterministic.)
torch.manual_seed(seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_dataloader = DataLoader(small_tokenized_dataset["train"], batch_size=bsz, shuffle=True)
test_dataloader = DataLoader(small_tokenized_dataset["val"], batch_size=bsz)

# Define your model, optimizer, hyper-parameters, etc.

# Pretrained DistilBERT with a 4-way sequence-classification head.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)
model.to(device)

optimizer = AdamW(model.parameters(), lr=lr)

# One scheduler step is taken per batch, so the schedule spans every batch of
# every epoch; the first 10% of those steps are warm-up.
num_training_steps = num_epochs * len(train_dataloader)
num_warmup_steps = int(0.1 * num_epochs * len(train_dataloader))

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_correct, train_total = 0, 0
    # tqdm appends its own ": " after `desc`, so the label has no trailing colon.
    for batch in tqdm(train_dataloader, desc="Training process"):
        batch = {k: v.to(device) for k, v in batch.items()}
        # The dataset column is called "label"; it is passed to the model as "labels".
        if 'label' in batch:
            batch['labels'] = batch.pop('label')
        outputs = model(**batch)
        loss = outputs.loss
        logits = outputs.logits

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Running training accuracy, measured on the logits produced before
        # this batch's parameter update (model still in train mode).
        predictions = torch.argmax(logits, dim=-1)
        train_correct += (predictions == batch["labels"]).sum().item()
        train_total += batch["labels"].size(0)

    train_acc = train_correct / train_total

    # ---- evaluation pass on the validation split ----
    model.eval()
    test_correct, test_total = 0, 0
    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Testing process"):
            batch = {k: v.to(device) for k, v in batch.items()}
            if 'label' in batch:
                batch['labels'] = batch.pop('label')
            outputs = model(**batch)
            logits = outputs.logits

            predictions = torch.argmax(logits, dim=-1)
            test_correct += (predictions == batch["labels"]).sum().item()
            test_total += batch["labels"].size(0)

    test_acc = test_correct / test_total

    # print the training process
    print("Epoch {}: train acc = {:.4f}, test acc = {:.4f}".format(epoch + 1, train_acc, test_acc))

# ------------------------------------------------------------------------------------------------------------------------------

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.

Epoch 1: train acc = 0.7217, test acc = 0.8750


Training process:: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [07:27<00:00,  6.99s/it]
Testing process:: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:27<00:00,  3.44s/it]


Epoch 2: train acc = 0.9209, test acc = 0.8984


Training process:: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [07:28<00:00,  7.00s/it]
Testing process:: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:27<00:00,  3.47s/it]

Epoch 3: train acc = 0.9658, test acc = 0.8906





##### Q1 (c)

In [14]:
# Four sample news snippets (per the variable name, generated by ChatGPT) that
# are classified further down in this cell.
chatgpt_generated_news = [
    "In an exciting match last night, the Los Angeles Lakers defeated the Brooklyn Nets 115-110. Lakers' LeBron James made a comeback after missing several games due to injury and scored 25 points while teammate Anthony Davis added 28 points. Nets' star player Kevin Durant scored 32 points but couldn't lead his team to victory.",
    "Scientists have discovered a new species of dinosaur that roamed the earth 80 million years ago. The species, named Almatherium, was found in Uzbekistan and is believed to be an ancestor of the modern-day armadillo. The discovery sheds new light on the evolution of mammals and their relationship with dinosaurs.",
    "The United Nations has called for an immediate ceasefire in Yemen as the country faces a growing humanitarian crisis. The UN's special envoy for Yemen, Martin Griffiths, urged all parties to end the violence and engage in peace talks. The conflict has left millions of Yemenis at risk of famine and disease.",
    "Amazon has announced that it will be opening its first fulfillment center in New Zealand, creating more than 500 new jobs. The center will be located in Auckland and is expected to open in 2022. This move will allow Amazon to expand its operations in the region and improve delivery times for customers.",
]
# Collects one predicted class id per snippet, in the same order as the list above.
prediction_label = []

# ------------------------------------------------------------------------------------------------------------------------------
# Write your code here

# Test the fine-tuned model on chatgpt_generated_news.
model.eval()  # inference mode: disables dropout
with torch.no_grad():
    for news in chatgpt_generated_news:
        inputs = tokenizer(news, return_tensors="pt", truncation=True, padding=True)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        logits = model(**inputs).logits

        # One snippet per forward pass, so argmax yields a single class id.
        predicted_class = torch.argmax(logits, dim=-1).item()
        prediction_label.append(predicted_class)

# The loop variable must not be called `prediction_label`: that would rebind the
# list to an int and leave later code without the collected predictions.
for news, label_id in zip(chatgpt_generated_news, prediction_label):
    print(f"The class of news '{news}' is {id2label[label_id]} \n")

# ------------------------------------------------------------------------------------------------------------------------------


The class of news 'In an exciting match last night, the Los Angeles Lakers defeated the Brooklyn Nets 115-110. Lakers' LeBron James made a comeback after missing several games due to injury and scored 25 points while teammate Anthony Davis added 28 points. Nets' star player Kevin Durant scored 32 points but couldn't lead his team to victory.' is Sports 

The class of news 'Scientists have discovered a new species of dinosaur that roamed the earth 80 million years ago. The species, named Almatherium, was found in Uzbekistan and is believed to be an ancestor of the modern-day armadillo. The discovery sheds new light on the evolution of mammals and their relationship with dinosaurs.' is Sci/Tech 

The class of news 'The United Nations has called for an immediate ceasefire in Yemen as the country faces a growing humanitarian crisis. The UN's special envoy for Yemen, Martin Griffiths, urged all parties to end the violence and engage in peace talks. The conflict has left millions of Yemenis 

##### Q1 (d)

In [15]:
# ------------------------------------------------------------------------------------------------------------------------------
from transformers import RobertaForSequenceClassification, RobertaTokenizerFast

# Write your code here

# Hyper-parameters for fine-tuning.
seed = 1111
num_epochs = 3
bsz = 8
lr = 5e-5

# Seed torch before the model and the data loaders are created, so that the
# newly initialised classifier head, the batch shuffling and dropout are the
# same on every fresh run. (Some GPU kernels may still be nondeterministic.)
torch.manual_seed(seed)

# NOTE: this cell rebinds `tokenizer`, `preprocess_function`,
# `small_tokenized_dataset` and `model`, replacing the DistilBERT objects
# created in the earlier cells with their RoBERTa counterparts.
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

def preprocess_function(token):
    """Tokenize the ``"text"`` field of the given example(s) with the RoBERTa tokenizer.

    Used with ``map(..., batched=True)`` below, so ``token["text"]`` is
    presumably a list of strings. Sequences are truncated and padded with
    ``padding="max_length"``.
    """
    return tokenizer(token["text"], padding="max_length", truncation=True)

small_tokenized_dataset = small_ag_news_dataset.map(preprocess_function, batched=True)

# Expose only the model inputs and the label, as torch tensors.
small_tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_dataloader = DataLoader(small_tokenized_dataset["train"], batch_size=bsz, shuffle=True)
test_dataloader = DataLoader(small_tokenized_dataset["val"], batch_size=bsz)

# Define your model, optimizer, hyper-parameters, etc.

# Pretrained RoBERTa with a 4-way sequence-classification head.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=4)
model.to(device)

optimizer = AdamW(model.parameters(), lr=lr)

# One scheduler step is taken per batch, so the schedule spans every batch of
# every epoch; the first 10% of those steps are warm-up.
num_training_steps = num_epochs * len(train_dataloader)
num_warmup_steps = int(0.1 * num_epochs * len(train_dataloader))

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_correct, train_total = 0, 0
    # tqdm appends its own ": " after `desc`, so the label has no trailing colon.
    for batch in tqdm(train_dataloader, desc="Training process"):
        batch = {k: v.to(device) for k, v in batch.items()}
        # The dataset column is called "label"; it is passed to the model as "labels".
        if 'label' in batch:
            batch['labels'] = batch.pop('label')
        outputs = model(**batch)
        loss = outputs.loss
        logits = outputs.logits

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Running training accuracy, measured on the logits produced before
        # this batch's parameter update (model still in train mode).
        predictions = torch.argmax(logits, dim=-1)
        train_correct += (predictions == batch["labels"]).sum().item()
        train_total += batch["labels"].size(0)

    train_acc = train_correct / train_total

    # ---- evaluation pass on the validation split ----
    model.eval()
    test_correct, test_total = 0, 0
    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Testing process"):
            batch = {k: v.to(device) for k, v in batch.items()}
            if 'label' in batch:
                batch['labels'] = batch.pop('label')
            outputs = model(**batch)
            logits = outputs.logits

            predictions = torch.argmax(logits, dim=-1)
            test_correct += (predictions == batch["labels"]).sum().item()
            test_total += batch["labels"].size(0)

    test_acc = test_correct / test_total

    # print the training process
    print("Epoch {}: train acc = {:.4f}, test acc = {:.4f}".format(epoch + 1, train_acc, test_acc))

# ------------------------------------------------------------------------------------------------------------------------------

Downloading tokenizer_config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 25.0/25.0 [00:00<00:00, 5.00kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading vocab.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 899k/899k [00:00<00:00, 1.16MB/s]
Downloading merges.txt: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 456k/456k [00:00<00:00, 965kB/s]
Downloading tokenizer.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.36M/1.36M [00:01<00:00, 1.33MB/s]
Dow

Epoch 1: train acc = 0.7285, test acc = 0.8438


Training process:: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 128/128 [15:11<00:00,  7.12s/it]
Testing process:: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:30<00:00,  1.93s/it]


Epoch 2: train acc = 0.9131, test acc = 0.8672


Training process:: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 128/128 [15:11<00:00,  7.12s/it]
Testing process:: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:30<00:00,  1.89s/it]

Epoch 3: train acc = 0.9492, test acc = 0.9141



