## STAT8021 / STAT8307
### Assignment 3: Language Modeling with Transformer Basics
### DUE: April 18, 2025, Friday, 11:59 PM

#### Transformer Basics

In [1]:
! pip install transformers datasets evaluate

Collecting transformers
  Downloading transformers-4.50.2-py3-none-any.whl.metadata (39 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-macosx_10_12_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-macosx_10_12_x86_64.whl.metadata (3.8 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-19.0.1-cp39-cp39-macosx_12_0_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp39-cp39-macosx_10_9_x86_64.whl.metadata (12 kB)

In [21]:
from datasets import load_dataset, DatasetDict

ag_news_dataset = load_dataset("ag_news")

In [22]:
ag_news_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [23]:
# Just take the first 100 tokens for speed/running on cpu
def truncate(example):
    return {
        'text': " ".join(example['text'].split()[:100]),
        'label': example['label']
    }

# Take 1024 random examples for train and 128 validation
small_ag_news_dataset = DatasetDict(
    train=ag_news_dataset['train'].shuffle(seed=1111).select(range(1024)).map(truncate),
    val=ag_news_dataset['test'].shuffle(seed=1111).select(range(128)).map(truncate),
)

In [24]:
small_ag_news_dataset['train'][0]

{'text': 'India and Pakistan balk at bold Kashmir peace plan Pakistani President Pervez Musharraf this week urged steps to end the bitter dispute.',
 'label': 0}

In [25]:
small_ag_news_dataset['val'][0]

{'text': 'Nortel warns of lower Q3 revenue TORONTO - Nortel Networks warned Thursday its third-quarter revenue will be below the \\$2.6 billion US preliminary unaudited revenues it reported for the second quarter.',
 'label': 2}

In [26]:
id2label = {
    0: "World", 
    1: "Sports",
    2: "Business",
    3: "Sci/Tech",
    }

##### Q1 (a)

In [27]:
from transformers import DistilBertTokenizerFast

# ------------------------------------------------------------------------------------------------------------------------------
# Write your code here
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def preprocess_function(token):
    return tokenizer(token["text"], padding="max_length", truncation=True)

small_tokenized_dataset = small_ag_news_dataset.map(preprocess_function, batched=True)

small_tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


# print the frist 3 processed samples
small_tokenized_dataset['train'][:3]
# ------------------------------------------------------------------------------------------------------------------------------

{'label': tensor([0, 3, 1]),
 'input_ids': tensor([[ 101, 2634, 1998,  ...,    0,    0,    0],
         [ 101, 3042, 2194,  ...,    0,    0,    0],
         [ 101, 2148, 4420,  ...,    0,    0,    0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

##### Q1 (b)

In [28]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
from tqdm.notebook import tqdm
import os 
from transformers import DistilBertForSequenceClassification
import torch
import evaluate
from torch.utils.data import DataLoader

# ------------------------------------------------------------------------------------------------------------------------------
# Write your code here

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_epochs = 3
bsz = 16
lr = 5e-5

train_dataloader = DataLoader(small_tokenized_dataset["train"], batch_size=bsz, shuffle=True)
test_dataloader = DataLoader(small_tokenized_dataset["val"], batch_size=bsz)

# Define your model. optimizer, hyper-parameter and etc.

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)
model.to(device)

optimizer = AdamW(model.parameters(), lr=lr)
num_warmup_steps = int(0.1 * num_epochs * len(train_dataloader))

lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, 
                                               num_training_steps=num_epochs * len(train_dataloader))

for epoch in range(num_epochs):
    #train and evaluate your model
    model.train()
    train_correct, train_total = 0, 0
    for batch in tqdm(train_dataloader,desc="Training process:"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        logits = outputs.logits

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        predictions = torch.argmax(logits, dim=-1)
        train_correct += (predictions == batch["labels"]).sum().item()
        train_total += batch["labels"].size(0)

    train_acc = train_correct / train_total

    model.eval()
    test_correct, test_total = 0, 0
    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Testing process:"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits

            # 计算验证准确率
            predictions = torch.argmax(logits, dim=-1)
            test_correct += (predictions == batch["labels"]).sum().item()
            test_total += batch["labels"].size(0)

    test_acc = test_correct / test_total

        
    # print the training process
    print("Epoch {}: train acc = {:.4f}, test acc = {:.4f}".format(epoch + 1, train_acc, test_acc))

# ------------------------------------------------------------------------------------------------------------------------------

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

##### Q1 (c)

In [None]:
chatgpt_generated_news = [
    "In an exciting match last night, the Los Angeles Lakers defeated the Brooklyn Nets 115-110. Lakers' LeBron James made a comeback after missing several games due to injury and scored 25 points while teammate Anthony Davis added 28 points. Nets' star player Kevin Durant scored 32 points but couldn't lead his team to victory.",
    "Scientists have discovered a new species of dinosaur that roamed the earth 80 million years ago. The species, named Almatherium, was found in Uzbekistan and is believed to be an ancestor of the modern-day armadillo. The discovery sheds new light on the evolution of mammals and their relationship with dinosaurs.",
    "The United Nations has called for an immediate ceasefire in Yemen as the country faces a growing humanitarian crisis. The UN's special envoy for Yemen, Martin Griffiths, urged all parties to end the violence and engage in peace talks. The conflict has left millions of Yemenis at risk of famine and disease.",
    "Amazon has announced that it will be opening its first fulfillment center in New Zealand, creating more than 500 new jobs. The center will be located in Auckland and is expected to open in 2022. This move will allow Amazon to expand its operations in the region and improve delivery times for customers.",
]
prediction_label = []

# ------------------------------------------------------------------------------------------------------------------------------
# Write your code here

# test your finetuned model on chatgpt_genreated_news
model.eval()
for news in chatgpt_generated_news:
    inputs = tokenizer(news, return_tensors="pt", truncation=True, padding=True)
    
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=-1).item()
    
    prediction_label.append(predicted_class)


# print the predictions for chatgpt_genreated_news
print(prediction_label)

# ------------------------------------------------------------------------------------------------------------------------------

##### Q1 (d)

In [None]:
# ------------------------------------------------------------------------------------------------------------------------------
# Write your code here

# Define your model. optimizer, hyper-parameter and etc.


for epoch in range(num_epochs):
    #train and evaluate your model

    # print the training process
    print("Epoch {}: train acc = {:.4f}, validation acc = {:.4f}".format(epoch + 1, train_acc, validation_acc))

# ------------------------------------------------------------------------------------------------------------------------------