In [1]:
!pip install kaggle



In [3]:
# List ~/.kaggle/kaggle.json to confirm the file exists and to show its
# permissions before the Kaggle CLI is invoked.
!ls -l ~/.kaggle/kaggle.json

-rw-------@ 1 sanyuktatuti  staff  70 Apr  8 17:01 /Users/sanyuktatuti/.kaggle/kaggle.json


In [7]:
# Download the competition archive with the Kaggle CLI (-c names the competition).
!kaggle competitions download -c deep-learning-spring-2025-project-2

In [9]:
# Extract the downloaded archive into the current directory:
# -o overwrites existing files without prompting, -q suppresses the file listing.
!unzip -oq deep-learning-spring-2025-project-2.zip

In [5]:
# Sanity check: show the working directory contents after extraction.
import os

print("Files in current directory:", os.listdir())

# The unlabelled test pickle is read by the inference step, so report whether
# it is present.
test_pickle_present = os.path.exists("test_unlabelled.pkl")
status_message = (
    "test_unlabelled.pkl found!"
    if test_pickle_present
    else "test_unlabelled.pkl not found. Please check if the data has been downloaded and unzipped correctly."
)
print(status_message)

Files in current directory: ['deep-learning-spring-2025-project-2.zip', 'Deep-Learning-Project-2(2).ipynb', 'results', 'Deep-Learning-Project-2(1).ipynb', 'test_unlabelled.pkl', 'Deep-Learning-Project-2(3).ipynb', '.ipynb_checkpoints']
test_unlabelled.pkl found!


In [7]:
# =====================================
# 1. Install Required Libraries
# =====================================
!pip install transformers datasets evaluate accelerate peft trl bitsandbytes
!pip install nvidia-ml-py3

import os
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from transformers import (
    RobertaTokenizer, 
    TrainingArguments, 
    Trainer, 
    DataCollatorWithPadding, 
    RobertaForSequenceClassification,
    EarlyStoppingCallback,
)
from peft import LoraConfig, get_peft_model
from datasets import load_dataset, Dataset
import evaluate



In [8]:
# =====================================
# 2. Set Device to Use MPS if Available
# =====================================
def get_device():
    """Select the torch device used by this notebook.

    Returns:
        torch.device: ``"mps"`` when the MPS backend reports availability,
        otherwise ``"cuda"`` when CUDA is available, otherwise ``"cpu"``.
    """
    # Probe the backends in priority order; the first available one wins.
    if torch.backends.mps.is_available():
        backend_name = "mps"
    elif torch.cuda.is_available():
        backend_name = "cuda"
    else:
        backend_name = "cpu"
    return torch.device(backend_name)

# Resolve the device once at notebook level; later cells move the model onto it.
device = get_device()
print(f"Using device: {device}")

Using device: mps


In [9]:
# =====================================
# 3. Load the AG News Dataset and Preprocess
# =====================================
# Checkpoint name shared by the tokenizer here and the model loaded later.
base_model = 'roberta-base'
# Only the AG News training split is loaded; the evaluation set is carved out
# of it in a later cell.
dataset = load_dataset('ag_news', split='train')
# Tokenizer belonging to the same checkpoint as the model.
tokenizer = RobertaTokenizer.from_pretrained(base_model)

def preprocess(examples):
    """Tokenize a batch of examples for the model.

    Args:
        examples: A batch, as passed by ``Dataset.map(..., batched=True)``,
            whose ``'text'`` entry holds the raw strings.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length and left unpadded.
    """
    # Padding is intentionally not applied here. With a batched map,
    # padding=True pads every example to the longest text in its map batch,
    # which inflates sequence lengths for both training and inference.
    # The DataCollatorWithPadding passed to the Trainer and to the inference
    # DataLoader pads each mini-batch dynamically instead.
    return tokenizer(examples['text'], truncation=True)

# Tokenize the whole split in batches, drop the raw text column, and rename
# the target column to "labels".
tokenized_dataset = (
    dataset
    .map(preprocess, batched=True, remove_columns=["text"])
    .rename_column("label", "labels")
)

# Label metadata is read from the dataset's "label" feature.
class_names = dataset.features["label"].names
num_labels = dataset.features['label'].num_classes
print(f"Number of labels: {num_labels}")
print(f"Class names: {class_names}")

# Index -> class-name mapping, plus the collator that pads batches into
# PyTorch tensors.
id2label = dict(enumerate(class_names))
data_collator = DataCollatorWithPadding(return_tensors="pt", tokenizer=tokenizer)

Number of labels: 4
Class names: ['World', 'Sports', 'Business', 'Sci/Tech']


In [10]:
from datasets import load_dataset

# Reload the AG News training split so its raw contents can be inspected.
dataset = load_dataset('ag_news', split='train')

# Column names first, then the feature schema.
print("Column names:", dataset.column_names)
print("\nDataset features:")
print(dataset.features)

# Preview the leading rows of the split.
print("\nFirst 5 examples:")
for example_idx in range(5):
    example = dataset[example_idx]
    print(f"Example {example_idx}: {example}")

Column names: ['text', 'label']

Dataset features:
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'], id=None)}

First 5 examples:
Example 0: {'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2}
Example 1: {'text': 'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.', 'label': 2}
Example 2: {'text': "Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums.", 'label': 2}
Example 3: {'text': 'Iraq Halts Oil Exports from Ma

In [11]:
# =====================================
# 4. Load Pre-trained RoBERTa Model & Apply LoRA
# =====================================
# Instantiate the sequence classifier from the base checkpoint with the
# AG News label count and index -> name mapping.
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    num_labels=num_labels,
    id2label=id2label,
)
model.to(device)
print("Base model loaded and moved to device.")

# LoRA adapter settings: adapters are attached to the modules named
# "query" and "key"; bias terms are left untouched.
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=["query", "key"],
    r=11,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters and keep it on the selected device.
peft_model = get_peft_model(model, peft_config)
peft_model.to(device)
print("\nPEFT Model with LoRA created. Trainable parameters:")
peft_model.print_trainable_parameters()

# Independent count of the parameters that will receive gradients.
trainable_params = sum(
    weight.numel()
    for weight in peft_model.parameters()
    if weight.requires_grad
)
print(f"Total trainable parameters: {trainable_params}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Base model loaded and moved to device.
'NoneType' object has no attribute 'cadam32bit_grad_fp32'

PEFT Model with LoRA created. Trainable parameters:
trainable params: 999,172 || all params: 125,647,880 || trainable%: 0.7952
Total trainable parameters: 999172


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [12]:
# =====================================
# 5. Create Train and Evaluation Splits
# =====================================
# Hold out 640 tokenized examples for evaluation; the seed is fixed at 42.
split_datasets = tokenized_dataset.train_test_split(seed=42, test_size=640)
train_dataset, eval_dataset = split_datasets['train'], split_datasets['test']
print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")

Train dataset size: 119360
Eval dataset size: 640


In [13]:
# =====================================
# 6. Training Setup with Trainer
# =====================================
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Compute accuracy for the Trainer's evaluation loop.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, reduced here
            with an argmax over the last axis) and ``label_ids`` (the
            reference labels).

    Returns:
        dict: ``{'accuracy': value}`` where ``value`` comes from
        ``accuracy_score``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

# Directory for Trainer checkpoints; the inference cell also writes its CSV here.
output_dir = "results"
training_args = TrainingArguments(
    output_dir=output_dir,
    # The string "none" disables every logging integration. Passing None does
    # not: None means "use the library default".
    report_to="none",
    # Evaluate on a step schedule rather than per epoch.
    eval_strategy='steps',
    logging_steps=100,
    learning_rate=9e-4,
    num_train_epochs=2,
    #max_steps=1200,
    # Restore the checkpoint with the lowest evaluation loss when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    optim="adamw_hf",
    # Gradient checkpointing is switched off; the kwargs below are only passed
    # along with that setting.
    gradient_checkpointing=False,
    gradient_checkpointing_kwargs={'use_reentrant': True}
)

def get_trainer(model):
    """Build a ``Trainer`` for ``model`` from the notebook-level training setup.

    Args:
        model: The model to train and evaluate.

    Returns:
        Trainer: Configured with the notebook's ``training_args``, train/eval
        datasets, data collator, ``compute_metrics`` and an early-stopping
        callback with a patience of 10.
    """
    early_stopping = EarlyStoppingCallback(early_stopping_patience=10)
    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping],
    )
    return Trainer(**trainer_kwargs)

# Build the Trainer around the LoRA-wrapped model.
trainer = get_trainer(peft_model)

print("\nStarting training...")
# The value returned by train() is kept in train_result for later inspection.
train_result = trainer.train()
print("Training complete.")


Starting training...




Step,Training Loss,Validation Loss,Accuracy
100,0.6661,0.373921,0.895312
200,0.3528,0.357309,0.904687
300,0.3497,0.308506,0.898438
400,0.2822,0.335997,0.89375
500,0.3044,0.304992,0.903125
600,0.2792,0.300651,0.901563
700,0.2811,0.277787,0.903125
800,0.2948,0.268388,0.914062
900,0.281,0.265158,0.917188
1000,0.2647,0.269578,0.910937


Training complete.


In [20]:
# =====================================
# 7. Evaluate the Model
# =====================================
print("\nEvaluating on validation set...")
# evaluate() is called without arguments, so it scores the eval_dataset the
# Trainer was constructed with.
eval_metrics = trainer.evaluate()
print(f"Evaluation metrics: {eval_metrics}")


Evaluating on validation set...


Evaluation metrics: {'eval_loss': 0.20119670033454895, 'eval_accuracy': 0.9375, 'eval_runtime': 9.1448, 'eval_samples_per_second': 69.985, 'eval_steps_per_second': 1.094, 'epoch': 1.0455764075067024}


In [21]:
# =====================================
# 8. Inference on Unlabeled Test Data
# =====================================
def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """Run batched inference and optionally score it against reference labels.

    Args:
        inference_model: Model called as ``inference_model(**batch)`` whose
            output exposes ``.logits``.
        dataset: Dataset wrapped in a ``DataLoader``. When ``labelled`` is
            true, every collated batch must contain a ``"labels"`` entry.
        labelled: Whether to accumulate accuracy against ``batch["labels"]``.
        batch_size: Number of examples per inference batch.
        data_collator: Optional ``collate_fn`` handed to the ``DataLoader``.

    Returns:
        ``(metric_dict, predictions)`` when ``labelled`` is true, otherwise
        only ``predictions``. ``predictions`` is a CPU tensor holding the
        argmax class index of every example, concatenated across batches
        (an empty tensor when the dataset yields no batches).
    """
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    current_device = get_device()
    inference_model.to(current_device)
    inference_model.eval()

    # The accuracy metric is only needed when references are available.
    metric = evaluate.load('accuracy') if labelled else None

    batch_predictions = []
    for batch in tqdm(eval_dataloader):
        # Move every tensor of the batch to the device the model lives on.
        batch = {k: v.to(current_device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**batch)
        preds = outputs.logits.argmax(dim=-1).cpu()
        batch_predictions.append(preds)
        if labelled:
            metric.add_batch(
                predictions=preds.numpy(),
                references=batch["labels"].cpu().numpy()
            )

    # torch.cat raises on an empty list, so an empty dataset is mapped to an
    # empty prediction tensor instead of crashing.
    if batch_predictions:
        all_predictions = torch.cat(batch_predictions, dim=0)
    else:
        all_predictions = torch.empty(0, dtype=torch.long)

    if labelled:
        eval_metric = metric.compute()
        return eval_metric, all_predictions
    return all_predictions

# Score the model on the held-out split and report the result instead of
# discarding it (assigning to `_` would also clobber IPython's last-output
# variable).
eval_metric, eval_preds = evaluate_model(peft_model, eval_dataset, labelled=True, batch_size=8, data_collator=data_collator)
print(f"Validation metrics: {eval_metric}")

# -------------------------------------
# Inference on Unlabelled Test Data
# -------------------------------------
print("\nRunning inference on test_unlabelled.pkl...")
# NOTE(security): unpickling can execute arbitrary code. Only load this file
# when it comes from a trusted source.
unlabelled_obj = pd.read_pickle("test_unlabelled.pkl")
print("Type of unlabelled_obj:", type(unlabelled_obj))

# The pickle may hold either a pandas DataFrame or an object that is already
# usable as a dataset; normalise the DataFrame case.
if isinstance(unlabelled_obj, pd.DataFrame):
    unlabelled_dataset = Dataset.from_pandas(unlabelled_obj)
else:
    unlabelled_dataset = unlabelled_obj

# Apply tokenization preprocessing
test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])

# Run inference (without expecting labels)
preds = evaluate_model(peft_model, test_dataset, labelled=False, batch_size=8, data_collator=data_collator)

# One row per prediction; IDs are the positional indices of the test examples.
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()
})

# Make sure the output directory exists so this cell also works when no
# training run has created it yet.
os.makedirs(output_dir, exist_ok=True)
submission_file = os.path.join(output_dir, "inference_output.csv")
df_output.to_csv(submission_file, index=False)
print(f"Inference complete. Predictions saved to {submission_file}")

100%|███████████████████████████████████████████| 80/80 [00:11<00:00,  6.75it/s]



Running inference on test_unlabelled.pkl...
Type of unlabelled_obj: <class 'datasets.arrow_dataset.Dataset'>


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

100%|███████████████████████████████████████| 1000/1000 [01:13<00:00, 13.57it/s]

Inference complete. Predictions saved to results/inference_output.csv



