In [9]:
import gc
import os
import json
import time
from tqdm.notebook import tqdm
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from transformers import Trainer, TrainingArguments,TrainerCallback, EarlyStoppingCallback
from transformers import DataCollatorWithPadding
from datasets import load_dataset, DatasetDict

In [10]:
# --- Experiment constants ---------------------------------------------------
SEED = 42
MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
DATASET_NAME = "stanfordnlp/imdb"
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- Reproducibility: seed NumPy and every torch generator -------------------
np.random.seed(SEED)
torch.manual_seed(SEED)           # torch default generator
torch.cuda.manual_seed(SEED)      # current CUDA device
torch.cuda.manual_seed_all(SEED)  # all CUDA devices

# --- cuDNN / matmul backend switches ------------------------------------------
torch.backends.cudnn.benchmark = False      # no auto-tuner: algorithm choice stays fixed between runs
torch.backends.cudnn.deterministic = True   # ask cuDNN for deterministic algorithms
torch.backends.cudnn.allow_tf32 = True      # TensorFloat-32 allowed for cuDNN convolutions
torch.backends.cuda.matmul.allow_tf32 = True  # TensorFloat-32 allowed for matmuls

# Debug aid, intentionally left disabled because it slows everything down;
# enable only when hunting NaNs:
# torch.autograd.set_detect_anomaly(True)

print("Using device: ", DEVICE)

Using device:  cuda


# Dataset loading

In [11]:
# Load the dataset identified by DATASET_NAME. No split is requested, so the
# result is a DatasetDict holding every split (train / test / unsupervised).
dataset = load_dataset(DATASET_NAME)
# Print the split names, columns and row counts as a quick sanity check.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [12]:
def truncate_few_shot(example, max_words=128):
    """Build a few-shot sentiment prompt from one raw review.

    The review is split on whitespace, cut to its first ``max_words`` words and
    re-joined with single spaces (so newlines/tabs in the review collapse to
    spaces). It is then appended to a fixed instruction plus two worked
    examples (one NEGATIVE, one POSITIVE). The prompt ends with an open
    ``Sentiment:`` slot for the model to complete.

    NOTE: the text is otherwise passed through unchanged, so markup already
    present in the review (e.g. ``<br />``) ends up in the prompt.

    Args:
        example: Mapping with a ``'text'`` string and a ``'label'`` value.
        max_words: Maximum number of whitespace-separated words of the review
            to keep. ``None`` keeps the whole review. Defaults to 128.

    Returns:
        dict with ``'text'`` replaced by the prompt and ``'label'`` copied
        from the input.

    Raises:
        ValueError: If ``max_words`` is negative (a negative slice bound would
            silently drop words from the *end* of the review instead).
    """
    if max_words is not None and max_words < 0:
        raise ValueError(f"max_words must be non-negative or None, got {max_words}")
    review_segment = " ".join(example['text'].split()[:max_words])
    prompt = (
        "You are a sentiment classifier. Determine if the following movie reviews are POSITIVE or NEGATIVE.\n\n"
        "Review: The movie was terrible, boring and too long.\n"
        "Sentiment: NEGATIVE\n\n"
        "Review: Absolutely fantastic! I loved every minute of it.\n"
        "Sentiment: POSITIVE\n\n"
        f"Review: {review_segment}\n"
        "Sentiment:"
    )
    return {'text': prompt, 'label': example['label']}

# Evaluation subset: shuffle the test split with the fixed SEED and keep the
# first 250 rows, so the same reviews are selected on every run.
small_test_dataset = dataset['test'].shuffle(seed=SEED).select(range(250))
print(small_test_dataset)

# Replace each review's 'text' with its few-shot prompt; 'label' is carried over.
small_few_shot = small_test_dataset.map(truncate_few_shot)
print(small_few_shot)
# Peek at the first two prompts to eyeball the formatting.
print(small_few_shot[:2])
print(f"Test size: {len(small_few_shot)}")

Dataset({
    features: ['text', 'label'],
    num_rows: 250
})
Dataset({
    features: ['text', 'label'],
    num_rows: 250
})
{'text': ["You are a sentiment classifier. Determine if the following movie reviews are POSITIVE or NEGATIVE.\n\nReview: The movie was terrible, boring and too long.\nSentiment: NEGATIVE\n\nReview: Absolutely fantastic! I loved every minute of it.\nSentiment: POSITIVE\n\nReview: <br /><br />When I unsuspectedly rented A Thousand Acres, I thought I was in for an entertaining King Lear story and of course Michelle Pfeiffer was in it, so what could go wrong?<br /><br />Very quickly, however, I realized that this story was about A Thousand Other Things besides just Acres. I started crying and couldn't stop until long after the movie ended. Thank you Jane, Laura and Jocelyn, for bringing us such a wonderfully subtle and compassionate movie! Thank you cast, for being involved and portraying the characters with such depth and gentleness!<br /><br />I recognized the A

In [13]:
def truncate_zero_shot(example, max_words=128):
    """Build a zero-shot sentiment prompt from one raw review.

    The review is split on whitespace, cut to its first ``max_words`` words and
    re-joined with single spaces (so newlines/tabs in the review collapse to
    spaces). It is then appended to a fixed instruction with no worked
    examples. The prompt ends with an open ``Sentiment:`` slot for the model
    to complete.

    NOTE: the text is otherwise passed through unchanged, so markup already
    present in the review (e.g. ``<br />``) ends up in the prompt.

    Args:
        example: Mapping with a ``'text'`` string and a ``'label'`` value.
        max_words: Maximum number of whitespace-separated words of the review
            to keep. ``None`` keeps the whole review. Defaults to 128.

    Returns:
        dict with ``'text'`` replaced by the prompt and ``'label'`` copied
        from the input.

    Raises:
        ValueError: If ``max_words`` is negative (a negative slice bound would
            silently drop words from the *end* of the review instead).
    """
    if max_words is not None and max_words < 0:
        raise ValueError(f"max_words must be non-negative or None, got {max_words}")
    review_segment = " ".join(example['text'].split()[:max_words])
    prompt = (
        "You are a sentiment classifier. Determine if the following movie reviews are POSITIVE or NEGATIVE.\n\n"
        f"Review: {review_segment}\n"
        "Sentiment:"
    )
    return {'text': prompt, 'label': example['label']}

# Same 250-review subset as the few-shot run, but with zero-shot prompts, so
# the two prompting styles are compared on identical reviews.
small_zero_shot = small_test_dataset.map(truncate_zero_shot)
print(small_zero_shot)
# Peek at the first two prompts to eyeball the formatting.
print(small_zero_shot[:2])
print(f"Test size: {len(small_zero_shot)}")

Dataset({
    features: ['text', 'label'],
    num_rows: 250
})
{'text': ["You are a sentiment classifier. Determine if the following movie reviews are POSITIVE or NEGATIVE.\n\nReview: <br /><br />When I unsuspectedly rented A Thousand Acres, I thought I was in for an entertaining King Lear story and of course Michelle Pfeiffer was in it, so what could go wrong?<br /><br />Very quickly, however, I realized that this story was about A Thousand Other Things besides just Acres. I started crying and couldn't stop until long after the movie ended. Thank you Jane, Laura and Jocelyn, for bringing us such a wonderfully subtle and compassionate movie! Thank you cast, for being involved and portraying the characters with such depth and gentleness!<br /><br />I recognized the Angry sister; the Runaway sister and the sister in Denial. I recognized the Abusive Husband and why he was there and then the Father, oh oh the Father... all superbly\nSentiment:", "You are a sentiment classifier. Determine 

# Tokenizer

In [14]:
# Load the tokenizer published with MODEL_NAME.
# NOTE(review): trust_remote_code=True here, while the model is later loaded
# with trust_remote_code=False — confirm the flag is actually needed.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Batched generation needs a pad token; fall back to EOS if none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Pad on the left so every prompt ends at the same position and the generated
# tokens follow it directly (get_sentiment_batch slices them off by input length).
tokenizer.padding_side = "left" # LLMs usually need left padding for generation
print(tokenizer)



TokenizersBackend(name_or_path='microsoft/Phi-3-mini-4k-instruct', vocab_size=32000, model_max_length=4096, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '<|endoftext|>', 'unk_token': '<unk>', 'pad_token': '<|endoftext|>'}, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=False),
	32000: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32001: AddedToken("<|assistant|>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=True),
	32002: AddedToken("<|placeholder1|>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=True),
	32003: AddedToken("<|placeholder2|>", rstrip=

# Dataset Preprocessing

In [15]:
def tokenize_fn(examples):
    """Tokenize the 'text' column of a batch with the notebook-level tokenizer.

    Truncation is enabled; no padding is applied here (batches are padded
    later, at collation time).
    """
    prompts = examples['text']
    encoded = tokenizer(prompts, truncation=True)
    return encoded

# Few-shot prompts: tokenize in batches of 16, expose the target under the
# name 'labels', drop the raw prompt string, and hand back torch tensors.
small_few_shot_tokenized = (
    small_few_shot
    .map(tokenize_fn, batched=True, batch_size=16)
    .rename_column("label", "labels")
    .remove_columns(["text"])
)
small_few_shot_tokenized.set_format("torch")

# Zero-shot prompts: identical pipeline.
small_zero_shot_tokenized = (
    small_zero_shot
    .map(tokenize_fn, batched=True, batch_size=16)
    .rename_column("label", "labels")
    .remove_columns(["text"])
)
small_zero_shot_tokenized.set_format("torch")

# Sanity check: first two tokenized few-shot rows.
print(small_few_shot_tokenized[0:2])

{'labels': tensor([1, 1]), 'input_ids': [tensor([  887,   526,   263, 19688,   770,  3709, 29889,  5953,   837,   457,
          565,   278,  1494, 14064, 21804,   526,   349,  3267,  1806, 18474,
          470,   405, 11787,  1299, 18474, 29889,    13,    13,  1123,  1493,
        29901,   450, 14064,   471, 16403, 29892,   289,  8253,   322,  2086,
         1472, 29889,    13, 29903,   296,  2073, 29901,   405, 11787,  1299,
        18474,    13,    13,  1123,  1493, 29901,  1976,  2929, 11579, 13568,
         6288, 29991,   306, 18012,  1432, 11015,   310,   372, 29889,    13,
        29903,   296,  2073, 29901,   349,  3267,  1806, 18474,    13,    13,
         1123,  1493, 29901,   529,  1182,  2900, 29966,  1182,  2900, 10401,
          306,  9644,   375,  6021,   368,   364, 14927,   319,   498,   681,
          392,  7255,   690, 29892,   306,  2714,   306,   471,   297,   363,
          385, 22684,   292,  4088, 19530,  5828,   322,   310,  3236,  3375,
         1808,   349,  

# Model Definition

In [16]:
# Fetch the checkpoint's configuration on its own so it can be inspected (or
# carefully adjusted, see below) before the model is instantiated.
config = AutoConfig.from_pretrained(MODEL_NAME, trust_remote_code=True)
# IMPORTANT: Do not set rope_scaling to None on Phi-3 configs with transformers>=5.
# Doing so will set rope_parameters=None internally and crash the native Phi3 implementation.
# Leave the default as-is; if needed, adjust rope_type explicitly (e.g., to 'linear').
# Example of safe adjustment (commented out):
# if isinstance(config.rope_scaling, dict) and config.rope_scaling.get("rope_type") == "default":
#     config.rope_scaling["rope_type"] = "linear"

# Instantiate the causal LM from MODEL_NAME with the config above and move it to DEVICE.
# trust_remote_code=False selects the implementation bundled with transformers
# (the "native Phi3 implementation" mentioned above).
# NOTE(review): the config is fetched with trust_remote_code=True but the model
# with False — confirm the mix is intentional.
# torch_dtype="auto" leaves the dtype choice to the library — presumably the
# dtype stored in the checkpoint; verify against the installed transformers version.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    config=config,
    trust_remote_code=False,
    torch_dtype="auto"
).to(DEVICE)

def _parse_sentiment(decoded):
    """Map one decoded generation to 1 (POSITIVE), 0 (NEGATIVE) or None (no label found).

    Matching is case-insensitive. If both words occur, the one that appears
    first wins, since it is the model's direct answer to the open
    ``Sentiment:`` slot; anything after it is continuation text.
    """
    text = decoded.strip().upper()
    pos_at = text.find("POSITIVE")
    neg_at = text.find("NEGATIVE")
    if pos_at == -1 and neg_at == -1:
        return None
    if neg_at == -1:
        return 1
    if pos_at == -1:
        return 0
    return 1 if pos_at < neg_at else 0


def get_sentiment_batch(batch_input_ids, batch_attention_mask):
    """Generate a short completion for each prompt in the batch and map it to a label.

    Args:
        batch_input_ids: Padded tensor of prompt token ids, one row per prompt.
        batch_attention_mask: Attention mask matching ``batch_input_ids``.

    Returns:
        list[int]: One prediction per row, 1 for POSITIVE and 0 for NEGATIVE.
        Generations containing neither word fall back to 0 (as before), but a
        warning now reports how many rows were affected so that parse failures
        are not silently scored as NEGATIVE.
    """
    import warnings  # local import: keeps this cell runnable without touching the import cell

    # batch_input_ids and batch_attention_mask are already padded tensors
    inputs = {
        'input_ids': batch_input_ids.to(DEVICE),
        'attention_mask': batch_attention_mask.to(DEVICE)
    }
    # Every row has the same (padded) length, so the generated tokens start at
    # this column for all rows.
    input_length = inputs['input_ids'].shape[1]
    with torch.no_grad():
        # return_dict_in_generate=True lets us easily separate input from output
        outputs = model.generate(
            **inputs,
            max_new_tokens=5,
            # Pin greedy decoding so the evaluation does not depend on whatever
            # sampling settings the checkpoint's generation config carries.
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            return_dict_in_generate=True,
            output_scores=False
        )
    # Extract only the generated tokens
    generated_tokens = outputs.sequences[:, input_length:]
    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

    parsed = [_parse_sentiment(decoded) for decoded in decoded_preds]
    n_unparsed = sum(label is None for label in parsed)
    if n_unparsed:
        warnings.warn(
            f"{n_unparsed} of {len(parsed)} generations contained neither POSITIVE nor NEGATIVE; "
            "falling back to NEGATIVE (0) for them."
        )
    # Fallback label for unparseable generations stays 0 for caller compatibility.
    return [0 if label is None else label for label in parsed]

def evaluate_prompting(dataset_to_eval, batch_size=16):
    """Run prompted classification over a tokenized dataset and score it.

    Args:
        dataset_to_eval: Tokenized dataset whose rows provide 'input_ids',
            'attention_mask' and 'labels'.
        batch_size: Number of prompts generated per batch.

    Returns:
        dict with 'accuracy', weighted 'f1', wall-clock 'runtime' in seconds
        and 'samples_per_second'.
    """
    def _wait_for_gpu():
        # Drain pending CUDA work so the wall-clock window brackets the
        # generation loop only (comparable to Trainer.predict timing).
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    # The collator pads each batch on the fly; the DataLoader does the batching.
    loader = DataLoader(
        dataset_to_eval,
        batch_size=batch_size,
        collate_fn=DataCollatorWithPadding(tokenizer=tokenizer),
        pin_memory=True,
    )

    predicted, expected = [], []

    _wait_for_gpu()
    tic = time.perf_counter()
    for batch in tqdm(loader):
        predicted.extend(get_sentiment_batch(batch['input_ids'], batch['attention_mask']))
        expected.extend(batch['labels'].tolist())
    _wait_for_gpu()
    duration = time.perf_counter() - tic

    if duration > 0:
        throughput = len(dataset_to_eval) / duration
    else:
        throughput = 0

    y_pred = np.array(predicted)
    y_true = np.array(expected)
    return {
        "accuracy": np.mean(y_pred == y_true),
        "f1": f1_score(y_true, y_pred, average='weighted'),
        "runtime": duration,
        "samples_per_second": throughput
    }

Loading weights:   0%|          | 0/195 [00:00<?, ?it/s]

# Execution

In [17]:
# Score the few-shot prompts (default batch size of 16); returns accuracy, F1 and timing.
results_few_shot = evaluate_prompting(small_few_shot_tokenized)

  0%|          | 0/16 [00:00<?, ?it/s]

In [18]:
# Score the zero-shot prompts on the same 250 reviews for a like-for-like comparison.
results_zero_shot = evaluate_prompting(small_zero_shot_tokenized)

  0%|          | 0/16 [00:00<?, ?it/s]

# Comparison

Note: Prompting Phi-3 (3.8B parameters) is significantly more computationally expensive at inference time than a classic fine-tuned DistilBERT classifier (~66M parameters): the model is more than 50× larger, and each prediction involves autoregressive generation of multiple tokens instead of a single forward pass for classification.

In [19]:
# Side-by-side summary of the two prompting strategies.
# Every value cell is padded to the same 15-character width as its header, so
# the columns line up regardless of how many digits a metric has
# (total row width: 30 + 3 + 15 + 3 + 15 = 66, matching the separator).
print("--- FINAL COMPARISON ---")
print(f"Model used: {MODEL_NAME}")
print(f"{'Metric':<30} | {'Few-Shot':<15} | {'Zero-Shot':<15}")
print("-" * 66)
for _row_label, _metric_key in (
    ("Accuracy", "accuracy"),
    ("F1 Score", "f1"),
    ("Inference Time (s)", "runtime"),
    ("Inference Speed (samples/s)", "samples_per_second"),
):
    print(
        f"{_row_label:<30} | "
        f"{results_few_shot[_metric_key]:<15.4f} | "
        f"{results_zero_shot[_metric_key]:<15.4f}"
    )

--- FINAL COMPARISON ---
Model used: microsoft/Phi-3-mini-4k-instruct
Metric                         | Few-Shot        | Zero-Shot      
------------------------------------------------------------------
Accuracy                       | 0.8960  | 0.7800
F1 Score                       | 0.8958  | 0.7679
Inference Time (s)             | 19.1458  | 12.2918
Inference Speed (samples/s)    | 13.0577  | 20.3388
