In [35]:
import gc
import os
import json
import time
from tqdm.notebook import tqdm
import torch
from sklearn.metrics import f1_score
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from transformers import Trainer, TrainingArguments,TrainerCallback, EarlyStoppingCallback
from transformers import DataCollatorWithPadding
from datasets import load_dataset, DatasetDict

In [36]:
# --- Experiment constants -------------------------------------------------
SEED = 42
MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
DATASET_NAME = "stanfordnlp/imdb"

# Prefer the GPU whenever PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# --- Reproducibility: seed every random number generator used below -------
np.random.seed(SEED)              # NumPy global RNG
torch.manual_seed(SEED)           # PyTorch default generator
torch.cuda.manual_seed(SEED)      # current GPU
torch.cuda.manual_seed_all(SEED)  # every visible GPU

# --- Backend switches ------------------------------------------------------
torch.backends.cudnn.benchmark = False        # no auto-tuner: algorithm choice stays fixed
torch.backends.cudnn.deterministic = True     # request deterministic cuDNN kernels
torch.backends.cudnn.allow_tf32 = True        # TensorFloat-32 for cuDNN convolutions
torch.backends.cuda.matmul.allow_tf32 = True  # TensorFloat-32 for matrix multiplications
# torch.autograd.set_detect_anomaly(True)  # debugging aid for NaNs; slow, keep disabled otherwise

print("Using device: ", DEVICE)

Using device:  cuda


# Dataset loading

In [37]:
# Load the dataset identified by DATASET_NAME. The printed summary lists the
# available splits (train / test / unsupervised) with their columns and row counts.
dataset = load_dataset(DATASET_NAME)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [38]:
##%% Prompt Strategies
def truncate_few_shot(example):
    """Wrap a review in a few-shot sentiment prompt.

    The review in ``example['text']`` is split on whitespace and cut to its
    first 50 words so the prompt stays short. Two fixed demonstrations (one
    NEGATIVE, one POSITIVE) precede the review, and the prompt ends with an
    open ``Sentiment:`` slot for the model to complete.

    Args:
        example: Mapping with a ``'text'`` string and a ``'label'`` value.

    Returns:
        dict: ``{'text': <prompt>, 'label': <unchanged label>}``.
    """
    words = example['text'].split()
    snippet = " ".join(words[:50])

    # Instruction, two worked demonstrations, then the review to classify.
    segments = [
        "You are a sentiment classifier. Determine if the following movie reviews are POSITIVE or NEGATIVE.\n\n",
        "Review: The movie was terrible, boring and too long.\n",
        "Sentiment: NEGATIVE\n\n",
        "Review: Absolutely fantastic! I loved every minute of it.\n",
        "Sentiment: POSITIVE\n\n",
        f"Review: {snippet}\n",
        "Sentiment:",
    ]
    return {'text': "".join(segments), 'label': example['label']}

# Shuffle the train split with the fixed seed, keep the 32 examples at positions
# 128-159 of the shuffled order, and wrap each review in the few-shot prompt.
small_few_shot = dataset['train'].shuffle(seed=SEED).select(range(128, 160)).map(truncate_few_shot)
print(small_few_shot)
# Slicing the dataset yields a dict of columns, so this prints the first 10 prompts and labels.
print(small_few_shot[:10])
# NOTE(review): the message says "Test size", but the subset is drawn from the train split.
print(f"Test size: {len(small_few_shot)}")

Dataset({
    features: ['text', 'label'],
    num_rows: 32
})
{'text': ["You are a sentiment classifier. Determine if the following movie reviews are POSITIVE or NEGATIVE.\n\nReview: The movie was terrible, boring and too long.\nSentiment: NEGATIVE\n\nReview: Absolutely fantastic! I loved every minute of it.\nSentiment: POSITIVE\n\nReview: Thomas Clay has been mixing with the wrong types. That's the trouble with young people these days, they have no respect.<br /><br />Seriously this film should be avoided at all costs. The action in the main body of the film is slow and rather stodgy and ambles to the drug\nSentiment:", "You are a sentiment classifier. Determine if the following movie reviews are POSITIVE or NEGATIVE.\n\nReview: The movie was terrible, boring and too long.\nSentiment: NEGATIVE\n\nReview: Absolutely fantastic! I loved every minute of it.\nSentiment: POSITIVE\n\nReview: Emily Watson's Natalia is absolutely the most loving and romantic lead character I have ever seen on

In [39]:
def truncate_zero_shot(example):
    """Wrap a review in a zero-shot sentiment prompt.

    Only the task instruction is given, with no demonstrations. The review in
    ``example['text']`` is split on whitespace and cut to its first 50 words,
    and the prompt ends with an open ``Sentiment:`` slot.

    Args:
        example: Mapping with a ``'text'`` string and a ``'label'`` value.

    Returns:
        dict: ``{'text': <prompt>, 'label': <unchanged label>}``.
    """
    words = example['text'].split()
    snippet = " ".join(words[:50])

    # Instruction followed directly by the review to classify.
    segments = [
        "You are a sentiment classifier. Determine if the following movie reviews are POSITIVE or NEGATIVE.\n\n",
        f"Review: {snippet}\n",
        "Sentiment:",
    ]
    return {'text': "".join(segments), 'label': example['label']}

# Same seed and same index range as the few-shot subset, so both prompt styles
# are evaluated on the same 32 reviews; only the prompt wrapper differs.
small_zero_shot = dataset['train'].shuffle(seed=SEED).select(range(128, 160)).map(truncate_zero_shot)
print(small_zero_shot)
# Slicing the dataset yields a dict of columns, so this prints the first 10 prompts and labels.
print(small_zero_shot[:10])
# NOTE(review): the message says "Test size", but the subset is drawn from the train split.
print(f"Test size: {len(small_zero_shot)}")

Dataset({
    features: ['text', 'label'],
    num_rows: 32
})
{'text': ["You are a sentiment classifier. Determine if the following movie reviews are POSITIVE or NEGATIVE.\n\nReview: Thomas Clay has been mixing with the wrong types. That's the trouble with young people these days, they have no respect.<br /><br />Seriously this film should be avoided at all costs. The action in the main body of the film is slow and rather stodgy and ambles to the drug\nSentiment:", "You are a sentiment classifier. Determine if the following movie reviews are POSITIVE or NEGATIVE.\n\nReview: Emily Watson's Natalia is absolutely the most loving and romantic lead character I have ever seen on a screen. She is the queen of this film beyond all doubt. Or, is she transmuted to the king? The internecine weaving of the chess games and the families' struggles for control, power,\nSentiment:", "You are a sentiment classifier. Determine if the following movie reviews are POSITIVE or NEGATIVE.\n\nReview: This apo

# Tokenizer

In [40]:
# Load the tokenizer published with MODEL_NAME. trust_remote_code=False means no
# custom code from the model repository is executed while loading.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=False)
# The printed repr shows the vocabulary size, padding side and special tokens.
print(tokenizer)

TokenizersBackend(name_or_path='microsoft/Phi-3-mini-4k-instruct', vocab_size=32000, model_max_length=4096, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '<|endoftext|>', 'unk_token': '<unk>', 'pad_token': '<|endoftext|>'}, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=False),
	32000: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32001: AddedToken("<|assistant|>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=True),
	32002: AddedToken("<|placeholder1|>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=True),
	32003: AddedToken("<|placeholder2|>", rstrip=

# Dataset preprocessing

In [41]:
# No separate tokenization/preprocessing step is needed for direct prompting:
# `evaluate_prompting` iterates over the un-tokenized `small_few_shot` and `small_zero_shot`
# datasets, and each prompt is tokenized on the fly. This cell is kept only as a placeholder.

# Model Definition

In [42]:
# Load the model configuration separately so it can be inspected or adjusted
# before the weights are instantiated.
config = AutoConfig.from_pretrained(MODEL_NAME, trust_remote_code=False)
# IMPORTANT: Do not set rope_scaling to None on Phi-3 configs with transformers>=5.
# Doing so will set rope_parameters=None internally and crash the native Phi3 implementation.
# Leave the default as-is; if needed, adjust rope_type explicitly (e.g., to 'linear').
# Example of safe adjustment (commented out):
# if isinstance(config.rope_scaling, dict) and config.rope_scaling.get("rope_type") == "default":
#     config.rope_scaling["rope_type"] = "linear"

# Instantiate the causal language model with the configuration loaded above.
# torch_dtype="auto" and device_map="auto" leave dtype selection and weight
# placement to the library.
# NOTE(review): with device_map="auto" the weights are not guaranteed to end up
# on DEVICE - confirm before moving inputs there.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, 
    config=config,
    trust_remote_code=False, 
    torch_dtype="auto", 
    device_map="auto"
)

def get_sentiment(prompt):
    """Classify a prompt's sentiment from a short generated continuation.

    The prompt is tokenized, the model generates up to 5 new tokens, and the
    continuation is searched (case-insensitively) for the label words.

    Args:
        prompt: Full prompt text, expected to end with ``Sentiment:``.

    Returns:
        int: 1 if ``POSITIVE`` is the first label found in the continuation,
        otherwise 0. A continuation containing neither label also yields 0,
        so unparseable answers are counted as NEGATIVE.
    """
    # Place the inputs where the model's weights live. The model is loaded with
    # device_map="auto", so that is not guaranteed to be the global DEVICE.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[1]

    with torch.no_grad():
        # Greedy decoding keeps the evaluation deterministic.
        outputs = model.generate(**inputs, max_new_tokens=5, do_sample=False)

    # The returned sequence starts with the prompt tokens. Slice them off by
    # token count rather than by len(prompt) characters: decoding does not
    # always reproduce the prompt text exactly, and a shifted character offset
    # could clip the label.
    new_tokens = outputs[0][prompt_token_count:]
    prediction_part = tokenizer.decode(new_tokens, skip_special_tokens=True).strip().upper()

    # If both labels occur (e.g. the model starts a new example), trust the
    # one that appears first.
    positive_at = prediction_part.find("POSITIVE")
    negative_at = prediction_part.find("NEGATIVE")
    if positive_at != -1 and (negative_at == -1 or positive_at < negative_at):
        return 1

    # Explicit NEGATIVE, or fallback when the model produced neither label.
    return 0

def evaluate_prompting(dataset_to_eval):
    """Run prompt-based classification over a dataset and report metrics.

    Each example's ``'text'`` is passed to ``get_sentiment`` and the result is
    compared against the example's ``'label'``. The whole loop is timed.

    Args:
        dataset_to_eval: Sized iterable of examples with ``'text'`` and
            ``'label'`` entries.

    Returns:
        dict: ``accuracy`` (fraction of exact matches), ``f1`` (weighted F1),
        ``runtime`` (seconds spent in the loop) and ``samples_per_second``
        (0 when the measured runtime is not positive).
    """
    predicted, expected = [], []

    # Time only the prediction loop.
    t0 = time.perf_counter()
    for row in tqdm(dataset_to_eval):
        predicted.append(get_sentiment(row['text']))
        expected.append(row['label'])
    elapsed = time.perf_counter() - t0

    throughput = len(dataset_to_eval) / elapsed if elapsed > 0 else 0

    y_pred = np.array(predicted)
    y_true = np.array(expected)

    metrics = {}
    metrics["accuracy"] = np.mean(y_pred == y_true)
    metrics["f1"] = f1_score(y_true, y_pred, average='weighted')
    metrics["runtime"] = elapsed
    metrics["samples_per_second"] = throughput
    return metrics

Loading weights:   0%|          | 0/195 [00:00<?, ?it/s]

# Execution

In [43]:
# Evaluate the few-shot prompts; the result dict holds accuracy, f1, runtime and samples_per_second.
results_few_shot = evaluate_prompting(small_few_shot)

  0%|          | 0/32 [00:00<?, ?it/s]

In [44]:
# Evaluate the zero-shot prompts; the result dict holds accuracy, f1, runtime and samples_per_second.
results_zero_shot = evaluate_prompting(small_zero_shot)

  0%|          | 0/32 [00:00<?, ?it/s]

# Comparison

Note: Prompting with Phi-3 (3.8B parameters) is significantly more computationally expensive at inference time than classifying with a fine-tuned DistilBERT (~66M parameters): the model is far larger, and each prediction involves autoregressive generation of multiple tokens instead of a single forward pass for classification.

In [None]:
# Side-by-side summary of the two prompting strategies.
# Every value is left-aligned in a 15-character column so the data rows line up
# with the header: 30 + 3 + 15 + 3 + 15 = 66 characters, the rule's width.
print("--- FINAL COMPARISON ---")
print(f"Model used: {MODEL_NAME}")
print(f"{'Metric':<30} | {'Few-Shot':<15} | {'Zero-Shot':<15}")
print("-" * 66)
print(f"{'Accuracy':<30} | {results_few_shot['accuracy']:<15.4f} | {results_zero_shot['accuracy']:<15.4f}")
print(f"{'F1 Score':<30} | {results_few_shot['f1']:<15.4f} | {results_zero_shot['f1']:<15.4f}")
print(f"{'Inference Time (s)':<30} | {results_few_shot['runtime']:<15.4f} | {results_zero_shot['runtime']:<15.4f}")
print(f"{'Inference Speed (samples/s)':<30} | {results_few_shot['samples_per_second']:<15.4f} | {results_zero_shot['samples_per_second']:<15.4f}")