In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [3]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("Device:", torch.device("cuda" if torch.cuda.is_available() else "cpu"))


CUDA available: True
Device: cuda


In [6]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [7]:
!pip install transformers datasets evaluate scikit-learn pandas numpy



In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import torch
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    pipeline
)
from datasets import Dataset
import evaluate

2025-05-06 14:36:28.139422: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746542188.162727     123 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746542188.169898     123 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [9]:
# Load dataset directly from Kaggle
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Split into train/test (80/20)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [10]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [11]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [12]:
df.shape

(50000, 2)

In [13]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["review"], truncation=True, padding="max_length", max_length=512)

# Convert pandas DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# ✅ Add labels (important!)
tokenized_train = tokenized_train.add_column("labels", train_dataset["sentiment"])
tokenized_test = tokenized_test.add_column("labels", test_dataset["sentiment"])


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [14]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    num_train_epochs=2,
    fp16=True,
    eval_strategy="epoch",
    logging_strategy="steps",  # 👈 This is the fix
    logging_steps=50,
    dataloader_num_workers=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

trainer.train()

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,0.1903,0.304331
2,0.4663,0.83985




TrainOutput(global_step=5000, training_loss=0.33230808258056643, metrics={'train_runtime': 2562.6939, 'train_samples_per_second': 31.217, 'train_steps_per_second': 1.951, 'total_flos': 1.059739189248e+16, 'train_loss': 0.33230808258056643, 'epoch': 2.0})

### Evaluate Fine-tuned Model

In [15]:
predictions = trainer.predict(tokenized_test)
preds = np.argmax(predictions.predictions, axis=-1)
labels = test_df['sentiment'].values

accuracy = accuracy_score(labels, preds)
f1 = f1_score(labels, preds)

print(f"Fine-tuned Model Accuracy: {accuracy:.2%}")
print(f"Fine-tuned Model F1-Score: {f1:.2f}")

Fine-tuned Model Accuracy: 93.77%
Fine-tuned Model F1-Score: 0.94


### Implement Prompt Engineering
#### Zero-shot Classification with DistilBERT

In [20]:
from transformers import pipeline

# Initialize classifier with truncation
classifier = pipeline(
    "text-classification",
    model="distilbert-base-uncased",
    truncation=True,  # Automatically truncate long inputs
    max_length=512    # Explicitly set max length
)

def predict_with_prompt(review):
    prompt = f"Is this movie review positive or negative? Review: {review[:1000]} Answer:"  # Additional truncation
    try:
        result = classifier(prompt)
        return 1 if result[0]['label'] == 'POSITIVE' else 0
    except Exception as e:
        print(f"Error processing review: {e}")
        return 0  # Default to negative if error occurs

# Evaluate on 500 samples
test_sample = test_df.sample(500, random_state=42)
preds_pe = test_sample['review'].apply(predict_with_prompt)
accuracy_pe = accuracy_score(test_sample['sentiment'], preds_pe)

print(f"Prompt Engineering Accuracy: {accuracy_pe:.2%}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


Prompt Engineering Accuracy: 51.60%


In [24]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load model and tokenizer separately for more control
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", truncation=True, max_length=512)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

def predict_with_prompt(review):
    prompt = f"Is this movie review positive or negative? Review: {review} Answer:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    logits = outputs.logits
    prediction = torch.argmax(logits, dim=-1).item()
    return prediction

# Process in smaller batches
batch_size = 16
preds_pe = []
for i in range(0, len(test_sample), batch_size):
    batch = test_sample.iloc[i:i+batch_size]
    batch_preds = [predict_with_prompt(review) for review in batch['review']]
    preds_pe.extend(batch_preds)

accuracy_pe = accuracy_score(test_sample['sentiment'], preds_pe)
print(f"Prompt Engineering Accuracy: {accuracy_pe:.2%}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Prompt Engineering Accuracy: 48.40%


#### Few-shot Prompting

In [25]:
few_shot_prompt = """
Classify the sentiment of these examples:
1. "This movie was great!" → positive
2. "I hated this film." → negative
3. "{review}" → 
"""

def few_shot_predict(review):
    prompt = few_shot_prompt.format(review=review)
    result = classifier(prompt)
    return 1 if 'positive' in result[0]['label'].lower() else 0

### Comparative Analysis
##### Generate Results Table

In [26]:
results = pd.DataFrame({
    "Approach": ["Fine-tuning", "Prompt Engineering"],
    "Accuracy": [accuracy, accuracy_pe],
    "F1-Score": [f1, f1_score(test_sample['sentiment'], preds_pe)],
    "Training Time": ["~30 mins", "0 mins"],
    "Inference Speed": ["Slower", "Faster"]
})

print(results)

             Approach  Accuracy  F1-Score Training Time Inference Speed
0         Fine-tuning    0.9377  0.937954      ~30 mins          Slower
1  Prompt Engineering    0.4840  0.652291        0 mins          Faster


In [33]:
model.save_pretrained("/kaggle/working/finetuned_distilbert")

In [40]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch

# Initialize with base tokenizer but custom model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/finetuned_distilbert")

# Create pipeline
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

# Test prediction
print(classifier("This movie was fantastic!"))

Device set to use cuda:0


[{'label': 'LABEL_1', 'score': 0.5197713375091553}]


In [None]:
from huggingface_hub import HfApi, create_repo
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import os

# 1. Set your token directly (replace with your actual token)
os.environ["HF_TOKEN"] = ""  # Important!

# 2. Load your model
model = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/finetuned_distilbert")
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/finetuned_distilbert")

# 3. First create the repository
create_repo(
    repo_id="Ofge/finetuned_distilbert",  # e.g., "Ofge/finetuned_distilbert"
    repo_type="model",
    token=os.environ["HF_TOKEN"],
    exist_ok=True
)

# 4. Now push the model and tokenizer
model.push_to_hub(
    "Ofge/finetuned_distilbert",
    token=os.environ["HF_TOKEN"]
)

tokenizer.push_to_hub(
    "Ofge/finetuned_distilbert",
    token=os.environ["HF_TOKEN"]
)

print("Successfully uploaded to Hugging Face Hub!")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Successfully uploaded to Hugging Face Hub!
