In [3]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW # Changed import to use torch.optim.AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [None]:
!pip install transformers



In [6]:
!wget -O Final-dataset.csv "https://www.dropbox.com/scl/fi/v7olloa8to9ixjvp3my2l/Final-dataset.csv?rlkey=zk1aasrpcaop79cfgogufs76q&st=jf24beip&dl=0"


--2026-01-16 18:24:18--  https://www.dropbox.com/scl/fi/v7olloa8to9ixjvp3my2l/Final-dataset.csv?rlkey=zk1aasrpcaop79cfgogufs76q&st=jf24beip&dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.2.18, 2620:100:6021:18::a27d:4112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.2.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://uccc398edb3f9e9ef4edb5e12e23.dl.dropboxusercontent.com/cd/0/inline/C5H3FgfdtXfEg1YDoPG3VI05rruK4v_Y24xPJdaifaFOzU3ECBuYf2uB36vHaPnfTler99nOSijJpM_u6JvousADGATqwy9L2Bplbsj0ejDBfY6PfZ3tmkuUeP41DQThwLncDkRZ9fTfuXq-v-bN5c-p/file# [following]
--2026-01-16 18:24:19--  https://uccc398edb3f9e9ef4edb5e12e23.dl.dropboxusercontent.com/cd/0/inline/C5H3FgfdtXfEg1YDoPG3VI05rruK4v_Y24xPJdaifaFOzU3ECBuYf2uB36vHaPnfTler99nOSijJpM_u6JvousADGATqwy9L2Bplbsj0ejDBfY6PfZ3tmkuUeP41DQThwLncDkRZ9fTfuXq-v-bN5c-p/file
Resolving uccc398edb3f9e9ef4edb5e12e23.dl.dropboxusercontent.com (uccc398edb3f9e9ef4edb5e12e23.dl.dropboxusercontent.com).

In [None]:
torch.manual_seed(42)
np.random.seed(42)

In [None]:
df = pd.read_csv('/content/Final-dataset.csv')

In [None]:
print(df.head())

                                 sentence  label
0  please kalke assignment ta submit koro      1
1            vai eita ektu check kore dio      1
2                        urgent kotha ase      1
3             client already wait kortese      1
4     oi link ta open korish na virus ase      1


In [None]:
shuffled_df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
print(shuffled_df.head())

                                     sentence  label
0                         Math ta bujhiye de.      1
1                 Please amake ektu help kor.      1
2    ei report ta ami download korte parbo na      0
3  dude joldi koire felo na hole problem hobe      1
4                     exam er por ghure ashbo      0


In [None]:
df = shuffled_df.copy()

In [None]:
# data splitting

from sklearn.model_selection import train_test_split

# Step 1: Split into train (70%) and temp (30%)
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['sentence'].tolist(),
    df['label'].tolist(),
    test_size=0.3,
    stratify=df['label'],
    random_state=42,

)

# Step 2: Split temp into validation (15%) and test (15%)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    stratify=train_val_labels,
    random_state=42,
)

In [None]:
print(f"\nTrain set: {len(train_texts)} samples")
print(f"Validation set: {len(val_texts)} samples")
print(f"Validation set: {len(test_texts)} samples")


Train set: 6157 samples
Validation set: 1320 samples
Validation set: 1320 samples


In [None]:
# tokenization, and batching during training


class ImperativeDataset(Dataset):

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Tokenize the text
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Teacher Model (Large - better accuracy)
teacher_model_name = 'bert-base-uncased'
print(f"\nTeacher Model: {teacher_model_name}")
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_model_name)
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    teacher_model_name,
    num_labels=2
).to(device)

# Student Model (Small - efficient)
student_model_name = 'prajjwal1/bert-tiny'
print(f"Student Model: {student_model_name}")
student_tokenizer = AutoTokenizer.from_pretrained(student_model_name)
student_model = AutoModelForSequenceClassification.from_pretrained(
    student_model_name,
    num_labels=2
).to(device)

print(f"\nTeacher parameters: {sum(p.numel() for p in teacher_model.parameters()):,}")
print(f"Student parameters: {sum(p.numel() for p in student_model.parameters()):,}")
print(f"Compression ratio: {sum(p.numel() for p in teacher_model.parameters()) / sum(p.numel() for p in student_model.parameters()):.2f}x")

Using device: cuda

Teacher Model: bert-base-uncased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Student Model: prajjwal1/bert-tiny


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Teacher parameters: 109,483,778
Student parameters: 4,386,178
Compression ratio: 24.96x


In [None]:
print("\nWhy: The teacher needs to be trained first so it can provide 'soft labels'")
print("(probability distributions) that contain richer information than hard labels.\n")

# Create datasets and dataloaders for teacher
teacher_train_dataset = ImperativeDataset(train_texts, train_labels, teacher_tokenizer)
teacher_val_dataset = ImperativeDataset(val_texts, val_labels, teacher_tokenizer)
teacher_test_dataset = ImperativeDataset(test_texts, test_labels, teacher_tokenizer)

teacher_train_loader = DataLoader(teacher_train_dataset, batch_size=16, shuffle=True)
teacher_val_loader = DataLoader(teacher_val_dataset, batch_size=16, shuffle=False)
teacher_test_loader = DataLoader(teacher_test_dataset, batch_size=16, shuffle=False)

# Training configuration for teacher
teacher_epochs = 5
teacher_optimizer = AdamW(teacher_model.parameters(), lr=2e-5)
teacher_scheduler = get_linear_schedule_with_warmup(
    teacher_optimizer,
    num_warmup_steps=0,
    num_training_steps=len(teacher_train_loader) * teacher_epochs
)

def train_teacher(model, train_loader, optimizer, scheduler, device):
    """Train the teacher model with standard cross-entropy loss."""
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc="Training Teacher"):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    return total_loss / len(train_loader)

def evaluate_model(model, val_loader, device):
    """Evaluate model accuracy."""
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    return accuracy, predictions, true_labels

# Train teacher model
print("\nTraining teacher model...")
for epoch in range(teacher_epochs):
    train_loss = train_teacher(teacher_model, teacher_train_loader, teacher_optimizer, teacher_scheduler, device)
    val_accuracy, _, _ = evaluate_model(teacher_model, teacher_val_loader, device)
    print(f"Epoch {epoch+1}/{teacher_epochs} - Loss: {train_loss:.4f} - Val Accuracy: {val_accuracy:.4f}")

print("\nTeacher model training completed!")


Why: The teacher needs to be trained first so it can provide 'soft labels'
(probability distributions) that contain richer information than hard labels.


Training teacher model...


Training Teacher: 100%|██████████| 385/385 [02:18<00:00,  2.79it/s]
Evaluating: 100%|██████████| 83/83 [00:09<00:00,  8.40it/s]


Epoch 1/5 - Loss: 0.2009 - Val Accuracy: 0.9682


Training Teacher: 100%|██████████| 385/385 [02:14<00:00,  2.87it/s]
Evaluating: 100%|██████████| 83/83 [00:09<00:00,  8.83it/s]


Epoch 2/5 - Loss: 0.0557 - Val Accuracy: 0.9727


Training Teacher: 100%|██████████| 385/385 [02:13<00:00,  2.88it/s]
Evaluating: 100%|██████████| 83/83 [00:09<00:00,  8.81it/s]


Epoch 3/5 - Loss: 0.0311 - Val Accuracy: 0.9735


Training Teacher: 100%|██████████| 385/385 [02:11<00:00,  2.93it/s]
Evaluating: 100%|██████████| 83/83 [00:09<00:00,  8.90it/s]


Epoch 4/5 - Loss: 0.0155 - Val Accuracy: 0.9712


Training Teacher: 100%|██████████| 385/385 [02:11<00:00,  2.92it/s]
Evaluating: 100%|██████████| 83/83 [00:09<00:00,  8.89it/s]

Epoch 5/5 - Loss: 0.0091 - Val Accuracy: 0.9705

Teacher model training completed!





In [None]:
print("STEP 5: DEFINING DISTILLATION LOSS")
print("="*80)
print("\nWhy: Knowledge distillation uses a special loss that combines:")
print("1. Soft targets from teacher (captures uncertainty and relationships)")
print("2. Hard labels from ground truth (ensures correctness)")
print("3. Temperature parameter (controls softness of probability distribution)\n")

class DistillationLoss(nn.Module):
    """
    Knowledge Distillation Loss combines:
    - KL divergence between student and teacher soft predictions
    - Cross-entropy loss with true labels

    Temperature (T): Higher T makes probabilities softer, revealing more
    information about what the teacher learned.

    Alpha: Balance between learning from teacher vs. learning from labels
    """
    def __init__(self, temperature=3.0, alpha=0.7):
        super().__init__()
        self.temperature = temperature
        self.alpha = alpha
        self.kl_div = nn.KLDivLoss(reduction='batchmean')
        self.ce_loss = nn.CrossEntropyLoss()

    def forward(self, student_logits, teacher_logits, labels):
        # Soft targets: Apply temperature to soften probability distributions
        # Why temperature? It makes the teacher's "confidence" more informative
        soft_student = F.log_softmax(student_logits / self.temperature, dim=1)
        soft_teacher = F.softmax(teacher_logits / self.temperature, dim=1)

        # Distillation loss: How well student mimics teacher's soft predictions
        distillation_loss = self.kl_div(soft_student, soft_teacher) * (self.temperature ** 2)

        # Student loss: Standard cross-entropy with true labels
        student_loss = self.ce_loss(student_logits, labels)

        # Combined loss: Weighted average
        # alpha controls balance: higher alpha = learn more from teacher
        total_loss = self.alpha * distillation_loss + (1 - self.alpha) * student_loss

        return total_loss, distillation_loss, student_loss

print("Distillation Loss Configuration:")
print(f"  Temperature: 3.0 (softens probabilities for richer knowledge transfer)")
print(f"  Alpha: 0.7 (70% teacher knowledge, 30% ground truth)")

STEP 5: DEFINING DISTILLATION LOSS

Why: Knowledge distillation uses a special loss that combines:
1. Soft targets from teacher (captures uncertainty and relationships)
2. Hard labels from ground truth (ensures correctness)
3. Temperature parameter (controls softness of probability distribution)

Distillation Loss Configuration:
  Temperature: 3.0 (softens probabilities for richer knowledge transfer)
  Alpha: 0.7 (70% teacher knowledge, 30% ground truth)


In [None]:
print("\n" + "="*80)
print("STEP 6: DISTILLING KNOWLEDGE TO STUDENT MODEL")
print("="*80)
print("\nWhy: Now we train the student to mimic the teacher's behavior while also")
print("learning from the true labels. This allows the small model to perform")
print("nearly as well as the large model.\n")

# Create datasets and dataloaders for student
student_train_dataset = ImperativeDataset(train_texts, train_labels, student_tokenizer)
student_val_dataset = ImperativeDataset(val_texts, val_labels, student_tokenizer)
student_test_dataset = ImperativeDataset(test_texts, test_labels, student_tokenizer)

student_train_loader = DataLoader(student_train_dataset, batch_size=16, shuffle=True)
student_val_loader = DataLoader(student_val_dataset, batch_size=16, shuffle=False)
student_test_loader = DataLoader(student_test_dataset, batch_size=16, shuffle=False)

# Training configuration for student
student_epochs = 5  # More epochs for distillation
student_optimizer = AdamW(student_model.parameters(), lr=5e-5)
student_scheduler = get_linear_schedule_with_warmup(
    student_optimizer,
    num_warmup_steps=0,
    num_training_steps=len(student_train_loader) * student_epochs
)

distillation_loss_fn = DistillationLoss(temperature=3.0, alpha=0.7)

def train_student_with_distillation(student_model, teacher_model, train_loader,
                                   optimizer, scheduler, loss_fn, device):
    """
    Train student model using knowledge distillation.
    Student learns from both teacher's soft predictions and true labels.
    """
    student_model.train()
    teacher_model.eval()  # Teacher in eval mode (no training)

    total_loss = 0
    total_distill_loss = 0
    total_student_loss = 0

    for batch in tqdm(train_loader, desc="Distilling Knowledge"):
        optimizer.zero_grad()

        # Get student inputs
        student_input_ids = batch['input_ids'].to(device)
        student_attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)


        # Student forward pass
        student_outputs = student_model(
            input_ids=student_input_ids,
            attention_mask=student_attention_mask
        )
        student_logits = student_outputs.logits

        # Teacher forward pass (no gradient)
        with torch.no_grad():
            teacher_outputs = teacher_model(
                input_ids=student_input_ids,  # Using same tokens (simplified)
                attention_mask=student_attention_mask
            )
            teacher_logits = teacher_outputs.logits

        # Calculate distillation loss
        loss, distill_loss, ce_loss = loss_fn(student_logits, teacher_logits, labels)

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        total_distill_loss += distill_loss.item()
        total_student_loss += ce_loss.item()

    avg_loss = total_loss / len(train_loader)
    avg_distill = total_distill_loss / len(train_loader)
    avg_student = total_student_loss / len(train_loader)

    return avg_loss, avg_distill, avg_student

# Train student with distillation
print("Training student model with knowledge distillation...")
best_accuracy = 0

for epoch in range(student_epochs):
    # Train with distillation
    loss, distill_loss, ce_loss = train_student_with_distillation(
        student_model, teacher_model, student_train_loader,
        student_optimizer, student_scheduler, distillation_loss_fn, device
    )

    # Evaluate
    val_accuracy, _, _ = evaluate_model(student_model, student_val_loader, device)

    print(f"\nEpoch {epoch+1}/{student_epochs}")
    print(f"  Total Loss: {loss:.4f}")
    print(f"  Distillation Loss: {distill_loss:.4f}")
    print(f"  Student CE Loss: {ce_loss:.4f}")
    print(f"  Val Accuracy: {val_accuracy:.4f}")

    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        torch.save(student_model.state_dict(), 'best_student_model.pt')
        print(f"  ✓ New best model saved!")


STEP 6: DISTILLING KNOWLEDGE TO STUDENT MODEL

Why: Now we train the student to mimic the teacher's behavior while also
learning from the true labels. This allows the small model to perform
nearly as well as the large model.

Training student model with knowledge distillation...


Distilling Knowledge: 100%|██████████| 385/385 [00:47<00:00,  8.03it/s]
Evaluating: 100%|██████████| 83/83 [00:00<00:00, 130.43it/s]



Epoch 1/5
  Total Loss: 0.2883
  Distillation Loss: 0.3601
  Student CE Loss: 0.1207
  Val Accuracy: 0.9538
  ✓ New best model saved!


Distilling Knowledge: 100%|██████████| 385/385 [00:45<00:00,  8.47it/s]
Evaluating: 100%|██████████| 83/83 [00:00<00:00, 128.57it/s]



Epoch 2/5
  Total Loss: 0.2344
  Distillation Loss: 0.2925
  Student CE Loss: 0.0990
  Val Accuracy: 0.9568
  ✓ New best model saved!


Distilling Knowledge: 100%|██████████| 385/385 [00:46<00:00,  8.31it/s]
Evaluating: 100%|██████████| 83/83 [00:00<00:00, 133.59it/s]



Epoch 3/5
  Total Loss: 0.2159
  Distillation Loss: 0.2703
  Student CE Loss: 0.0892
  Val Accuracy: 0.9629
  ✓ New best model saved!


Distilling Knowledge: 100%|██████████| 385/385 [00:45<00:00,  8.39it/s]
Evaluating: 100%|██████████| 83/83 [00:00<00:00, 127.86it/s]



Epoch 4/5
  Total Loss: 0.1819
  Distillation Loss: 0.2267
  Student CE Loss: 0.0773
  Val Accuracy: 0.9606


Distilling Knowledge: 100%|██████████| 385/385 [00:46<00:00,  8.35it/s]
Evaluating: 100%|██████████| 83/83 [00:00<00:00, 132.41it/s]


Epoch 5/5
  Total Loss: 0.1584
  Distillation Loss: 0.1966
  Student CE Loss: 0.0693
  Val Accuracy: 0.9598





In [None]:
print("\n" + "="*80)
print("STEP 7: FINAL COMPARISON")
print("="*80)

# Evaluate teacher
teacher_accuracy, teacher_preds, true_labels = evaluate_model(
    teacher_model, teacher_test_loader, device
)

# Evaluate student
student_accuracy, student_preds, _ = evaluate_model(
    student_model, student_test_loader, device
)

print("\n" + "="*80)
print("KNOWLEDGE DISTILLATION RESULTS")
print("="*80)
print(f"\nTeacher Model Accuracy: {teacher_accuracy:.4f}")
print(f"Student Model Accuracy: {student_accuracy:.4f}")
print(f"Performance Retention: {(student_accuracy/teacher_accuracy)*100:.2f}%")
print(f"\nModel Size Reduction: {sum(p.numel() for p in teacher_model.parameters()) / sum(p.numel() for p in student_model.parameters()):.2f}x")

print("\nTeacher Classification Report:")
print(classification_report(true_labels, teacher_preds,
                          target_names=['Non-Imperative', 'Imperative']))

print("\nStudent Classification Report:")
print(classification_report(true_labels, student_preds,
                          target_names=['Non-Imperative', 'Imperative']))


STEP 7: FINAL COMPARISON


Evaluating: 100%|██████████| 83/83 [00:09<00:00,  8.87it/s]
Evaluating: 100%|██████████| 83/83 [00:00<00:00, 104.52it/s]


KNOWLEDGE DISTILLATION RESULTS

Teacher Model Accuracy: 0.9765
Student Model Accuracy: 0.9621
Performance Retention: 98.53%

Model Size Reduction: 24.96x

Teacher Classification Report:
                precision    recall  f1-score   support

Non-Imperative       0.99      0.97      0.98       658
    Imperative       0.97      0.99      0.98       662

      accuracy                           0.98      1320
     macro avg       0.98      0.98      0.98      1320
  weighted avg       0.98      0.98      0.98      1320


Student Classification Report:
                precision    recall  f1-score   support

Non-Imperative       0.96      0.96      0.96       658
    Imperative       0.96      0.96      0.96       662

      accuracy                           0.96      1320
     macro avg       0.96      0.96      0.96      1320
  weighted avg       0.96      0.96      0.96      1320






In [None]:
print("\n" + "="*80)
print("STEP 8: SAVING MODEL AND INFERENCE EXAMPLE")
print("="*80)

# Save the final model
student_model.save_pretrained('./distilled_student_model')
student_tokenizer.save_pretrained('./distilled_student_model')
print("\n✓ Student model saved to './distilled_student_model'")

# Inference example
def predict_imperative(text, model, tokenizer, device):
    """Make prediction on new text."""
    model.eval()
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = F.softmax(outputs.logits, dim=1)
        pred = torch.argmax(probs, dim=1)

    return pred.item(), probs[0].cpu().numpy()

# Test examples
test_sentences = [
    "Close the door.",
    "The weather is nice today.",
    "Please submit your assignment by Friday.",
    "I went to the store yesterday."
]

print("\nInference Examples:")
print("-" * 60)
for sentence in test_sentences:
    pred, probs = predict_imperative(sentence, student_model, student_tokenizer, device)
    label = "Imperative" if pred == 1 else "Non-Imperative"
    confidence = probs[pred] * 100
    print(f"\nSentence: '{sentence}'")
    print(f"Prediction: {label} (confidence: {confidence:.2f}%)")
    print(f"Probabilities: Non-Imp={probs[0]:.3f}, Imp={probs[1]:.3f}")

print("\n" + "="*80)
print("KNOWLEDGE DISTILLATION COMPLETE!")
print("="*80)
print("\nKey Takeaways:")
print("1. Student model is much smaller but retains most of teacher's performance")
print("2. Distillation transfers 'dark knowledge' - soft probabilities, not just labels")
print("3. Temperature parameter allows student to learn from teacher's uncertainty")
print("4. Result: Fast, efficient model suitable for production deployment!")


STEP 8: SAVING MODEL AND INFERENCE EXAMPLE

✓ Student model saved to './distilled_student_model'

Inference Examples:
------------------------------------------------------------

Sentence: 'Close the door.'
Prediction: Imperative (confidence: 99.98%)
Probabilities: Non-Imp=0.000, Imp=1.000

Sentence: 'The weather is nice today.'
Prediction: Non-Imperative (confidence: 83.55%)
Probabilities: Non-Imp=0.836, Imp=0.164

Sentence: 'Please submit your assignment by Friday.'
Prediction: Imperative (confidence: 99.98%)
Probabilities: Non-Imp=0.000, Imp=1.000

Sentence: 'I went to the store yesterday.'
Prediction: Imperative (confidence: 99.97%)
Probabilities: Non-Imp=0.000, Imp=1.000

KNOWLEDGE DISTILLATION COMPLETE!

Key Takeaways:
1. Student model is much smaller but retains most of teacher's performance
2. Distillation transfers 'dark knowledge' - soft probabilities, not just labels
3. Temperature parameter allows student to learn from teacher's uncertainty
4. Result: Fast, efficient mode