<a href="https://colab.research.google.com/github/Preethikuppuri/Preethikuppuri/blob/main/Training%20a%20Small%20Language%20Model%20(SLM)%20from%20a%20Pre-trained%20LLM%20for%20Text%20Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# 📌 Step 1: Install Required Libraries
!pip install transformers datasets torch -q

# 📌 Step 2: Import Libraries
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 📌 Step 3: Load IMDB Dataset
dataset = load_dataset("imdb")

# 📌 Step 4: Load Tokenizer & Model (Small Model: DistilBERT)
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# 📌 Step 5: Tokenize Data
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=256)

tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")

# 📌 Step 6: Small Subset for Quick Training (demo purpose)
small_train = tokenized_dataset["train"].shuffle(seed=42).select(range(500))
small_test = tokenized_dataset["test"].shuffle(seed=42).select(range(200))

# 📌 Step 7: Define Metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# 📌 Step 8: Training Setup
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    save_steps=500,
    eval_steps=500
)



# 📌 Step 9: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_test,
    compute_metrics=compute_metrics,
)

# 📌 Step 10: Train the Model
trainer.train()

# 📌 Step 11: Evaluate
metrics = trainer.evaluate()
print(metrics)

# 📌 Step 12: Save Model
model.save_pretrained("./slm_finetuned")
tokenizer.save_pretrained("./slm_finetuned")

print("✅ Training Complete! Model saved in ./slm_finetuned")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpreethikuppuri1309[0m ([33mpreethikuppuri1309-umbc[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss


KeyboardInterrupt: 

In [3]:
!pip install -U transformers datasets -q


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
# 📌 Step 1: Install Required Libraries
!pip install transformers datasets torch -q

# 📌 Step 2: Import Libraries
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 📌 Step 3: Load IMDB Dataset
dataset = load_dataset("imdb")

# 📌 Step 4: Load Tokenizer & Model (Small Model: DistilBERT)
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# 📌 Step 5: Tokenize Data
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=256)

tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")

# 📌 Step 6: Small Subset for Quick Training (demo purpose)
small_train = tokenized_dataset["train"].shuffle(seed=42).select(range(500))
small_test = tokenized_dataset["test"].shuffle(seed=42).select(range(200))

# 📌 Step 7: Define Metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# 📌 Step 8: Training Setup (GPU-friendly, disabled W&B, mixed precision, step-limited)
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=200,
    eval_steps=200,
    fp16=True,          # mixed precision for faster GPU training
    report_to=[],       # disables W&B logging
    max_steps=50        # optional, limits total training steps for quick demo
)

# 📌 Step 9: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_test,
    compute_metrics=compute_metrics,
)

# 📌 Step 10: Train the Model
trainer.train()

# 📌 Step 11: Evaluate
metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)

# 📌 Step 12: Save Model
model.save_pretrained("./slm_finetuned")
tokenizer.save_pretrained("./slm_finetuned")

print("✅ Training Complete! Model saved in ./slm_finetuned")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Step,Training Loss


Step,Training Loss
50,0.6852




Evaluation Metrics: {'eval_loss': 0.6777768135070801, 'eval_accuracy': 0.63, 'eval_f1': 0.421875, 'eval_precision': 0.84375, 'eval_recall': 0.28125, 'eval_runtime': 74.9437, 'eval_samples_per_second': 2.669, 'eval_steps_per_second': 0.334, 'epoch': 0.7936507936507936}
✅ Training Complete! Model saved in ./slm_finetuned


In [8]:
# 📌 Step 13: Test the Trained SLM Model

from transformers import pipeline

# Load the fine-tuned model
classifier = pipeline("text-classification", model="./slm_finetuned", tokenizer="./slm_finetuned")

# Example texts to test
texts = [
    "I absolutely loved this movie! The story was amazing.",
    "The film was boring and too long. I did not enjoy it at all.",
    "An average movie, some parts were good, some were bad."
]

# Make predictions
for text in texts:
    result = classifier(text)
    print(f"Text: {text}\nPrediction: {result}\n")


Device set to use cpu


Text: I absolutely loved this movie! The story was amazing.
Prediction: [{'label': 'LABEL_0', 'score': 0.5035985112190247}]

Text: The film was boring and too long. I did not enjoy it at all.
Prediction: [{'label': 'LABEL_0', 'score': 0.531001627445221}]

Text: An average movie, some parts were good, some were bad.
Prediction: [{'label': 'LABEL_0', 'score': 0.5102379322052002}]



In [None]:
# 📌 Step 1: Install Required Libraries
!pip install transformers datasets torch -q

# 📌 Step 2: Import Libraries
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 📌 Step 3: Load IMDB Dataset
dataset = load_dataset("imdb")

# 📌 Step 4: Load Tokenizer & Model (DistilBERT SLM)
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# 📌 Step 5: Tokenize Data
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=256)

tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")

# 📌 Step 6: Use Subset for Demo (small) or Full Training (larger)
train_subset = tokenized_dataset["train"].shuffle(seed=42).select(range(2000))  # full: 25k
test_subset = tokenized_dataset["test"].shuffle(seed=42).select(range(500))      # full: 25k

# 📌 Step 7: Define Metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# 📌 Step 8: Training Setup
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,      # increase for better learning
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=200,
    eval_steps=200,
    fp16=True,               # GPU mixed precision
    report_to=[]             # disable W&B
)

# 📌 Step 9: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=test_subset,
    compute_metrics=compute_metrics,
)

# 📌 Step 10: Train the Model
trainer.train()

# 📌 Step 11: Evaluate
metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)

# 📌 Step 12: Save Model
model.save_pretrained("./slm_finetuned")
tokenizer.save_pretrained("./slm_finetuned")
print("✅ Training Complete! Model saved in ./slm_finetuned")

# 📌 Step 13: Test the Trained SLM Model
from transformers import pipeline
classifier = pipeline("text-classification", model="./slm_finetuned", tokenizer="./slm_finetuned")

# Example predictions
examples = [
    "I absolutely loved this movie! The story was amazing.",
    "The film was boring and too long. I did not enjoy it at all.",
    "An average movie, some parts were good, some were bad."
]

for text in examples:
    result = classifier(text)
    print(f"Text: {text}\nPrediction: {result}\n")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]



Step,Training Loss
50,0.6583
100,0.4113
150,0.4263


Step,Training Loss
50,0.6583
100,0.4113
150,0.4263
200,0.4014
