In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.optim import AdamW


In [None]:
!pip install transformers torch datasets -q

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.optim import AdamW
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm


In [None]:
# Read the reviews CSV into a DataFrame and preview its first rows.
csv_path = "/content/Reviews_30k.csv"  # change path if needed
df = pd.read_csv(csv_path)
df.head()


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [None]:
def map_rating_to_label(x):
    """Map a rating value to a class id.

    Returns 0 when ``x <= 2``, 1 when ``x == 3`` and 2 for every other value.
    """
    if x == 3:
        return 1
    return 0 if x <= 2 else 2

# Derive the class label from the Score column, then keep only the text and its label.
df = df.assign(label=df['Score'].apply(map_rating_to_label))[['Text', 'label']]
df.head()


Unnamed: 0,Text,label
0,I have bought several of the Vitality canned d...,2
1,Product arrived labeled as Jumbo Salted Peanut...,0
2,This is a confection that has been around a fe...,2
3,If you are looking for the secret ingredient i...,0
4,Great taffy at a great price. There was a wid...,2


In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
all_texts = df['Text'].tolist()
all_labels = df['label'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts, all_labels, test_size=0.2, random_state=42
)


In [None]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Both splits are tokenised with the same settings: truncate and pad, at most 128 tokens.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=128)
train_enc = tokenizer(train_texts, **tokenize_kwargs)
val_enc = tokenizer(val_texts, **tokenize_kwargs)


In [None]:
class ReviewDataset(Dataset):
    """Dataset that pairs per-example encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable collection of
    per-example values; ``labels`` is an indexable collection with one entry
    per example. Indexing returns a dict of tensors: one per encoding field,
    plus a ``'labels'`` entry.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label collection.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a ReviewDataset.
train_ds, val_ds = (
    ReviewDataset(train_enc, train_labels),
    ReviewDataset(val_enc, val_labels),
)


In [None]:
# Seed torch's RNGs so the shuffle order and the newly initialised classifier weights are
# repeatable between runs (same value as the random_state used for the train/val split).
# NOTE: some CUDA kernels can still be nondeterministic, so GPU results may differ slightly.
torch.manual_seed(42)

train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=8, shuffle=False)

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

for epoch in range(2):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        optimizer.zero_grad()
        # Move every tensor of the batch to the model's device.
        batch = {k:v.to(device) for k,v in batch.items()}
        # The batch includes 'labels', so the model output carries the loss.
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    # Report the mean per-batch training loss for the epoch.
    print(f"Epoch {epoch+1} | Train loss: {total_loss/len(train_loader):.4f}")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3000 [00:00<?, ?it/s]

Epoch 1 | Train loss: 0.3875


  0%|          | 0/3000 [00:00<?, ?it/s]

Epoch 2 | Train loss: 0.2563


In [None]:
# Write the model and the tokenizer into the same directory.
save_dir = "/content/fine_tuned_distilbert_3class"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)
print("✅ Model fine-tuned and saved successfully!")


✅ Model fine-tuned and saved successfully!


In [None]:
from transformers import pipeline

# Build a text-classification pipeline; model and tokenizer are read from the same directory.
model_dir = "/content/fine_tuned_distilbert_3class"
clf = pipeline("text-classification", model=model_dir, tokenizer=model_dir)

# Classify a single sample sentence and print the result.
sample = "The product was okay, not too bad but could be better."
prediction = clf(sample)
print(prediction)


Device set to use cuda:0


[{'label': 'LABEL_1', 'score': 0.6982341408729553}]


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Put the model in evaluation mode and collect predicted / true class ids over val_loader.
model.eval()
preds, labels = [], []

with torch.no_grad():  # no gradients are needed while scoring
    for batch in val_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        # The predicted class is the index of the largest logit in each row.
        preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())
        labels.extend(batch['labels'].cpu().numpy())

print("Accuracy:", accuracy_score(labels, preds))
print(classification_report(labels, preds))


Accuracy: 0.872
              precision    recall  f1-score   support

           0       0.82      0.66      0.73       927
           1       0.49      0.31      0.38       499
           2       0.90      0.98      0.94      4574

    accuracy                           0.87      6000
   macro avg       0.74      0.65      0.68      6000
weighted avg       0.86      0.87      0.86      6000



In [None]:
from transformers import pipeline

# Load the fine-tuned model from the directory written by the earlier save cell.
# The "_31class" directory is only created by a later cell, so loading it here
# fails when the notebook is run top to bottom.
model_dir = "/content/fine_tuned_distilbert_3class"
clf = pipeline("text-classification", model=model_dir, tokenizer=model_dir)

# Map the pipeline's label names to readable sentiments; the class ids follow
# map_rating_to_label (0 = rating <= 2, 1 = rating == 3, 2 = otherwise).
label_map = {
    'LABEL_0': 'Negative',
    'LABEL_1': 'Neutral',
    'LABEL_2': 'Positive'
}

# --- Interactive prediction ---
# Keep prompting until the user types 'exit' (case-insensitive, surrounding spaces ignored).
while True:
    text = input("Enter a review (or type 'exit' to stop): ")
    if text.strip().lower() == "exit":
        break
    # The pipeline returns a list of results; a single input yields one entry.
    result = clf(text)[0]
    sentiment = label_map[result['label']]
    confidence = result['score']
    print(f"\n📄 Review: {text}")
    print(f"🧠 Predicted Sentiment: {sentiment}  (confidence={confidence:.3f})\n")


KeyboardInterrupt: 

In [None]:
# Seed torch's RNGs so the shuffle order and the newly initialised classifier weights are
# repeatable between runs (same value as the random_state used for the train/val split).
# NOTE: some CUDA kernels can still be nondeterministic, so GPU results may differ slightly.
torch.manual_seed(42)

train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=8, shuffle=False)

# Start again from the pretrained checkpoint, on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

for epoch in range(10):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        optimizer.zero_grad()
        # Move every tensor of the batch to the model's device.
        batch = {k:v.to(device) for k,v in batch.items()}
        # The batch includes 'labels', so the model output carries the loss.
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    # Report the mean per-batch training loss for the epoch.
    print(f"Epoch {epoch+1} | Train loss: {total_loss/len(train_loader):.4f}")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3000 [00:00<?, ?it/s]

Epoch 1 | Train loss: 0.3964


  0%|          | 0/3000 [00:00<?, ?it/s]

Epoch 2 | Train loss: 0.2604


  0%|          | 0/3000 [00:00<?, ?it/s]

Epoch 3 | Train loss: 0.1712


  0%|          | 0/3000 [00:00<?, ?it/s]

Epoch 4 | Train loss: 0.1112


  0%|          | 0/3000 [00:00<?, ?it/s]

Epoch 5 | Train loss: 0.0753


  0%|          | 0/3000 [00:00<?, ?it/s]

Epoch 6 | Train loss: 0.0511


  0%|          | 0/3000 [00:00<?, ?it/s]

Epoch 7 | Train loss: 0.0441


  0%|          | 0/3000 [00:00<?, ?it/s]

Epoch 8 | Train loss: 0.0350


  0%|          | 0/3000 [00:00<?, ?it/s]

Epoch 9 | Train loss: 0.0302


  0%|          | 0/3000 [00:00<?, ?it/s]

Epoch 10 | Train loss: 0.0288


In [None]:
# Write the model and the tokenizer into the same directory.
save_dir = "/content/fine_tuned_distilbert_31class"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)
print("✅ Model fine-tuned and saved successfully!")


✅ Model fine-tuned and saved successfully!


In [None]:
from transformers import pipeline

# Build a text-classification pipeline; model and tokenizer are read from the same directory.
model_dir = "/content/fine_tuned_distilbert_31class"
clf = pipeline("text-classification", model=model_dir, tokenizer=model_dir)

# Classify a single sample sentence and print the result.
sample = "The product was okay, not too bad but could be better."
prediction = clf(sample)
print(prediction)


Device set to use cuda:0


[{'label': 'LABEL_1', 'score': 0.9984660148620605}]


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Put the model in evaluation mode and collect predicted / true class ids over val_loader.
model.eval()
preds, labels = [], []

with torch.no_grad():  # no gradients are needed while scoring
    for batch in val_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        # The predicted class is the index of the largest logit in each row.
        preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())
        labels.extend(batch['labels'].cpu().numpy())

print("Accuracy:", accuracy_score(labels, preds))
print(classification_report(labels, preds))


Accuracy: 0.8621666666666666
              precision    recall  f1-score   support

           0       0.78      0.66      0.71       927
           1       0.39      0.50      0.44       499
           2       0.94      0.94      0.94      4574

    accuracy                           0.86      6000
   macro avg       0.70      0.70      0.70      6000
weighted avg       0.87      0.86      0.87      6000



In [None]:
from transformers import pipeline

# Load the fine-tuned model; model and tokenizer are read from the same directory.
model_dir = "/content/fine_tuned_distilbert_31class"
clf = pipeline("text-classification", model=model_dir, tokenizer=model_dir)

# Translate the pipeline's label names into readable sentiments.
label_map = {
    'LABEL_0': 'Negative',
    'LABEL_1': 'Neutral',
    'LABEL_2': 'Positive',
}

# Rating mapping logic
def get_rating_sentiment(rating):
    """Return the sentiment name implied by a rating.

    ``rating <= 2`` gives 'Negative', ``rating == 3`` gives 'Neutral',
    and any other value gives 'Positive'.
    """
    if rating == 3:
        return 'Neutral'
    return 'Negative' if rating <= 2 else 'Positive'

# Function to detect spam
def detect_spam(review_sentiment, rating_sentiment):
    """Flag a review whose text sentiment differs from its rating sentiment.

    Returns "Spam 🚨" when the two arguments are not equal and
    "Non-Spam ✅" when they are.
    """
    sentiments_differ = review_sentiment != rating_sentiment
    return "Spam 🚨" if sentiments_differ else "Non-Spam ✅"

# --- Interactive input loop ---
# Prompt for a rating and a review until the user types 'exit', then compare the
# rating's sentiment with the model's predicted sentiment for the review text.
while True:
    rating_input = input("Enter rating (1–5) or type 'exit' to stop: ")
    if rating_input.strip().lower() == "exit":
        break

    # Guard only the integer parse, so a ValueError raised elsewhere (e.g. by the
    # classifier call) is not misreported as an invalid rating.
    try:
        rating = int(rating_input)
    except ValueError:
        print("❌ Please enter a valid number for rating.\n")
        continue

    if rating not in [1, 2, 3, 4, 5]:
        print("❌ Invalid rating! Please enter between 1–5.\n")
        continue

    review = input("Enter review text: ")

    # Predict review sentiment (the pipeline returns a list; take its single entry)
    model_output = clf(review)[0]
    review_sentiment = label_map[model_output['label']]
    confidence = model_output['score']

    # Derive rating sentiment
    rating_sentiment = get_rating_sentiment(rating)

    # Compare for spam detection
    review_status = detect_spam(review_sentiment, rating_sentiment)

    print("\n📄 Review:", review)
    print(f"⭐ Rating Given: {rating} → {rating_sentiment}")
    print(f"🧠 Predicted Review Sentiment: {review_sentiment} (confidence={confidence:.3f})")
    print(f"📊 Review Status: {review_status}\n")


Device set to use cuda:0


Enter rating (1–5) or type 'exit' to stop: 1
Enter review text: the product was bad

📄 Review: the product was bad
⭐ Rating Given: 1 → Negative
🧠 Predicted Review Sentiment: Negative (confidence=0.975)
📊 Review Status: Non-Spam ✅

Enter rating (1–5) or type 'exit' to stop: 3
Enter review text: Absolutely loved the product! Works like magic

📄 Review: Absolutely loved the product! Works like magic
⭐ Rating Given: 3 → Neutral
🧠 Predicted Review Sentiment: Positive (confidence=0.999)
📊 Review Status: Spam 🚨

Enter rating (1–5) or type 'exit' to stop: 5
Enter review text: Absolutely loved the product! Works like magic

📄 Review: Absolutely loved the product! Works like magic
⭐ Rating Given: 5 → Positive
🧠 Predicted Review Sentiment: Positive (confidence=0.999)
📊 Review Status: Non-Spam ✅

Enter rating (1–5) or type 'exit' to stop: 3
Enter review text: The product is average, nothing special

📄 Review: The product is average, nothing special
⭐ Rating Given: 3 → Neutral
🧠 Predicted Review Se

KeyboardInterrupt: Interrupted by user