# E-Commerce Generative AI - Sentiment Classification

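This notebook demonstrates a simple **fine-tuning pipeline** that classifies e-commerce product reviews as positive or negative using DistilBERT. The reviews used here are a small simulated sample, to be replaced with real data later.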

📌 Steps:
1. Load & explore dataset
2. Tokenize text
3. Fine-tune model
4. Evaluate accuracy
5. Run sample predictions


In [None]:
# Install dependencies (if running in Colab)
!pip install -q torch transformers scikit-learn pandas matplotlib

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Simulated dataset (replace later with real reviews).
# Each entry pairs a review text with its sentiment: 1=Positive, 0=Negative.
labelled_reviews = [
    ("This product is amazing!", 1),
    ("Worst purchase ever, waste of money.", 0),
    ("Great quality and fast shipping.", 1),
    ("Terrible experience, item broke in a week.", 0),
    ("Excellent value for money.", 1),
    ("Not worth the price.", 0),
    ("Absolutely loved it! Highly recommend.", 1),
    ("The packaging was awful and item arrived broken.", 0),
]

# Split the pairs back into the two parallel columns the DataFrame expects.
review_texts, sentiment_labels = zip(*labelled_reviews)
data = {"review": list(review_texts), "label": list(sentiment_labels)}

df = pd.DataFrame(data)
df.head()

In [None]:
# Hold out 25% of the reviews for evaluation. Stratifying on the label keeps
# both classes represented in the (very small) train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    df["review"],
    df["label"],
    test_size=0.25,
    random_state=42,
    stratify=df["label"],
)
len(X_train), len(X_test)

In [None]:
import torch
# transformers.AdamW was deprecated and later removed; PyTorch's AdamW is the supported optimizer.
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        texts: Iterable of review strings.
        labels: Iterable of integer class ids, aligned one-to-one with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=max_len, return_tensors="pt")``.
            Its result is indexed by ``"input_ids"`` and ``"attention_mask"``.
        max_len: Length every encoded review is truncated/padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=64):
        # Materialize as lists so positional indexing works even when the
        # inputs are e.g. pandas Series with a non-contiguous index.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``labels`` for review ``idx``."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        # The tokenizer is expected to return tensors with a leading batch axis
        # of size 1. Drop only that axis: a bare squeeze() would also remove the
        # sequence axis when max_len == 1.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Load the pretrained vocabulary, wrap the training split, and serve it in
# shuffled mini-batches of two reviews.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
train_dataset = ReviewDataset(texts=X_train, labels=y_train, tokenizer=tokenizer)
train_loader = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True)

In [None]:
# Seed PyTorch's global RNG so re-running this cell gives the same randomly
# initialized classification head and the same shuffled batch order.
torch.manual_seed(42)

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2  # binary: 0=Negative, 1=Positive
)
# Use PyTorch's AdamW (the transformers implementation was deprecated and
# removed). eps and weight_decay are set to the values the transformers
# optimizer used by default so the update rule stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

model.train()
losses = []  # per-batch loss history across all epochs (used for the loss curve)
for epoch in range(2):
    epoch_losses = []
    for batch in train_loader:
        optimizer.zero_grad()
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"]
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
    losses.extend(epoch_losses)
    if epoch_losses:
        # Report the mean over the epoch; the last batch alone is a noisy,
        # unrepresentative summary when batches hold only two reviews.
        epoch_mean_loss = sum(epoch_losses) / len(epoch_losses)
        print(f"Epoch {epoch+1}, Loss: {epoch_mean_loss}")
    else:
        print(f"Epoch {epoch+1}: train_loader produced no batches")

In [None]:
import matplotlib.pyplot as plt
# Draw the per-batch training loss, in the order the batches were processed.
fig, ax = plt.subplots()
ax.plot(losses)
ax.set_xlabel("Batch")
ax.set_ylabel("Loss")
ax.set_title("Training Loss Curve")
plt.show()

In [None]:
def predict(text):
    """Classify review text as "Positive" or "Negative".

    Uses the module-level ``model`` and ``tokenizer``. The model is switched
    to eval mode as a side effect.

    Args:
        text: A single review string, or an iterable of review strings.

    Returns:
        ``"Positive"`` or ``"Negative"`` for a single string; a list of those
        strings (one per input, in order) for an iterable of strings.
    """
    is_single = isinstance(text, str)
    texts = [text] if is_single else list(text)
    if not texts:
        return []

    model.eval()
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    # argmax over the class axis, one result per row. Softmax is monotonic, so
    # it is not needed to pick the winning class, and an argmax without a dim
    # would flatten the batch and return a meaningless index.
    class_ids = logits.argmax(dim=-1).tolist()
    names = ["Positive" if class_id == 1 else "Negative" for class_id in class_ids]
    return names[0] if is_single else names

# Evaluate accuracy on the held-out split (X_test / y_test), which the model
# never saw during training. predict() is called one review at a time.
held_out_predictions = [predict(review) for review in X_test]
held_out_truth = ["Positive" if label == 1 else "Negative" for label in y_test]
held_out_correct = sum(
    predicted == expected
    for predicted, expected in zip(held_out_predictions, held_out_truth)
)
print(f"Held-out accuracy: {held_out_correct}/{len(held_out_truth)} reviews correct")

# Sample predictions on unseen, hand-written reviews.
print(predict("The product was fantastic!"))
print(predict("Worst experience ever."))