In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [5]:

!pip install datasets transformers torch scikit-learn pandas numpy nltk -q

from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
import re
import nltk
import torch
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    pipeline
)


nltk.download('stopwords')


print("🛠️ Preparing dataset...")
dataset = load_dataset("yelp_review_full")

full_df = pd.DataFrame(dataset["train"].shuffle(seed=42).select(range(30000)))

def map_labels(star_rating):
    if star_rating <= 1: return 0
    elif star_rating == 2: return 1
    else: return 2

full_df["label"] = full_df["label"].apply(map_labels)

balanced_df = pd.concat([
    full_df[full_df["label"] == 0].sample(5000),
    full_df[full_df["label"] == 1].sample(5000),
    full_df[full_df["label"] == 2].sample(5000)
])

print("📊 Balanced distribution:\n", balanced_df["label"].value_counts())

print("\n🧹 Cleaning text...")
custom_stopwords = set(nltk.corpus.stopwords.words("english")) - {'not', 'no', 'nor', 'but'}

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|@\w+|[^\w\s]|\d+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return " ".join([word for word in text.split() if word not in custom_stopwords])

balanced_df["cleaned_text"] = balanced_df["text"].apply(clean_text)


train_df, test_df = train_test_split(balanced_df, test_size=0.2, random_state=42)


print("\n🤖 Training LR Model...")
tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    stop_words=list(custom_stopwords)
)

X_train_tf = tfidf.fit_transform(train_df["cleaned_text"])
X_test_tf = tfidf.transform(test_df["cleaned_text"])

lr_model = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    solver="saga"
)
lr_model.fit(X_train_tf, train_df["label"])

print("\n📈 LR Performance:")
y_pred = lr_model.predict(X_test_tf)
print(classification_report(test_df["label"], y_pred))


print("\n🧠 Initializing Transformer...")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3,
    id2label={0: "negative", 1: "neutral", 2: "positive"}
)


hf_train = Dataset.from_pandas(train_df[["text", "label"]])
hf_test = Dataset.from_pandas(test_df[["text", "label"]])

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

tokenized_train = hf_train.map(tokenize_function, batched=True)
tokenized_test = hf_test.map(tokenize_function, batched=True)


def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    f1_all = f1_score(p.label_ids, preds, average=None)
    return {
        "accuracy": (preds == p.label_ids).mean(),
        "f1_negative": f1_all[0],
        "f1_neutral": f1_all[1],
        "f1_positive": f1_all[2]
    }

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_steps=100,
    fp16=torch.cuda.is_available(),
    report_to="none",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

print("\n🔥 Training Transformer...")
trainer.train()


class SentimentAnalyzer:
    def __init__(self):
        self.tfidf = tfidf
        self.lr_model = lr_model
        self.transformer_pipe = pipeline(
            "text-classification",
            model=model,
            tokenizer=tokenizer,
            device=0 if torch.cuda.is_available() else -1,
            top_k=3,
            truncation=True,
            max_length=128
        )

    def analyze(self, text):

        cleaned = clean_text(text)
        lr_pred = self.lr_model.predict(self.tfidf.transform([cleaned]))[0]


        tr_result = {res["label"]: res["score"] for res in self.transformer_pipe(text)[0]}

        return {
            "text": text,
            "traditional": ["negative", "neutral", "positive"][lr_pred],
            "transformer": max(tr_result, key=tr_result.get),
            "confidence": {k: f"{v:.2f}" for k, v in tr_result.items()}
        }


print("\n🔮 Testing Neutral Cases:")
analyzer = SentimentAnalyzer()

tests = [
    "The product works but nothing special really.",
    "Service was acceptable, though the wait time was too long.",
    "Food was average - not bad but not good either.",
    "Package arrived damaged but customer service resolved the issue.",
    "It's okay, I guess. Not worth the price but not terrible either."
]

for text in tests:
    result = analyzer.analyze(text)
    print(f"\n📝 {result['text']}")
    print(f"   Traditional: {result['traditional'].upper()}")
    print(f"   Transformer: {result['transformer'].upper()}")
    print(f"   Confidence: {result['confidence']}")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m65.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


🛠️ Preparing dataset...
📊 Balanced distribution:
 label
0    5000
1    5000
2    5000
Name: count, dtype: int64

🧹 Cleaning text...

🤖 Training LR Model...

📈 LR Performance:
              precision    recall  f1-score   support

           0       0.81      0.76      0.78      1037
           1       0.60      0.64      0.62       970
           2       0.75      0.74      0.75       993

    accuracy                           0.72      3000
   macro avg       0.72      0.71      0.72      3000
weighted avg       0.72      0.72      0.72      3000


🧠 Initializing Transformer...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]




🔥 Training Transformer...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Negative,F1 Neutral,F1 Positive
1,0.6875,0.653895,0.717,0.779736,0.60443,0.755725
2,0.5754,0.644587,0.727333,0.790871,0.620939,0.763845
3,0.4588,0.692681,0.717,0.77365,0.604531,0.768024
4,0.3716,0.729125,0.711,0.773603,0.596527,0.758901


Device set to use cuda:0



🔮 Testing Neutral Cases:

📝 The product works but nothing special really.
   Traditional: NEGATIVE
   Transformer: NEGATIVE
   Confidence: {'negative': '0.51', 'neutral': '0.47', 'positive': '0.02'}

📝 Service was acceptable, though the wait time was too long.
   Traditional: NEUTRAL
   Transformer: NEUTRAL
   Confidence: {'neutral': '0.66', 'negative': '0.28', 'positive': '0.06'}

📝 Food was average - not bad but not good either.
   Traditional: NEUTRAL
   Transformer: NEUTRAL
   Confidence: {'neutral': '0.59', 'negative': '0.39', 'positive': '0.02'}

📝 Package arrived damaged but customer service resolved the issue.
   Traditional: NEGATIVE
   Transformer: NEGATIVE
   Confidence: {'negative': '0.80', 'neutral': '0.17', 'positive': '0.03'}

📝 It's okay, I guess. Not worth the price but not terrible either.
   Traditional: NEGATIVE
   Transformer: NEUTRAL
   Confidence: {'neutral': '0.72', 'negative': '0.25', 'positive': '0.03'}
