In [None]:
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import torch
from torch.utils.data import Dataset

from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)


In [None]:
# Read the labelled reviews from a CSV in the working directory and preview the first rows.
DATA_PATH = 'sentiment_reviews.csv'
df = pd.read_csv(DATA_PATH)
df.head()


In [None]:
def basic_preprocess(text):
    """Normalise a review for the bag-of-words baseline.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` or
    whitespace is replaced by a space, and whitespace runs are collapsed to a
    single space with leading/trailing whitespace removed.

    Args:
        text: The raw review. Strings are processed directly. Missing values
            (``None``, ``NaN``, ``pd.NA``) yield ``""``; any other non-string
            value is converted with ``str()`` first.

    Returns:
        str: The cleaned text (possibly empty).
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive from pandas as float NaN, which has no
        # .lower(); treat any missing value as an empty review instead of
        # crashing the whole .apply().
        if pd.isna(text):
            return ""
        text = str(text)
    text = text.lower()
    # Anything that is not a lowercase ASCII letter, digit or whitespace
    # becomes a space so adjacent words are not glued together.
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Add a cleaned copy of every review next to the raw text, then preview the frame.
df["clean_review"] = df["review"].map(basic_preprocess)
df.head()


In [None]:
# 75/25 split of the cleaned reviews, stratified on the label column,
# with a fixed seed so the partition is reproducible.
target = df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_review"], target,
    stratify=target,
    random_state=42,
    test_size=0.25,
)


In [None]:
# Baseline: TF-IDF over unigrams and bigrams (English stop words removed)
# feeding a logistic regression classifier.
tfidf_step = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
clf_step = LogisticRegression(max_iter=1000)
ml_pipeline = Pipeline(steps=[("tfidf", tfidf_step), ("clf", clf_step)])

# fit() returns the fitted pipeline, so prediction can be chained onto it.
y_pred_ml = ml_pipeline.fit(X_train, y_train).predict(X_test)

# Report held-out accuracy and the per-class breakdown.
print("Accuracy:", accuracy_score(y_test, y_pred_ml))
print(classification_report(y_test, y_pred_ml, digits=3))


In [None]:
# Same split settings as the baseline (25% test, seed 42, stratified on the
# label), but applied to the untouched review text for the BERT tokenizer.
raw_split = train_test_split(
    df["review"], df["label"],
    stratify=df["label"],
    random_state=42,
    test_size=0.25,
)
X_train_raw, X_test_raw, y_train_raw, y_test_raw = raw_split


In [None]:
# Load the fast tokenizer published under the "bert-base-uncased" name.
checkpoint_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(checkpoint_name)

class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: Iterable of input texts; copied into a list.
        labels: Iterable of labels, positionally aligned with ``texts``;
            copied into a list.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=..., padding=..., max_length=..., return_tensors=...)``
            whose result is indexed by ``"input_ids"`` and ``"attention_mask"``.
        max_len: Length every example is truncated/padded to (default 64).
    """

    def __init__(self, texts, labels, tokenizer, max_len=64):
        # Plain lists give positional indexing regardless of how the inputs
        # were indexed originally.
        self.texts, self.labels = list(texts), list(labels)
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for example ``idx``."""
        encoded = self.tokenizer(
            self.texts[idx],
            return_tensors="pt",
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        # squeeze(0) drops the leading axis of each encoded tensor
        # (expected to be the size-1 batch axis for a single text).
        sample = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap the raw-text splits; max_len is left at the class default.
train_dataset = SentimentDataset(
    texts=X_train_raw, labels=y_train_raw, tokenizer=tokenizer
)
test_dataset = SentimentDataset(
    texts=X_test_raw, labels=y_test_raw, tokenizer=tokenizer
)


In [None]:
# Load "bert-base-uncased" as a sequence classifier configured for two labels.
bert_init_kwargs = {"num_labels": 2}
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", **bert_init_kwargs)


In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy from an evaluation result.

    Args:
        eval_pred: A two-element value that unpacks into ``(logits, labels)``.

    Returns:
        dict: ``{"accuracy": <accuracy of argmax(logits) against labels>}``.
    """
    scores, gold_labels = eval_pred
    # The predicted class is the index of the largest score along the last axis.
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(gold_labels, predicted)
    return {"accuracy": accuracy}


In [None]:
import os

# Set WANDB_DISABLED in this process's environment before the Trainer is built.
os.environ.update({"WANDB_DISABLED": "true"})


In [None]:
# Fine-tuning hyper-parameters, collected in one place for easy tweaking.
hyperparams = dict(
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=2e-5,
    weight_decay=0.02,
)

training_args = TrainingArguments(
    output_dir="./bert_output",
    eval_strategy="epoch",  # evaluate on eval_dataset at the end of every epoch
    logging_steps=20,
    **hyperparams,
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Kept as the final expression so the notebook displays the training summary.
trainer.train()



In [None]:
# Run one evaluation pass with the trainer (on the eval_dataset it was
# constructed with) and keep the returned metrics.
results = trainer.evaluate()
# Bare expression so the notebook renders the metrics as the cell output.
results


In [None]:
def predict_with_bert(texts):
    """Classify text with the notebook's global ``model`` and ``tokenizer``.

    Args:
        texts: A string or list of strings to score. Each is truncated to
            64 tokens (the same limit as ``SentimentDataset``'s default) and
            the batch is padded to its longest member.

    Returns:
        tuple: ``(preds, probs)`` as NumPy arrays on the host. ``preds`` holds
        the arg-max class index per text and ``probs`` holds the softmax of
        the logits over the last axis (one row per text).
    """
    model.eval()
    # After training the model may live on an accelerator; inputs must be
    # moved to the same device or the forward pass raises a device mismatch.
    device = model.device
    enc = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=64,
        return_tensors="pt"
    )
    with torch.no_grad():
        out = model(
            input_ids=enc["input_ids"].to(device),
            attention_mask=enc["attention_mask"].to(device)
        )
        probs = torch.softmax(out.logits, dim=-1)
        preds = torch.argmax(probs, dim=-1)
    # .numpy() is only defined for CPU tensors, so copy back to the host first
    # (a no-op when the tensors are already on the CPU).
    return preds.cpu().numpy(), probs.cpu().numpy()

# Smoke-test the fine-tuned model on two hand-written reviews.
example = [
    "I loved the movie!",
    "Worst film ever.",
]
preds, probs = predict_with_bert(example)

# Show each text next to its predicted class and its class probabilities.
[(text, pred, prob) for text, pred, prob in zip(example, preds, probs)]


In [None]:
# Shell out to nbconvert to clear the stored cell outputs of the named
# notebook file, rewriting that file in place.
# NOTE(review): presumably the target is this notebook itself; confirm, since
# the hard-coded file name must match the file on disk for the command to work.
!jupyter nbconvert --clear-output --inplace "Sentiment_Analysis (1).ipynb"
