# Advanced Sentiment Analyzer

This notebook demonstrates a simple sentiment analysis pipeline with TF-IDF and Logistic Regression.
It includes preprocessing, training, evaluation, and prediction on new examples.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
import re

## SentimentAnalyzer Class

This class handles preprocessing, training, evaluation, and prediction. During evaluation it also prints a confusion matrix, a classification report, and summary metrics.

In [None]:
class SentimentAnalyzer:
    """Sentiment classifier built from TF-IDF features and logistic regression.

    ``predict`` reports a predicted label of 1 as 'Positive' and any other
    label as 'Negative'.
    """

    # Negation words that must survive stop-word removal: dropping them turns
    # "not good" into "good", which inverts the sentiment signal. The
    # apostrophe-free contractions match what preprocess_text() produces.
    _NEGATIONS = frozenset({
        'no', 'nor', 'not', 'never', 'none', 'nothing', 'neither', 'nobody',
        'nowhere', 'cannot', 'cant', 'couldnt', 'hasnt',
    })

    def __init__(self):
        """Create the unfitted TF-IDF vectorizer and logistic-regression classifier."""
        # Imported here so the class does not depend on an extra module-level import.
        from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

        # Built-in English stop list minus the negations; passed as a list
        # because that is the documented type for a custom stop-word set.
        stop_words = sorted(ENGLISH_STOP_WORDS - self._NEGATIONS)
        self.vectorizer = TfidfVectorizer(
            max_features=5000,
            stop_words=stop_words,
            ngram_range=(1, 2),
        )
        self.classifier = LogisticRegression(max_iter=1000)

    def preprocess_text(self, text):
        """Return ``text`` lowercased and reduced to space-separated a-z words.

        Apostrophes are removed so contractions stay a single token
        ("don't" -> "dont"). Every other character outside a-z becomes a word
        boundary, so "good,bad" yields "good bad" instead of "goodbad".
        Runs of whitespace are collapsed to a single space.
        """
        text = text.lower()
        # Handle both the ASCII and the typographic apostrophe.
        text = text.replace("'", "").replace("\u2019", "")
        # Replace (rather than delete) so adjacent words are not fused.
        text = re.sub(r'[^a-z\s]', ' ', text)
        return ' '.join(text.split())

    def train(self, texts, labels):
        """Fit the vectorizer on the cleaned ``texts`` and the classifier on ``labels``.

        Args:
            texts: iterable of raw review strings.
            labels: target labels, one per text.
        """
        clean_texts = [self.preprocess_text(t) for t in texts]
        X = self.vectorizer.fit_transform(clean_texts)
        self.classifier.fit(X, labels)

    def evaluate(self, test_texts, test_labels):
        """Print evaluation results for the fitted model on a labelled test set.

        Prints the confusion matrix, the classification report, and accuracy,
        precision, recall and F1 rounded to two decimals. Undefined scores are
        reported as 0 (``zero_division=0``). Nothing is returned.

        Args:
            test_texts: iterable of raw review strings.
            test_labels: true labels, one per text.
        """
        clean_texts = [self.preprocess_text(t) for t in test_texts]
        # transform(), not fit_transform(): reuse the vocabulary learned in train().
        X_test = self.vectorizer.transform(clean_texts)
        y_pred = self.classifier.predict(X_test)

        print("Confusion matrix:\n", confusion_matrix(test_labels, y_pred))
        print("\nClassification report:\n", classification_report(test_labels, y_pred, zero_division=0))

        accuracy = accuracy_score(test_labels, y_pred)
        precision = precision_score(test_labels, y_pred, zero_division=0)
        recall = recall_score(test_labels, y_pred, zero_division=0)
        f1 = f1_score(test_labels, y_pred, zero_division=0)

        print(f"Accuracy:  {accuracy:.2f}")
        print(f"Precision: {precision:.2f}")
        print(f"Recall:    {recall:.2f}")
        print(f"F1-Score:  {f1:.2f}")

    def predict(self, text):
        """Classify a single review.

        Args:
            text: raw review string.

        Returns:
            A ``(label, confidence)`` tuple: ``label`` is 'Positive' when the
            predicted class is 1 and 'Negative' otherwise; ``confidence`` is
            the highest class probability as a float.
        """
        text = self.preprocess_text(text)
        X = self.vectorizer.transform([text])
        pred = self.classifier.predict(X)[0]
        # The largest class probability is the probability of the predicted class.
        proba = self.classifier.predict_proba(X).max()
        label = 'Positive' if pred == 1 else 'Negative'
        return label, float(proba)

## Demo Dataset
A small IMDB-like dataset to simulate sentiment analysis.

In [None]:
# Demo reviews paired with their sentiment label (1 = positive, 0 = negative).
labeled_reviews = [
    ("I loved this movie! It was fantastic.", 1),
    ("An excellent film with great performances.", 1),
    ("The movie was boring and too long.", 0),
    ("I hated this film. Terrible acting!", 0),
    ("Good story but weak direction.", 0),
    ("Absolutely amazing experience!", 1),
    ("Not worth watching again.", 0),
    ("It was okay, not bad but not great either.", 0),
]
texts = [review for review, _ in labeled_reviews]
labels = [sentiment for _, sentiment in labeled_reviews]

# Hold out 25% of the reviews for testing. Stratifying on the labels keeps the
# class balance similar in both splits; the fixed seed makes the split repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    stratify=labels,
    random_state=42,
    test_size=0.25,
)

# Show the raw label lists of each split.
print("Train labels distribution:", train_labels)
print("Test labels distribution: ", test_labels)

## Train Model

In [None]:
# Create the analyzer and fit its vectorizer and classifier on the training split.
analyzer = SentimentAnalyzer()
analyzer.train(texts=train_texts, labels=train_labels)

## Evaluate Model

In [None]:
# Print the confusion matrix, classification report and summary scores
# for the held-out test split.
print("\n📊 Model Evaluation:")
analyzer.evaluate(test_texts=test_texts, test_labels=test_labels)

## Predict New Reviews

In [None]:
# Unseen reviews used to try out the trained analyzer.
examples = [
    "This was a masterpiece!",
    "I regret watching this.",
    "Not a good film, sadly.",
]

print("\n🎭 Try New Reviews:")
# map() is lazy, so each review is classified right before its line is printed.
for example, (label, conf) in zip(examples, map(analyzer.predict, examples)):
    print(f"'{example}' → {label} (confidence: {conf:.2f})")

### Notes
- Small dataset = unstable metrics; use a larger dataset for real results.
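- TF-IDF with n-grams helps detect short phrases like 'not good', provided negation words such as 'not' survive stop-word removal (scikit-learn's built-in 'english' stop list removes them).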
- Confusion matrix and classification report help diagnose issues.
- This structure can scale to thousands of reviews and more advanced preprocessing.