In [1]:
# %% [markdown]
# # Project 5: NLP Sentiment Analysis
# **Objective:** Classify text as Positive or Negative using Natural Language Processing.
#
# **Tech Stack:**
# * Scikit-Learn (TF-IDF, Logistic Regression)
# * Pandas
# * Joblib

# %%
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import joblib
import os

# %%
# 1. Load Data
# We'll use a clean dataset of reviews from GitHub (Yelp/Amazon/IMDb combined subset)
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/review_polarity.csv"
# Note: Since raw URLs can be unstable, if that fails, we can mock a dataset or use another source.
# Let's use a specific labelled dataset for robustness (SMS Spam is also common, but Sentiment is better for portfolio).
# Using a clear Sentiment Dataset:
url = "https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/imdb_sentiment.csv"

try:
    df = pd.read_csv(url)
    print("Dataset loaded.")
except:
    print("Fallback: Creating a dummy dataset for demonstration if URL fails.")
    data = {'review': ['I love this movie', 'This is terrible', 'Amazing product', 'Worst experience ever'],
            'sentiment': ['positive', 'negative', 'positive', 'negative']}
    df = pd.DataFrame(data)

# Check columns (Dataset usually has 'review' and 'sentiment')
df.head()

# %%
# 2. Preprocessing
# Convert labels to 0 and 1 if they are text
# Assuming dataset has 'sentiment' column with 'positive'/'negative'
if df['sentiment'].dtype == 'object':
    df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})
else:
    df['label'] = df['sentiment'] # Already numeric

X = df['review']
y = df['label']

# %%
# 3. Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# %%
# 4. Build Pipeline
# Instead of doing Vectorization and Model separately, a Pipeline is cleaner.
# TF-IDF: Converts words to numbers based on importance.
# LogisticRegression: The classifier.

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000)),
    ('clf', LogisticRegression())
])

print("Training model...")
pipeline.fit(X_train, y_train)
print("Training completed.")

# %%
# 5. Evaluation
y_pred = pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Test with a custom string
test_review = ["The movie was fantastic and I loved the acting"]
prediction = pipeline.predict(test_review)[0]
print(f"Test review: '{test_review[0]}' -> Prediction: {'Positive' if prediction==1 else 'Negative'}")

# %%
# 6. Save Model
if not os.path.exists('models'):
    os.makedirs('models')

# Save the entire pipeline (includes the vectorizer vocabulary and the model)
joblib.dump(pipeline, 'models/nlp_sentiment_model.pkl')
print("✅ Success! Model saved to 'models/nlp_sentiment_model.pkl'")

Fallback: Creating a dummy dataset for demonstration if URL fails.
Training model...
Training completed.
Accuracy: 0.0
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0

Test review: 'The movie was fantastic and I loved the acting' -> Prediction: Positive
✅ Success! Model saved to 'models/nlp_sentiment_model.pkl'


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
