# Sentiment Analysis — End-to-End Pipeline

This notebook contains a complete end-to-end sentiment analysis pipeline (data inspection, cleaning, preprocessing, EDA, vectorization, modeling, evaluation, and saving artifacts). Copy & run cells in Jupyter. Adjust file paths if necessary.

In [None]:

# === Imports & Settings ===
import os, re, warnings, joblib
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

warnings.filterwarnings("ignore")
plt.rcParams.update({"figure.max_open_warning": 0, "figure.figsize": (8,4)})


In [None]:

# -------------------------
# Paths & Input
# -------------------------
# Where the raw reviews are read from, where the cleaned copy is written,
# and where fitted artifacts are stored.
INPUT_PATH = "/mnt/data/sentiment_reviews.csv"  # change if needed
CLEANED_CSV = "/mnt/data/sentiment_reviews_cleaned.csv"
MODEL_DIR = "/mnt/data/models"

# exist_ok=True makes this cell safe to re-run when the folder already exists.
os.makedirs(MODEL_DIR, exist_ok=True)

# Load the raw data and take a first look: size, a few rows, gaps per column.
df = pd.read_csv(INPUT_PATH)
print("Shape:", df.shape)
display(df.head())
missing_per_column = df.isnull().sum()
print("\nMissing values:\n", missing_per_column)


In [None]:

# -------------------------
# Cleaning
# -------------------------
# Fail fast: without labels nothing downstream can be trained.
if "sentiment" not in df.columns:
    raise ValueError("No 'sentiment' column found.")

# Drop rows with a missing review or label BEFORE casting to str.
# astype(str) turns NaN into the literal text "nan", which would otherwise
# pass the non-empty check below and end up in the data as a fake review.
df = df.dropna(subset=["review_text", "sentiment"]).copy()
df["review_text"] = df["review_text"].astype(str)

# Remove whitespace-only reviews ("" is falsy, so astype(bool) acts as the mask).
df = df[df["review_text"].str.strip().astype(bool)]

# De-duplicate last, so the copy that is kept is always a labelled, non-empty
# row (deduplicating first could keep an unlabelled copy and then lose the text
# entirely when that row is filtered out).
df = df.drop_duplicates(subset=["review_text"]).copy()

# Dates are optional; values that cannot be parsed become NaT instead of raising.
if "review_date" in df.columns:
    df["review_date"] = pd.to_datetime(df["review_date"], errors="coerce")

print("After cleaning shape:", df.shape)
display(df.head())


In [None]:

# -------------------------
# Preprocessing (text)
# -------------------------
# Prefer NLTK's Porter stemmer; if it cannot be imported or constructed,
# fall back to a tiny suffix-stripping stand-in with the same .stem() method.
try:
    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer()
except Exception:
    class _NaiveStemmer:
        """Minimal fallback stemmer: strips at most one known suffix."""

        # Checked in this order; the first suffix that qualifies wins.
        _SUFFIXES = ("ing", "ly", "ed", "es", "s")

        def stem(self, w):
            """Return `w` without its suffix, or `w` unchanged.

            A suffix is only removed when more than two characters would
            remain in front of it.
            """
            hit = next(
                (suf for suf in self._SUFFIXES
                 if w.endswith(suf) and len(w) > len(suf) + 2),
                None,
            )
            return w if hit is None else w[:-len(hit)]

    stemmer = _NaiveStemmer()

def preprocess_text(text):
    """Normalise one raw review into a space-separated string of stems.

    Steps: lower-case, blank out URLs, blank out every character that is not
    a-z or whitespace, collapse whitespace, drop one-character tokens, and stem
    what is left with the module-level `stemmer`.

    Returns "" for missing values (anything `pd.isna` reports as missing).
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()
    # URLs are removed first, while their punctuation is still intact.
    for pattern in (r"http\S+|www\.\S+", r"[^a-z\s]"):
        lowered = re.sub(pattern, " ", lowered)
    words = re.sub(r"\s+", " ", lowered).strip().split()

    stems = []
    for word in words:
        if len(word) > 1:
            stems.append(stemmer.stem(word))
    return " ".join(stems)

# Add the cleaned text as a new column and compare it with the raw text.
df["text_clean"] = df["review_text"].map(preprocess_text)
display(df.loc[:, ["review_text", "text_clean"]].head())


In [None]:

# -------------------------
# EDA
# -------------------------
# How many reviews fall into each sentiment class.
sent_counts = df["sentiment"].value_counts()
print("Sentiment distribution:\n", sent_counts)
plt.bar(sent_counts.index, sent_counts.values)
plt.title("Sentiment distribution")
plt.show()

# Ratings are optional; when present, show them in ascending rating order.
has_rating = "rating" in df.columns
if has_rating:
    rating_counts = df["rating"].value_counts().sort_index()
    print("Rating distribution:\n", rating_counts)
    # Cast to str so the bars are drawn as categories.
    rating_labels = rating_counts.index.astype(str)
    plt.bar(rating_labels, rating_counts.values)
    plt.title("Rating distribution")
    plt.show()

def top_n_words(series, n=20):
    """Return the `n` most frequent whitespace-separated tokens in `series`.

    `series` is any iterable of strings. The result is a DataFrame with the
    columns "word" and "count", most frequent first.
    """
    tallies = Counter(token for doc in series for token in doc.split())
    return pd.DataFrame(tallies.most_common(n), columns=["word", "count"])

print("\nTop words overall:")
overall_top = top_n_words(df["text_clean"], n=20)
display(overall_top)

# Repeat per class: a table plus a bar chart of the 15 most common stems.
for s in df["sentiment"].unique():
    print(f"\nTop words for sentiment = {s}:")
    in_class = df["sentiment"] == s
    top_df = top_n_words(df.loc[in_class, "text_clean"], n=15)
    display(top_df)
    plt.bar(top_df["word"], top_df["count"])
    plt.title(f"Top words ({s})")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Monthly review volume; only drawn when at least one usable date exists.
if "review_date" in df.columns and df["review_date"].notna().sum() > 0:
    # Keep missing dates (NaT) out of the time index before resampling.
    dated = df[df["review_date"].notna()].set_index("review_date")
    try:
        # "ME" (month end) is the current alias; plain "M" is deprecated for
        # resampling in pandas >= 2.2.
        df_time = dated.resample("ME").size().rename("count")
    except ValueError:
        # Older pandas does not recognise "ME" and only accepts "M".
        df_time = dated.resample("M").size().rename("count")
    # A single month is not a trend, so skip the line chart in that case.
    if len(df_time) > 1:
        plt.plot(df_time.index, df_time.values)
        plt.title("Reviews over time (monthly)")
        plt.show()


In [None]:

# -------------------------
# Vectorize & Encode
# -------------------------
# Inputs are the cleaned review strings; targets are the raw sentiment labels.
X = df["text_clean"].values.astype(str)
y = df["sentiment"].values

# Map the labels to integer codes; `le` is kept to translate predictions back.
le = LabelEncoder().fit(y)
y_enc = le.transform(y)
print("Label classes:", le.classes_)

# NOTE(review): the vectorizer is fitted on every row here, before the
# train/test split in the next cell, so test-set vocabulary and IDF statistics
# leak into the features. Consider fitting on the training rows only.
tfidf_settings = {
    "max_features": 5000,
    "ngram_range": (1, 2),  # unigrams and bigrams
    "stop_words": "english",
}
vectorizer = TfidfVectorizer(**tfidf_settings)
X_vec = vectorizer.fit_transform(X)
print("Vectorized shape:", X_vec.shape)


In [3]:

# -------------------------
# Train/Test split & Models
# -------------------------
# Hold out 20% of the rows; stratifying keeps class proportions equal in both parts.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y_enc}
X_train, X_test, y_train, y_test = train_test_split(X_vec, y_enc, **split_options)

# Baseline candidates, all trained on the same TF-IDF features.
models = dict(
    MultinomialNB=MultinomialNB(),
    LogisticRegression=LogisticRegression(max_iter=1000),
    RandomForest=RandomForestClassifier(n_estimators=100, random_state=42),
)

# name -> fitted estimator plus its held-out accuracy and macro-F1.
results = {}
for name in models:
    model = models[name]
    print(f"\n--- Training: {name} ---")
    y_pred = model.fit(X_train, y_train).predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    print(f"{name} accuracy: {acc:.4f} f1_macro: {f1:.4f}")
    print(classification_report(y_test, y_pred, target_names=le.classes_))

    cm = confusion_matrix(y_test, y_pred)
    print("Confusion matrix:\n", cm)

    results[name] = dict(model=model, accuracy=acc, f1_macro=f1)


NameError: name 'X_vec' is not defined

In [4]:

# -------------------------
# Save best model & artifacts
# -------------------------
# The winner is the baseline with the highest macro-F1 on the held-out split.
best_name = max(results, key=lambda candidate: results[candidate]["f1_macro"])
best_entry = results[best_name]
best_model = best_entry["model"]
print("Best model:", best_name, "f1_macro:", best_entry["f1_macro"])

# Persist everything needed for inference: the model, the fitted vectorizer,
# and the label encoder that maps predictions back to sentiment names.
artifact_files = {
    f"{best_name}_model.pkl": best_model,
    "tfidf_vectorizer.pkl": vectorizer,
    "label_encoder.pkl": le,
}
for file_name, artifact in artifact_files.items():
    joblib.dump(artifact, os.path.join(MODEL_DIR, file_name))
print("Saved artifacts to:", MODEL_DIR)

# Also keep the cleaned data so the run can be inspected or reused later.
df.to_csv(CLEANED_CSV, index=False)
print("Saved cleaned CSV to:", CLEANED_CSV)


NameError: name 'results' is not defined

In [5]:

# -------------------------
# Predict helper (examples)
# -------------------------
def predict_texts(texts, model=best_model, vectorizer_obj=vectorizer, label_enc=le):
    """Predict sentiment labels for raw review texts.

    Parameters
    ----------
    texts : str or iterable of str
        Raw reviews. A single string is treated as one review rather than
        being iterated character by character.
    model, vectorizer_obj, label_enc
        Fitted classifier, fitted vectorizer and fitted label encoder. The
        defaults are bound when this cell runs, so re-run the cell after
        replacing `best_model`, `vectorizer` or `le`.

    Returns
    -------
    numpy.ndarray
        One decoded label per input text; an empty array when no texts are given.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts_clean = [preprocess_text(t) for t in texts]
    if not texts_clean:
        # Nothing to score; return early instead of passing empty input to the
        # vectorizer and model.
        return np.array([])
    Xv = vectorizer_obj.transform(texts_clean)
    preds = model.predict(Xv)
    return label_enc.inverse_transform(preds)

# One clearly positive, one clearly negative and one lukewarm review.
samples = [
    "This product is amazing, works exactly as described. Highly recommend!",
    "Terrible service, product broke after 2 days. Do not buy.",
    "It's okay, not great but not bad either."
]
print("Sample predictions:")
sample_labels = predict_texts(samples)
for position, s in enumerate(samples):
    p = sample_labels[position]
    print("->", s, "=>", p)

# Example how to load outside:
# loaded_vec = joblib.load('/mnt/data/models/tfidf_vectorizer.pkl')
# loaded_model = joblib.load('/mnt/data/models/MultinomialNB_model.pkl')
# loaded_le = joblib.load('/mnt/data/models/label_encoder.pkl')


NameError: name 'best_model' is not defined

### Optional: Hyperparameter tuning
Use GridSearchCV on LogisticRegression or RandomForest to find better hyperparameters. Example code is provided in the hyperparameter-tuning cell of the extension below.

## Extension: Advanced EDA, Cross-Validation, Hyperparameter Tuning, Error Analysis & Pipeline

This extension adds the following sections:

1. Advanced EDA (wordcloud, class balance plots, length distributions)
2. Cross-validation and more robust model evaluation
3. Hyperparameter tuning using GridSearchCV for LogisticRegression & RandomForest
4. Error analysis — show misclassified examples and analyze common mistakes
5. Build and save an sklearn `Pipeline` containing preprocessing + vectorizer + model

Run the cells in order after running the earlier notebook cells (or run the whole notebook).

In [6]:

# -------------------------
# Advanced EDA
# -------------------------
# 1) Review length distribution
# 1) Review length distribution (length = number of characters in the raw text)
review_lengths = df['review_text'].str.len()
df['review_length'] = review_lengths
print('Review length stats:')
print(df['review_length'].describe())

plt.figure(figsize=(8,4))
plt.hist(df['review_length'], bins=30)
plt.title('Review length distribution')
plt.xlabel('characters')
plt.ylabel('frequency')
plt.show()

# 2) Class balance percentage (share of each sentiment, in percent)
prop = df['sentiment'].value_counts(normalize=True).mul(100)
print('\nClass balance (%):\n', prop)
plt.figure(figsize=(6,4))
plt.pie(prop.values, labels=prop.index, autopct='%1.1f%%')
plt.title('Sentiment share (%)')
plt.show()

# 3) Optional: WordCloud (if wordcloud package installed)
# The word cloud is optional: a missing package is reported separately from a
# failure while drawing, so the real cause is visible in the output.
try:
    from wordcloud import WordCloud
except ImportError:
    print('wordcloud not available — skip. (Install wordcloud for nicer visuals)')
else:
    try:
        all_text = ' '.join(df['text_clean'].values)
        wc = WordCloud(width=800, height=400).generate(all_text)
        plt.figure(figsize=(10,4))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud (all reviews)')
        plt.show()
    except Exception as e:
        # Best-effort visual: report why it failed and let the notebook continue.
        print(f'wordcloud failed to generate — skip. ({type(e).__name__}: {e})')


NameError: name 'df' is not defined

In [7]:

# -------------------------
# Cross-validation: evaluate models using cross_val_score (3-fold)
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Shuffled, stratified 3-fold split with a fixed seed so scores are repeatable.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# model name -> array of per-fold macro-F1 scores
cv_results = {}
for name, model in models.items():
    scores = cross_val_score(model, X_vec, y_enc, cv=skf, scoring='f1_macro', n_jobs=-1)
    cv_results[name] = scores
    print(f'{name}  f1_macro scores: {scores}  mean: {scores.mean():.4f}  std: {scores.std():.4f}')

# Visualize CV results
plt.figure(figsize=(6,4))
cv_names = list(cv_results)
plt.boxplot([cv_results[n] for n in cv_names])
# Boxes are drawn at positions 1..N by default. Labelling the ticks directly
# avoids boxplot's `labels=` keyword, which is deprecated in Matplotlib 3.9
# in favour of `tick_labels=`, so this works on old and new versions alike.
plt.xticks(range(1, len(cv_names) + 1), cv_names)
plt.title('Cross-validation f1_macro distribution')
plt.ylabel('f1_macro')
plt.show()


NameError: name 'models' is not defined

In [8]:

# -------------------------
# Hyperparameter tuning (GridSearchCV) for LogisticRegression and RandomForest
from sklearn.model_selection import GridSearchCV

# name -> best estimator found by its grid search
tuned_models = {}

# Both searches share the same protocol: 3-fold CV, macro-F1, all cores.
search_options = dict(cv=3, scoring='f1_macro', n_jobs=-1)

# Logistic Regression: vary only the regularisation strength C.
param_grid_lr = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l2'],
    solver=['lbfgs'],
)
gs_lr = GridSearchCV(LogisticRegression(max_iter=2000), param_grid_lr, **search_options)
gs_lr.fit(X_train, y_train)
print('LogisticRegression best params:', gs_lr.best_params_, 'best f1_macro:', gs_lr.best_score_)
tuned_models['LogisticRegression'] = gs_lr.best_estimator_

# Random Forest: a deliberately small grid to keep the run time down.
param_grid_rf = dict(
    n_estimators=[50, 100],
    max_depth=[None, 20],
    min_samples_split=[2, 5],
)
gs_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, **search_options)
gs_rf.fit(X_train, y_train)
print('RandomForest best params:', gs_rf.best_params_, 'best f1_macro:', gs_rf.best_score_)
tuned_models['RandomForest'] = gs_rf.best_estimator_

# Score each tuned estimator on the held-out test split.
for name, model in tuned_models.items():
    y_pred = model.predict(X_test)
    print(f'\n{name} on test set:')
    test_accuracy = accuracy_score(y_test, y_pred)
    print('accuracy:', test_accuracy)
    print(classification_report(y_test, y_pred, target_names=le.classes_))


NameError: name 'X_train' is not defined

In [9]:

# -------------------------
# Error analysis: show misclassified examples from the best baseline model (or tuned model if better)
# We'll use the best_model (selected earlier). If tuned logistic/rf improved, you can replace best_model.
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedShuffleSplit

# Confusion matrix display for best_model
y_pred_best = best_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_best)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix (best_model)')
plt.show()

# train_test_split did not return row positions, so rebuild them with the same
# test_size and seed in order to look up the original review texts.
# NOTE(review): this relies on train_test_split(stratify=...) drawing the same
# rows as StratifiedShuffleSplit for identical settings — checked just below.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(sss.split(X_vec, y_enc))

test_texts_orig = df.iloc[test_idx]['review_text'].values
test_texts_clean = df.iloc[test_idx]['text_clean'].values
y_test_orig = y_enc[test_idx]
if not np.array_equal(y_test_orig, y_test):
    # The labels differ, so these are not the rows the model was evaluated on.
    print('WARNING: reconstructed test rows do not match y_test; '
          'the examples below may not be the actual held-out rows.')

# Predict on exactly the rows whose text is shown, so every printed
# (text, true, predicted) triple is consistent.
y_pred_orig = best_model.predict(X_vec[test_idx])

# Decode all labels once instead of one call per example.
true_names = le.inverse_transform(y_test_orig)
pred_names = le.inverse_transform(y_pred_orig)

# Positions (within the test subset) where prediction and truth disagree.
mis_idx = np.flatnonzero(y_pred_orig != y_test_orig)
print(f'Misclassified count: {len(mis_idx)} (showing up to 20 examples)')
for i in mis_idx[:20]:
    print('\n--- Example ---')
    print('Original review:', test_texts_orig[i])
    print('Cleaned review:', test_texts_clean[i])
    print('True label:', true_names[i])
    print('Predicted:', pred_names[i])


NameError: name 'best_model' is not defined

In [None]:

# -------------------------
# Create sklearn Pipeline (preprocessing (identity) + vectorizer + model)
from sklearn.pipeline import Pipeline
# A small stateless transformer that lets the notebook's preprocess_text
# function run as the first step of a Pipeline.
from sklearn.base import BaseEstimator, TransformerMixin

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Pipeline step that applies `preprocess_text` to every document."""

    def fit(self, X, y=None):
        """Learn nothing; return self so the step fits the estimator API."""
        return self

    def transform(self, X):
        """Return a list holding the cleaned version of each item in `X`."""
        cleaned = []
        for doc in X:
            cleaned.append(preprocess_text(doc))
        return cleaned

from sklearn.base import clone

# Build pipeline using the tuned logistic regression (if exists) else best_model
chosen_model = tuned_models.get('LogisticRegression', best_model)
pipeline = Pipeline([
    ('preprocess', TextPreprocessor()),
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1,2), stop_words='english')),
    # clone() yields an unfitted copy with the same hyperparameters. Fitting the
    # pipeline therefore does not refit `chosen_model` in place, which would
    # silently replace the estimator held in `tuned_models` / `best_model` with
    # one trained on all rows, including the test rows.
    ('clf', clone(chosen_model))
])

# Fit pipeline on full data for final artifact.
# The targets are the encoded labels, so pipeline.predict returns integer
# codes; use le.inverse_transform to get sentiment names back.
pipeline.fit(df['review_text'].values, y_enc)

# Save pipeline.
# NOTE(review): the pickle refers to TextPreprocessor and preprocess_text,
# which are defined in this notebook; they presumably have to be defined or
# importable again wherever the file is loaded — confirm before deploying.
pipeline_path = os.path.join(MODEL_DIR, 'sentiment_pipeline.pkl')
joblib.dump(pipeline, pipeline_path)
print('Saved sklearn Pipeline to:', pipeline_path)


### Final notes

- This extended notebook added advanced EDA, cross-validation, hyperparameter tuning, error analysis, and an sklearn `Pipeline` saved as `sentiment_pipeline.pkl`.
- After running, you can upload the extended notebook to GitHub and include the model files in a `models/` folder (or link to a cloud storage if file size is large).
- Consider adding a short `README.md` describing the project (problem statement, dataset, approach, results) so the repository is easy to understand on GitHub.