# In [1]:
"""
Sentiment Analysis of Restaurant Reviews
----------------------------------------
This project applies Natural Language Processing (NLP) to classify restaurant reviews
as positive or negative using a Naive Bayes classifier.

Steps:
1. Data loading & exploration
2. Text preprocessing (cleaning, tokenization, stopword removal, stemming)
3. Feature extraction using Bag of Words
4. Model training & evaluation
5. Hyperparameter tuning (alpha for MultinomialNB)
6. Custom review prediction
"""

# ===============================
# 1. Import Required Libraries
# ===============================
import re
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix
)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Ignore UserWarnings raised from nltk modules
warnings.filterwarnings("ignore", category=UserWarning, module='nltk')

# Fetch the stopword corpus only when it is not already installed, so that
# repeated (or offline) runs do not contact the NLTK server every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    # quiet=True suppresses the downloader's console output; download()
    # signals failure through its return value instead of raising, so
    # check it here rather than failing later with a less obvious error.
    if not nltk.download('stopwords', quiet=True):
        raise RuntimeError(
            "Could not download the NLTK 'stopwords' corpus; "
            "install it manually with nltk.download('stopwords')."
        ) from None


# ===============================
# 2. Load Dataset
# ===============================
# Read the tab-separated review file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are kept as ordinary text.
data = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

# Quick look at the size, column names, first rows and dtypes.
print(f"Dataset Shape: {data.shape}")
print(f"Columns: {data.columns.tolist()}\n")
print(data.head(5), "\n")
data.info()

# ===============================
# 3. Text Preprocessing
# ===============================
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

# For every review: replace non-letters with spaces, lowercase, split on
# whitespace, drop English stopwords, stem what is left and re-join the
# stems into a single space-separated string.
corpus = [
    ' '.join(
        ps.stem(token)
        for token in re.sub('[^a-zA-Z]', ' ', text).lower().split()
        if token not in stop_words
    )
    for text in data['Review']
]

# ===============================
# 4. Train-Test Split
# ===============================
# Split the cleaned text *before* building features so the bag-of-words
# vocabulary is learned from training reviews only. Fitting the vectorizer
# on the whole corpus would let test-set word frequencies influence which
# terms are kept, leaking held-out information into the model.
y = data.iloc[:, 1].values  # second column is used as the label
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, y, test_size=0.20, random_state=0
)

# ===============================
# 5. Feature Extraction
# ===============================
# Keep at most the 1500 most frequent training terms.
cv = CountVectorizer(max_features=1500)
X_train = cv.fit_transform(corpus_train).toarray()
# The test split is only transformed with the training vocabulary.
X_test = cv.transform(corpus_test).toarray()

# Full-corpus feature matrix, kept so code that still refers to X keeps
# working; it is encoded with the training-only vocabulary.
X = cv.transform(corpus).toarray()

# ===============================
# 6. Model Training (Naive Bayes)
# ===============================
# Fit a multinomial Naive Bayes model using its default smoothing.
classifier = MultinomialNB().fit(X_train, y_train)

# ===============================
# 7. Model Evaluation
# ===============================
# Score the held-out split; label 1 is treated as the positive class.
y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision, recall = (
    metric(y_test, y_pred, pos_label=1)
    for metric in (precision_score, recall_score)
)

print("\n--------- MODEL PERFORMANCE ---------")
print(f"Accuracy : {accuracy*100:.2f}%")
print(f"Precision: {precision*100:.2f}%")
print(f"Recall   : {recall*100:.2f}%")

# Confusion matrix heatmap: rows are actual classes, columns are predictions.
cm = confusion_matrix(y_test, y_pred)
class_names = ['Negative', 'Positive']
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap="Blues",
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

# ===============================
# 8. Hyperparameter Tuning (Alpha)
# ===============================
# Choose alpha by 5-fold cross-validation on the training split only.
# Scoring each candidate on X_test would use the held-out data for model
# selection, which makes the reported "best" accuracy optimistically biased.
best_accuracy = 0.0
best_alpha = 0.0
print("\n------ Alpha Tuning Results ------")
for alpha in np.arange(0.1, 1.1, 0.1):
    # Mean accuracy across the folds for this smoothing value.
    score = cross_val_score(
        MultinomialNB(alpha=alpha), X_train, y_train,
        cv=5, scoring='accuracy'
    ).mean()
    print(f"Alpha={alpha:.1f} => CV Accuracy: {score*100:.2f}%")
    # Strict '>' keeps the smallest alpha when several candidates tie.
    if score > best_accuracy:
        best_accuracy = score
        best_alpha = alpha

print(f"\nBest CV Accuracy: {best_accuracy*100:.2f}% with Alpha={best_alpha:.1f}")

# Retrain with best alpha on the full training split
classifier = MultinomialNB(alpha=best_alpha)
classifier.fit(X_train, y_train)

# The test split is consulted once, only to report the tuned model's accuracy.
test_accuracy = accuracy_score(y_test, classifier.predict(X_test))
print(f"Test Accuracy with Alpha={best_alpha:.1f}: {test_accuracy*100:.2f}%")

# ===============================
# 9. Custom Review Prediction Function
# ===============================
def predict_sentiment(sample_review: str) -> str:
    """
    Classify a single review string as positive or negative.

    The text is reduced to letters, lowercased, stripped of English
    stopwords and stemmed, then encoded with the fitted module-level ``cv``
    vectorizer and scored by the module-level ``classifier``.

    Returns "Positive Review" when the predicted label is 1, otherwise
    "Negative Review".
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', sample_review).lower()
    stems = [
        ps.stem(token)
        for token in letters_only.split()
        if token not in stop_words
    ]
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = cv.transform([' '.join(stems)]).toarray()
    label = classifier.predict(features)[0]
    if label == 1:
        return "Positive Review"
    return "Negative Review"

# ===============================
# 10. Sample Predictions
# ===============================
# A few hand-written inputs to sanity-check the trained classifier.
sample_reviews = [
    "The food is really bad.",
    "Food was pretty bad and the service was very slow.",
    "The food was absolutely wonderful, from preparation to presentation, very pleasing.",
    "food averag"
]

print("\n------ SAMPLE PREDICTIONS ------")
for review in sample_reviews:
    print(f"Review: {review}")
    sentiment = predict_sentiment(review)
    print(f"Sentiment: {sentiment}\n")

# ===============================
# 11. Interactive User Input
# ===============================
# Keep prompting until the user types 'exit' or the input stream ends.
while True:
    try:
        # Strip so that ' exit ' still quits and whitespace-only input is
        # recognised as empty.
        user_input = input("Enter a review (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed (e.g. a non-interactive run) or the user pressed
        # Ctrl-C: leave cleanly instead of ending with a traceback.
        print("\nExiting sentiment prediction. Goodbye!")
        break
    if user_input.lower() == 'exit':
        print("Exiting sentiment prediction. Goodbye!")
        break
    if not user_input:
        # Nothing to classify; prompt again rather than scoring empty text.
        continue
    print("Predicted Sentiment:", predict_sentiment(user_input), "\n")

# Notebook output: ModuleNotFoundError: No module named 'nltk'