# In [None]:
# Import libraries
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the NLTK stop-word list used by the preprocessing step
nltk.download('stopwords')

# Load Dataset
# The tab-separated file must provide a "Review" text column and a "Liked"
# label column (1 for positive, 0 for negative).
# quoting=3 is csv.QUOTE_NONE: quote characters inside a review stay literal text.
data = pd.read_csv("Restaurant_Reviews.tsv", sep="\t", quoting=3)

# Step 1: Data Cleaning and Preprocessing
ps = PorterStemmer()

# NLTK's English stop-word list includes negations. Dropping them turns
# "not good" into "good" and erases the very signal a sentiment model needs,
# so negations are kept as ordinary tokens.
negation_words = {'no', 'nor', 'not'}
stop_words = set(stopwords.words('english')) - negation_words

# Compiled once instead of looking the pattern up again for every review
non_letters = re.compile(r'[^a-zA-Z]')


def _clean_review(text):
    """Return *text* as a space-joined string of stemmed, lowercase tokens.

    Every non-alphabetic character is replaced by a space, the text is
    lowercased and split on whitespace, stop words are dropped, and each
    remaining word is reduced to its Porter stem.
    """
    tokens = non_letters.sub(' ', text).lower().split()
    return ' '.join(ps.stem(token) for token in tokens if token not in stop_words)


# One cleaned string per review, in the same order as data['Review']
corpus = [_clean_review(review) for review in data['Review']]

# Step 2: Train-Test Split
# Labels (Target)
y = data['Liked'].values

# Split the cleaned text BEFORE vectorizing. Fitting the vectorizers on the
# full corpus would let the test reviews shape the vocabulary and the IDF
# weights (data leakage) and make the reported accuracies optimistic.
# A single split also keeps the BOW and TF-IDF matrices aligned with the same
# y_train / y_test without relying on two separately seeded calls.
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, y, test_size=0.2, random_state=42
)

# Step 3: Feature Extraction (vocabulary learned from the training reviews only)
# Bag of Words (BOW)
cv = CountVectorizer(max_features=1500)  # Limit to 1500 most frequent words
X_train_bow = cv.fit_transform(corpus_train).toarray()
X_test_bow = cv.transform(corpus_test).toarray()

# TF-IDF
tfidf = TfidfVectorizer(max_features=1500)
X_train_tfidf = tfidf.fit_transform(corpus_train).toarray()
X_test_tfidf = tfidf.transform(corpus_test).toarray()

# Full-corpus matrices, kept so any existing reference to X_bow / X_tfidf
# still resolves. They are encoded with the train-fitted vectorizers and must
# not be used to fit or tune a model that is then scored on the test rows.
X_bow = cv.transform(corpus).toarray()
X_tfidf = tfidf.transform(corpus).toarray()

# Step 4: Model Training and Evaluation
# One seed for every estimator whose training involves randomness, so that
# repeated runs report the same accuracies (the train/test split is seeded
# with the same value).
RANDOM_STATE = 42

# Display name -> unfitted classifier; evaluate_models() iterates this dict.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    "Naive Bayes": MultinomialNB(),
    "SVM": SVC(kernel='linear', C=1),
    "AdaBoost": AdaBoostClassifier(n_estimators=50, random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=RANDOM_STATE),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE),
}

def evaluate_models(X_train, X_test, y_train, y_test):
    """Fit and score every estimator in the module-level ``models`` dict.

    Each estimator is trained on ``(X_train, y_train)`` and then predicts
    ``X_test``. Its accuracy and classification report are printed, followed
    by a separator line. The estimators are fitted in place, so ``models``
    holds the most recent fit after the call.

    Returns:
        dict mapping each model name to its accuracy on the test data.
    """
    separator = "-" * 50
    scores = {}
    for label in models:
        estimator = models[label]
        predictions = estimator.fit(X_train, y_train).predict(X_test)
        score = accuracy_score(y_test, predictions)
        scores[label] = score
        print(f"{label} Accuracy: {score:.4f}")
        print(classification_report(y_test, predictions))
        print(separator)
    return scores

# Score every model on each feature representation
print("Evaluating on BOW Features:")
results_bow = evaluate_models(X_train_bow, X_test_bow, y_train, y_test)

print("Evaluating on TF-IDF Features:")
results_tfidf = evaluate_models(X_train_tfidf, X_test_tfidf, y_train, y_test)

# Step 5: Compare Results
# Both result dicts are filled in the same model order, so the three columns
# line up row by row.
comparison = {
    "Model": [*results_bow],
    "BOW Accuracy": [*results_bow.values()],
    "TF-IDF Accuracy": [*results_tfidf.values()],
}
results_df = pd.DataFrame(comparison)
# Best BOW performer first
results_df = results_df.sort_values(by="BOW Accuracy", ascending=False)

print(results_df)

# Visualization
# Reshape to long form (one row per model / feature-set pair) so seaborn can
# draw the two accuracies as side-by-side bars. Two separate barplot calls on
# the same axes place both bars at identical x positions, where they overlap
# and the y-axis label only names the series drawn last.
plot_df = results_df.melt(
    id_vars="Model",
    value_vars=["BOW Accuracy", "TF-IDF Accuracy"],
    var_name="Features",
    value_name="Accuracy",
)
# Shorten the series names to the legend labels "BOW" and "TF-IDF"
plot_df["Features"] = plot_df["Features"].str.replace(" Accuracy", "", regex=False)

plt.figure(figsize=(10, 6))
sns.barplot(
    x="Model",
    y="Accuracy",
    hue="Features",
    data=plot_df,
    palette={"BOW": "b", "TF-IDF": "r"},
    alpha=0.7,
)
plt.xticks(rotation=45)
plt.title("Model Accuracy Comparison (BOW vs. TF-IDF)")
plt.legend()
# Keep the rotated model names from being clipped at the figure edge
plt.tight_layout()
plt.show()
