In [5]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

nltk.download("wordnet")
nltk.download("stopwords")

# ✅ Step 1: Load the Dataset
df = pd.read_csv("food_reviews.csv") # Load dataset
print(df.head()) # Display first few rows

# ✅ Step 2: Improved Text Preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    text = text.lower() # Convert to lowercase
    text = re.sub(f"[{string.punctuation}]", "", text) # Remove punctuation
    text = re.sub(r"\d+", "", text) # Remove numbers
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words] # Lemmatization & Stopwords
    return " ".join(words)

df["cleaned_review"] = df["review"].apply(preprocess_text)

# ✅ Step 3: Convert Text to Numerical Features (TF-IDF)
vectorizer = TfidfVectorizer(max_features=1000)  # Increased feature size
X = vectorizer.fit_transform(df["cleaned_review"])
y = df["sentiment"]

# ✅ Step 4: Split Data into Train and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ✅ Step 5: Hyperparameter Optimization with Class Weight Handling
param_grid = {
    "n_estimators": [50, 100],  # Reduced to prevent overfitting
    "max_depth": [None, 10],
    "min_samples_split": [5, 10]
}

rf_classifier = RandomForestClassifier(random_state=42, class_weight="balanced")  # Handle class imbalance

grid_search = GridSearchCV(rf_classifier, param_grid, cv=3, scoring="accuracy", n_jobs=1)
grid_search.fit(X_train, y_train)

# Get the best model
best_rf = grid_search.best_estimator_

# ✅ Step 6: Model Evaluation
y_pred = best_rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Best Model Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ✅ Step 7: Sample Prediction
sample_texts = [
    "The pasta was very tasty.",
    "I couldn’t finish it, the taste was horrible.",
    "Every bite was an explosion of flavors, loved it!",
    "Not worth the money, very disappointing meal."
]
sample_texts = [preprocess_text(text) for text in sample_texts]
sample_vectorized = vectorizer.transform(sample_texts)
sample_predictions = best_rf.predict(sample_vectorized)

for text, prediction in zip(sample_texts, sample_predictions):
    print(f"\nSample Review: {text}")
    print("Predicted Sentiment:", "Positive" if prediction == 1 else "Negative")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cse-03\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cse-03\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


                                              review  sentiment
0  The pasta was cooked to perfection and tasted ...          1
1        I had the worst burger ever, totally burnt.          0
2                 The sushi was fresh and delicious.          1
3                   Soup was cold and had no flavor.          0
4                Absolutely loved the desserts here!          1
Best Model Accuracy: 1.00

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      1.00      1.00        12

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20


Sample Review: pasta tasty
Predicted Sentiment: Positive

Sample Review: couldn’t finish taste horrible
Predicted Sentiment: Positive

Sample Review: every bite explosion flavor loved
Predicted Sentiment: Positive

Sample Review: worth m

In [4]:
!pip install nltk


Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting click (from nltk)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.4.16-cp37-cp37m-win_amd64.whl.metadata (41 kB)
     -------------------------------------- 42.0/42.0 kB 406.8 kB/s eta 0:00:00
Collecting tqdm (from nltk)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
     -------------------------------------- 57.7/57.7 kB 752.9 kB/s eta 0:00:00
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 1.5/1.5 MB 5.7 MB/s eta 0:00:00
Downloading regex-2024.4.16-cp37-cp37m-win_amd64.whl (269 kB)
   ---------------------------------------- 269.6/269.6 kB 3.3 MB/s eta 0:00:00
Downloading click-8.1.8-py3-none-any.whl (98 kB)
   ---------------------------------------- 98.2/98.2 kB 1.1 MB/s eta 0:00:00
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
   -----------------------------