In [1]:
# --- Import libraries ---
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

In [3]:
# --- Download NLTK data ---
# Probe for the English stopword list; a LookupError from NLTK is treated as
# "corpus not installed" and triggers a one-time download.
try:
    stopwords.words('english')
except LookupError:
    print("Downloading NLTK stopwords...")
    # nltk.download() reports failure through its boolean return value rather
    # than by raising, so only announce success when it actually succeeded.
    if nltk.download('stopwords'):
        print("Download complete.")
    else:
        print("Download failed - the 'stopwords' corpus is still unavailable.")


In [7]:
# --- Load and Prepare Data ---
print("Loading dataset...")
# Tab-separated file, resolved relative to the current working directory.
# quoting=3 is the numeric value of csv.QUOTE_NONE, so quote characters inside
# a review are kept as literal text instead of being parsed as field quoting.
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)
print("Dataset loaded successfully.")

Loading dataset...
Dataset loaded successfully.


In [9]:
# --- Initialize stemmer and stop words ---
# Module-level objects read by preprocess_text(): one Porter stemmer and the
# English stopword collection from NLTK.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# Take 'not' out of the stopwords so it survives preprocessing
# (presumably because negation carries sentiment - confirm intent).
all_stopwords.remove('not')

In [11]:
# --- Cleans and preprocesses a single review string ---
def preprocess_text(text):
    """Normalise one review into a space-joined string of stemmed tokens.

    Steps: replace every non-ASCII-letter character with a space, lowercase,
    split on whitespace, drop stopwords, and stem what remains.

    Relies on the module-level ``ps`` (stemmer) and ``all_stopwords``
    (stopword collection) defined in an earlier cell.

    Args:
        text: The raw review string.

    Returns:
        The cleaned review as a single string; empty if no token survives.
    """
    # Build the lookup set once per call. The previous version rebuilt it
    # inside the comprehension condition, i.e. once for every token.
    stop_set = set(all_stopwords)
    # Keep only letters (everything else becomes a separator), then tokenize.
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Filter first, then stem, so stopword matching sees the unstemmed token.
    return ' '.join(ps.stem(token) for token in tokens if token not in stop_set)


In [13]:
# --- Apply preprocessing to every review and extract the labels ---
print("Preprocessing text data ...")
# corpus: one cleaned string per row of dataset['Review'].
corpus = dataset['Review'].apply(preprocess_text)
# y: the values of the 'Liked' column, used as the classification target.
y = dataset['Liked'].values
print("Preprocessing complete.")

Preprocessing text data ...
Preprocessing complete.


In [17]:
# Preview the cleaned reviews produced by preprocess_text().
corpus

0                                         wow love place
1                                         crust not good
2                                 not tasti textur nasti
3      stop late may bank holiday rick steve recommen...
4                                select menu great price
                             ...                        
995                        think food flavor textur lack
996                               appetit instantli gone
997                 overal not impress would not go back
998    whole experi underwhelm think go ninja sushi n...
999    wast enough life pour salt wound draw time too...
Name: Review, Length: 1000, dtype: object

In [21]:
# Echo the label array for a visual check of the target values.
# NOTE(review): this prints the whole array; a slice such as y[:10] would keep
# the cell output short.
y

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,

In [23]:
# ---  Split Data ---
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable from run to run.
# NOTE(review): the features passed here are the raw dataset['Review'] strings,
# not the cleaned `corpus` built earlier, so preprocess_text() has no effect on
# the trained model - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    dataset['Review'],
    y,
    test_size=0.20,
    random_state=42,
)

In [25]:
# ---  Build and Train the Pipeline ---
print("Building and training the model pipeline...")
# Two named stages chained into a single estimator:
#   'tfidf' - TF-IDF features over unigrams and bigrams, capped at 1500 terms
#   'clf'   - Multinomial Naive Bayes fitted on those features
text_clf = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(max_features=1500, ngram_range=(1, 2)),
    ),
    (
        'clf',
        MultinomialNB(),
    ),
])

# Fitting the pipeline fits the vectorizer and the classifier in one call,
# using the raw review text from the training split.
text_clf.fit(X_train, y_train)
print("Training complete.")

Building and training the model pipeline...
Training complete.


In [27]:
# --- Evaluate on the held-out split ---
print("\n--- Model Evaluation ---")
# Predictions for the test reviews; all metrics below compare these to y_test.
y_pred = text_clf.predict(X_test)

# Fraction of test reviews classified correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

# scikit-learn convention: rows are true labels, columns are predicted labels.
print("\nConfusion Matrix:")
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

# Display names are paired with the sorted label values (0 then 1).
print("\nClassification Report:")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=['Negative', 'Positive'],
    )
)



--- Model Evaluation ---
Accuracy: 0.8000

Confusion Matrix:
[[83 13]
 [27 77]]

Classification Report:
              precision    recall  f1-score   support

    Negative       0.75      0.86      0.81        96
    Positive       0.86      0.74      0.79       104

    accuracy                           0.80       200
   macro avg       0.81      0.80      0.80       200
weighted avg       0.81      0.80      0.80       200



In [29]:
# ---  Save the Model ---
print("\nSaving the trained pipeline to 'sentiment_model.pkl'...")
# Persist the whole fitted pipeline (vectorizer + classifier) as one file in
# the current working directory.
# joblib files are pickle-based: only load them back from trusted sources.
joblib.dump(text_clf, 'sentiment_model.pkl')
print("Model saved successfully !")


Saving the trained pipeline to 'sentiment_model.pkl'...
Model saved successfully !
