In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline

We are working on text data to identify and classify the sentiment based on user review. For text analysis, we need various preprocessing steps like removing stop words, lemmatization. Thus, nltk is a preferred library for the task.

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

# Data Loading

In [5]:
# Step 1: Load the dataset
data = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', encoding='utf-8')
# added delimiter='\t' to specify tab as the delimiter

In [8]:
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [9]:
# Generate summary statistics for the 'messages'
data.describe()

Unnamed: 0,Liked
count,1000.0
mean,0.5
std,0.50025
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [12]:
type(stopwords.words('english'))

list

# Text Preprocessing

In [13]:
# Step 2: Text Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [14]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

In [15]:
# Apply preprocessing to the 'review' column
data['processed_review'] = data['Review'].apply(preprocess_text)

In [16]:
data

Unnamed: 0,Review,Liked,processed_review
0,Wow... Loved this place.,1,wow loved place
1,Crust is not good.,0,crust good
2,Not tasty and the texture was just nasty.,0,tasty texture nasty
3,Stopped by during the late May bank holiday of...,1,stopped late may bank holiday rick steve recom...
4,The selection on the menu was great and so wer...,1,selection menu great price
...,...,...,...
995,I think food should have flavor and texture an...,0,think food flavor texture lacking
996,Appetite instantly gone.,0,appetite instantly gone
997,Overall I was not impressed and would not go b...,0,overall impressed would go back
998,"The whole experience was underwhelming, and I ...",0,whole experience underwhelming think well go n...


In [18]:
data[['Review','processed_review']].iloc[0]

Unnamed: 0,0
Review,Wow... Loved this place.
processed_review,wow loved place


In [19]:
# Step 3: Split the dataset into training and testing sets
X = data['processed_review']
y = data['Liked']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# Step 4: Create a pipeline with TfidfVectorizer and Naive Bayes
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000, min_df=5, max_df=0.7)),
    ('classifier', MultinomialNB())
])

In [21]:
# Step 5: Train the model
pipeline.fit(X_train, y_train)

In [22]:
# Step 6: Make predictions on the test set
y_pred = pipeline.predict(X_test)

In [23]:
# Step 7: Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.72

Confusion Matrix:
[[76 20]
 [36 68]]

Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.79      0.73        96
           1       0.77      0.65      0.71       104

    accuracy                           0.72       200
   macro avg       0.73      0.72      0.72       200
weighted avg       0.73      0.72      0.72       200



In [24]:
# Step 8: Test with new reviews
new_reviews = [
        "The food was terrible and service was slow!",
        "Amazing experience, loved the ambiance!",
        "Worst restaurant ever! Never coming back."
]

In [25]:
# Preprocess new reviews
processed_new_reviews = [preprocess_text(review) for review in new_reviews]

# Make predictions
new_predictions = pipeline.predict(processed_new_reviews)

for review, sentiment in zip(new_reviews, new_predictions):
    print(f"Review: {review}")
    print(f"Predicted Sentiment: {sentiment}\n")

Review: The food was terrible and service was slow!
Predicted Sentiment: 0

Review: Amazing experience, loved the ambiance!
Predicted Sentiment: 1

Review: Worst restaurant ever! Never coming back.
Predicted Sentiment: 0

