In [5]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import string
import re

In [6]:
nltk.download('stopwords')
nltk.download('wordnet')

# Load dataset
file_path = '/content/Restaurant_Reviews 1.tsv'
df = pd.read_csv(file_path, delimiter='\t', quoting=3)

# Enhanced Preprocessing
lemmatizer = WordNetLemmatizer()
corpus = []
for i in range(len(df)):
    review = re.sub(pattern='[^a-zA-Z]', repl=' ', string=df['Review'][i])
    review = review.lower()
    review_words = review.split()
    review_words = [word for word in review_words if word not in set(stopwords.words('english'))]
    review = ' '.join([lemmatizer.lemmatize(word) for word in review_words])
    corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [8]:
# Using TfidfVectorizer with fine-tuned parameters
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1, 3), min_df=5, max_df=0.7)
x = tfidf.fit_transform(corpus).toarray()
y = df.iloc[:, -1].values

# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=104)

model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(x_train, y_train)

# Make predictions
y_pred = model.predict(x_test)

# Calculate and print accuracy score
print('Logistic Regression Accuracy:', accuracy_score(y_test, y_pred))


Logistic Regression Accuracy: 0.79


In [9]:
# Confusion Matrix
confusion_matrix(y_test, y_pred)

array([[89, 15],
       [27, 69]])