Sentiments Analyisis for Restaurant Review

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import resample


In [None]:
# Load the dataset
df = pd.read_csv("Restaurant_Reviews (1).csv")


In [None]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['Review'], df['Liked'], test_size=0.2, random_state=42)


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Balance the dataset (optional)
positive_reviews = df[df['Liked'] == 1]
negative_reviews = df[df['Liked'] == 0]

In [None]:
# Upsample the minority class (negative reviews) to balance the dataset
negative_reviews_upsampled = resample(negative_reviews, replace=True, n_samples=len(positive_reviews), random_state=42)


In [None]:
# Combine the balanced data
balanced_df = pd.concat([positive_reviews, negative_reviews_upsampled])


In [None]:
# Split the balanced dataset
X_train_balanced, X_test_balanced, y_train_balanced, y_test_balanced = train_test_split(
    balanced_df['Review'],
    balanced_df['Liked'],
    test_size=0.2,
    random_state=42
)

In [None]:
# Create a TF-IDF vectorizer with n-grams
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))


In [None]:
# Transform the text data into TF-IDF features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_balanced)
X_test_tfidf = tfidf_vectorizer.transform(X_test_balanced)


In [None]:
# Train a Logistic Regression model with hyperparameter tuning
model_lr = LogisticRegression(max_iter=1000, C=1.0)  # You can experiment with different values of C
model_lr.fit(X_train_tfidf, y_train_balanced)


In [None]:
# Predict sentiments on the test set
y_pred_lr = model_lr.predict(X_test_tfidf)


In [None]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test_balanced, y_pred_lr) * 100 ,"%")
print(classification_report(y_test_balanced, y_pred_lr))


Accuracy: 88.0 %
              precision    recall  f1-score   support

           0       0.90      0.87      0.88       104
           1       0.86      0.90      0.88        96

    accuracy                           0.88       200
   macro avg       0.88      0.88      0.88       200
weighted avg       0.88      0.88      0.88       200

