<a href="https://colab.research.google.com/github/PigStep/Restourant-Sentimental-Analys-ML-based/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [90]:
import kagglehub

# Fetch the most recent version of the Kaggle dataset; `path` receives the
# value returned by kagglehub, printed below as the location of the files.
dataset_handle = "hj5992/restaurantreviews"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/restaurantreviews


In [91]:
import pandas as pd
import numpy as np

In [92]:
import os

# Build the file location from the directory returned by kagglehub rather than
# a hard-coded cache path: the download cell reports
# /kaggle/input/restaurantreviews, so a fixed /root/.cache/... path is not
# guaranteed to exist on a fresh kernel.
dataset = pd.read_csv(os.path.join(path, "Restaurant_Reviews.tsv"), sep="\t")

In [93]:
# Sanity check of the loaded frame: (rows, columns).
dataset.shape

(1000, 2)

In [94]:
# Count the reviews per "Liked" label to check the class balance.
dataset.groupby("Liked").agg({"Liked":"count"})

Unnamed: 0_level_0,Liked
Liked,Unnamed: 1_level_1
0,500
1,500


# Data preparation

In [95]:
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# NLTK resources used by the helpers below, downloaded in the same order as before.
for resource in ('punkt_tab', 'stopwords', 'wordnet'):
  nltk.download(resource)

def clean_text(text):
    """Lowercase ``text`` and drop every character that is not a letter or whitespace.

    Removed characters are deleted outright (not replaced by a space), so
    "don't" becomes "dont" and digits/punctuation vanish.

    Args:
        text: The raw review string.

    Returns:
        The lowercased string containing only ASCII letters and whitespace.
    """
    lowered = text.lower()
    # letters and whitespace only
    return re.sub(r'[^a-zA-Z\s]', '', lowered)

def tokenizeTxt(text):
  """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
  return word_tokenize(text)

def remove_stopwords(tokens):
  """Drop English stop words from ``tokens`` while keeping negation words.

  The NLTK English stop-word list is used, minus a fixed set of negations
  ("no", "not", "never", ...), which are kept in the output. Each token is
  compared case-insensitively via ``casefold``.

  Args:
    tokens: Iterable of string tokens.

  Returns:
    A new list with the stop words removed, original order preserved.
  """
  kept_negations = {"no", "not", "never", "none", "nobody", "neither", "nor"}
  blocked = set(stopwords.words('english')).difference(kept_negations)

  survivors = []
  for token in tokens:
    if token.casefold() in blocked:
      continue
    survivors.append(token)
  return survivors

def lemmatize(tokens):
  """Lemmatize every token as a verb with WordNet (e.g. 'was' -> 'be').

  Args:
    tokens: Iterable of string tokens.

  Returns:
    A list with one lemma per input token, in the same order.
  """
  wordnet = WordNetLemmatizer()
  result = []
  for token in tokens:
    # pos='v' makes the lemmatizer treat the token as a verb
    result.append(wordnet.lemmatize(token, pos='v'))
  return result

def merge_negative_tokens(tokens):
  """Glue each negation word to the token that follows it.

  A negation ("no", "not", "never", ...) followed by another token becomes a
  single "<negation>_<next>" token, and the following token is consumed. A
  negation in the last position has nothing to attach to and is kept as is.

  Args:
    tokens: Sequence of string tokens.

  Returns:
    A new list of tokens with negations merged into their successors.
  """
  negation_words = {"no", "not", "never", "none", "nobody", "neither", "nor"}
  merged = []
  total = len(tokens)
  idx = 0

  while idx < total:
    current = tokens[idx]
    has_successor = idx + 1 < total
    if current in negation_words and has_successor:
      # consume the negation together with the token right after it
      merged.append(f"{current}_{tokens[idx + 1]}")
      idx += 2
    else:
      merged.append(current)
      idx += 1
  return merged

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [96]:
def preprocessText(text):
  """Run the full cleaning chain on one review and return a single string.

  Steps, in order: clean_text, tokenizeTxt, remove_stopwords, lemmatize,
  merge_negative_tokens. The resulting tokens are joined with single spaces.

  Args:
    text: The raw review string.

  Returns:
    The processed review as a space-separated string of tokens.
  """
  token_stages = (tokenizeTxt, remove_stopwords, lemmatize, merge_negative_tokens)

  current = clean_text(text)
  for stage in token_stages:
    current = stage(current)
  return ' '.join(current)

# Apply the text pipeline to every review. Despite its name, `df` holds the
# result of applying preprocessText to the single "Review" column.
df = dataset["Review"].apply(preprocessText)
df.head()

Unnamed: 0,Review
0,wow love place
1,crust not_good
2,not_tasty texture nasty
3,stop late may bank holiday rick steve recommen...
4,selection menu great price


# Model creation

In [97]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# TF-IDF vectorization
# NOTE(review): the vectorizer is fitted on the whole corpus here, before the
# train/test split performed in the following cell, so the vocabulary and IDF
# weights also see the held-out reviews. Consider fitting on the training
# portion only (e.g. inside a Pipeline).
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),  # Unigrams and bigrams
    max_features=5000,
    stop_words='english'  # Additional filtering on top of the custom stop-word removal
)
X = tfidf.fit_transform(df)
y = dataset['Liked']

## Logistic regression

In [98]:
from sklearn.model_selection import train_test_split

# Split into train/test sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Base model predictions

In [99]:
# Baseline logistic regression on the TF-IDF features.
model = LogisticRegression(
    C=1.0,                # Inverse regularization strength
    penalty='l2',         # L2 regularization
    solver='liblinear',   # Optimizer suited to small datasets
    class_weight='balanced'  # Class balancing
)
model.fit(X_train, y_train)

# Model evaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.89      0.80        96
           1       0.87      0.70      0.78       104

    accuracy                           0.79       200
   macro avg       0.80      0.79      0.79       200
weighted avg       0.80      0.79      0.79       200



### Feature Grid Search

In [100]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [101]:
def run_grid_search(param_grid, estimator=None):
  """Grid-search a TF-IDF + classifier pipeline on the raw reviews.

  Runs a 5-fold cross-validated search optimizing weighted F1, prints the best
  parameters and score, and returns the fitted search so the caller can inspect
  ``best_estimator_`` or ``cv_results_`` instead of losing them.

  Args:
    param_grid: Mapping (or list of mappings) from pipeline parameter names
      (``tfidf__...`` / ``clf__...``) to the candidate values to try.
    estimator: Optional classifier used for the ``clf`` step. Defaults to
      ``LogisticRegression(class_weight='balanced')``, the original behavior.

  Returns:
    The fitted ``GridSearchCV`` instance.
  """
  if estimator is None:
    estimator = LogisticRegression(class_weight='balanced')

  # NOTE(review): TfidfVectorizer() is used with its defaults here, whereas the
  # final pipelines in this notebook pass preprocessor=preprocessText, so the
  # search is tuned on differently prepared text. Confirm this is intended.
  pipeline = Pipeline([
      ('tfidf', TfidfVectorizer()),
      ('clf', estimator)
  ])

  grid_search = GridSearchCV(
      pipeline,
      param_grid,
      cv=5,
      scoring='f1_weighted',    # Optimizing by f1
      n_jobs=-1,
      verbose=1
  )

  grid_search.fit(dataset['Review'], dataset['Liked'])

  print("Best params:", grid_search.best_params_)
  print("Best F1-score:", grid_search.best_score_)
  return grid_search

In [102]:
# Successive, progressively narrower search grids for the logistic-regression
# pipeline. Each grid overwrites `param_grid`; the search calls are commented
# out and the results recorded from earlier runs are kept next to them.
param_grid = {
      'tfidf__ngram_range': [(1, 1), (1, 2)],  # Unigrams only, or unigrams plus bigrams
      'tfidf__max_features': [1000, 5000],
      'clf__C': [0.1, 1.0, 10.0],
      'clf__penalty': ['l1', 'l2'],             # Type of regularization
      'clf__solver': ['liblinear', 'saga']      # Optimization algorithm
  }

# run_grid_search(param_grid)
# Best params: {'clf__C': 10.0, 'clf__penalty': 'l2', 'clf__solver': 'saga', 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 2)}
# Best F1-score: 0.8128993073820239

param_grid = {
      'tfidf__ngram_range': [(1, 2), (1, 3)],
      'tfidf__max_features': [5000, 7000, 10000],
      'clf__C': [10.0, 15.00, 25.00],
  }
# run_grid_search(param_grid)
# Best params: {'clf__C': 25.0, 'tfidf__max_features': 7000, 'tfidf__ngram_range': (1, 2)}
# Best F1-score: 0.8149600538151471

param_grid = {
      'tfidf__ngram_range': [(1, 2)],
      'tfidf__max_features': [6000, 7000, 8000],
      'clf__C': [25.00, 35.00, 50.00],
  }
# run_grid_search(param_grid)
# Best params: {'clf__C': 35.0, 'tfidf__max_features': 7000, 'tfidf__ngram_range': (1, 2)}
# Best F1-score: 0.8199671874750051

# Finest grid: max_features in steps of 100, C in steps of 5.
param_grid = {
      'tfidf__ngram_range': [(1, 2)],
      'tfidf__max_features': range(6000,8000,100),
      'clf__C': range(25,50,5),
  }
# run_grid_search(param_grid)
# Best params: {'clf__C': 30, 'tfidf__max_features': 6600, 'tfidf__ngram_range': (1, 2)}
# Best F1-score: 0.8219659874750981

In [103]:
# Rank the baseline model's coefficients to see which n-grams push a review
# towards the positive or the negative class.
feature_names = tfidf.get_feature_names_out()
coefs = model.coef_[0]
weighted_terms = list(zip(coefs, feature_names))
top_positive = sorted(weighted_terms, reverse=True)[:10]
top_negative = sorted(weighted_terms)[:10]

listings = (
    ("Top-10 positive n-grams:", top_positive),
    ("Top-10 negative n-grams:", top_negative),
)
for position, (heading, ranked) in enumerate(listings):
    if position:
        print('')  # blank line between the two listings
    print(heading)
    for score, word in ranked:
        print(f"{word}: {score:.2f}")

Top-10 positive n-grams:
great: 3.26
good: 2.90
delicious: 2.00
love: 1.69
amaze: 1.45
awesome: 1.40
friendly: 1.31
nice: 1.21
excellent: 1.00
wont disappoint: 0.98

Top-10 negative n-grams:
dont: -1.42
bad: -1.28
minutes: -1.10
worst: -1.01
not_good: -1.01
wont: -0.95
wasnt: -0.95
probably: -0.88
bland: -0.86
slow: -0.86


In [108]:
import os

# Final logistic-regression pipeline: preprocessing happens inside the
# vectorizer, so the pipeline accepts raw review strings.
pipeline = Pipeline([
      ('tfidf', TfidfVectorizer(
          preprocessor=preprocessText,
          ngram_range=(1, 2),
          max_features=6600
      )),
      ('clf', LogisticRegression(
          C=30,
          penalty="l2",
          solver="saga",
          class_weight='balanced',
          random_state=42  # saga shuffles the data; fix the seed so the fit is repeatable
          ))
  ])

# Reload the raw reviews for the pipeline. X and y were previously bound to the
# TF-IDF matrix, so they are reset to the raw text and labels here. The file is
# located through `path` (returned by kagglehub) instead of a hard-coded cache
# directory, which is not guaranteed to exist on a fresh kernel.
dataset = pd.read_csv(os.path.join(path, "Restaurant_Reviews.tsv"), sep="\t")
X = dataset['Review']
y = dataset['Liked']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.84      0.81        96
           1       0.85      0.79      0.82       104

    accuracy                           0.81       200
   macro avg       0.82      0.82      0.81       200
weighted avg       0.82      0.81      0.82       200



In [110]:
import joblib

# Define the filename for your pipeline
filename = 'restaurant_review_pipelineLR.joblib'

# Save the pipeline to the file
# NOTE(review): the pipeline holds a reference to preprocessText (passed as the
# vectorizer's preprocessor); presumably the code that loads this file must make
# the same function available under the same name — confirm before deploying.
joblib.dump(pipeline, filename)

print(f"Pipeline saved to {filename}")

Pipeline saved to restaurant_review_pipelineLR.joblib


## SVM

In [111]:
from sklearn.svm import SVC

# Linear-kernel SVM using the same TF-IDF settings as the logistic-regression pipeline.
svm_vectorizer = TfidfVectorizer(
    preprocessor=preprocessText,
    ngram_range=(1, 2),
    max_features=6600,
)
pipeline = Pipeline([
    ('tfidf', svm_vectorizer),
    ('clf', SVC(kernel='linear')),
])

# Same 80/20 split with seed 42 as used for the previous model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

pipeline.fit(X_train, y_train)
y_pred_svm = pipeline.predict(X_test)

# Report precision/recall/F1 for the SVM on the held-out split
print("SVM Model Classification Report:")
print(classification_report(y_test, y_pred_svm))


SVM Model Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.88      0.82        96
           1       0.87      0.76      0.81       104

    accuracy                           0.81       200
   macro avg       0.82      0.82      0.81       200
weighted avg       0.82      0.81      0.81       200



In [113]:
def run_grid_searchSVM(param_grid):
  """Grid-search a TF-IDF + SVC pipeline on the raw reviews.

  Runs a 5-fold cross-validated search optimizing weighted F1, prints the best
  parameters and score, and returns the fitted search so the caller can inspect
  ``best_estimator_`` or ``cv_results_`` instead of losing them.

  Args:
    param_grid: Mapping (or list of mappings) from pipeline parameter names
      (``tfidf__...`` / ``clf__...``) to the candidate values to try.

  Returns:
    The fitted ``GridSearchCV`` instance.
  """
  # NOTE(review): TfidfVectorizer() is used with its defaults here, whereas the
  # final pipelines in this notebook pass preprocessor=preprocessText, so the
  # search is tuned on differently prepared text. Confirm this is intended.
  pipeline = Pipeline([
      ('tfidf', TfidfVectorizer()),
      ('clf', SVC())
  ])

  grid_search = GridSearchCV(
      pipeline,
      param_grid,
      cv=5,
      scoring='f1_weighted',    # Optimizing by f1
      n_jobs=-1,
      verbose=1
  )

  grid_search.fit(dataset['Review'], dataset['Liked'])

  print("Best params:", grid_search.best_params_)
  print("Best F1-score:", grid_search.best_score_)
  return grid_search

In [125]:
# Successive, progressively narrower search grids for the SVM pipeline. Each
# grid overwrites `param_grid_svm`; the search calls are commented out and the
# results recorded from earlier runs are kept next to them.
param_grid_svm = {
    'clf__C': [0.1, 1, 10, 100],
    'clf__gamma': [1, 0.1, 0.01, 0.001],
    'clf__kernel': ['rbf', 'linear'],
    'tfidf__ngram_range': [(1,1), (1,2)],
    'tfidf__max_features': [1000, 5000]
}
# run_grid_searchSVM(param_grid_svm)
# Best params: {'clf__C': 100, 'clf__gamma': 0.1, 'clf__kernel': 'rbf', 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 2)}
# Best F1-score: 0.8209770675991559

param_grid_svm = {
    'clf__C': [50, 70, 100],
    'clf__gamma': [0.5, 0.1, 0.05],
    'clf__kernel': ['rbf', 'linear'],
    'tfidf__ngram_range': [(1,2)],
    'tfidf__max_features': [3000, 5000, 7000]
}
# run_grid_searchSVM(param_grid_svm)
# Best params: {'clf__C': 50, 'clf__gamma': 0.5, 'clf__kernel': 'rbf', 'tfidf__max_features': 7000, 'tfidf__ngram_range': (1, 2)}
# Best F1-score: 0.827973134694221

param_grid_svm = {
    'clf__C': [30, 50, 75],
    'clf__gamma': [0.8, 0.5, 0.3],
    'clf__kernel': ['rbf'],
    'tfidf__ngram_range': [(1,2)],
    'tfidf__max_features': [5000, 7000, 9000]
}
# run_grid_searchSVM(param_grid_svm)
# Best params: {'clf__C': 30, 'clf__gamma': 0.5, 'clf__kernel': 'rbf', 'tfidf__max_features': 7000, 'tfidf__ngram_range': (1, 2)}
# Best F1-score: 0.827973134694221

# Finest grid: C in steps of 5, gamma in steps of 0.1, max_features in steps of 100.
param_grid_svm = {
    'clf__C': range(10,30,5),
    'clf__gamma': np.arange(0.3,0.8,0.1),
    'clf__kernel': ['rbf'],
    'tfidf__ngram_range': [(1,2)],
    'tfidf__max_features': range(6000,8000,100)
}
# run_grid_searchSVM(param_grid_svm)
# Best params: {'clf__C': 10, 'clf__gamma': np.float64(0.5), 'clf__kernel': 'rbf', 'tfidf__max_features': 6600, 'tfidf__ngram_range': (1, 2)}
# Best F1-score: 0.8289823188556905

Fitting 5 folds for each of 400 candidates, totalling 2000 fits
Best params: {'clf__C': 10, 'clf__gamma': np.float64(0.5), 'clf__kernel': 'rbf', 'tfidf__max_features': 6600, 'tfidf__ngram_range': (1, 2)}
Best F1-score: 0.8289823188556905


In [127]:
import os

# Final RBF-kernel SVM pipeline: preprocessing happens inside the vectorizer,
# so the pipeline accepts raw review strings.
pipeline = Pipeline([
      ('tfidf', TfidfVectorizer(
          preprocessor=preprocessText,
          ngram_range=(1, 2),
          max_features=6600
      )),
      ('clf', SVC(
          C=10,
          gamma=0.5,
          kernel="rbf",
          ))
  ])

# Reload the raw reviews for the pipeline. The file is located through `path`
# (returned by kagglehub) instead of a hard-coded cache directory, which is not
# guaranteed to exist on a fresh kernel.
dataset = pd.read_csv(os.path.join(path, "Restaurant_Reviews.tsv"), sep="\t")
X = dataset['Review']
y = dataset['Liked']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.84      0.81        96
           1       0.85      0.79      0.82       104

    accuracy                           0.81       200
   macro avg       0.82      0.82      0.81       200
weighted avg       0.82      0.81      0.82       200



In [128]:
# Define the filename for your pipeline
filename = 'restaurant_review_pipelineSVM.joblib'

# Save the pipeline to the file
# NOTE(review): the pipeline holds a reference to preprocessText (passed as the
# vectorizer's preprocessor); presumably the code that loads this file must make
# the same function available under the same name — confirm before deploying.
joblib.dump(pipeline, filename)

print(f"Pipeline saved to {filename}")

Pipeline saved to restaurant_review_pipelineSVM.joblib
