<a href="https://colab.research.google.com/github/PigStep/Restourant-Sentimental-Analys-ML-based/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [55]:
import kagglehub

# Fetch the latest published version of the dataset through kagglehub and
# keep the returned location in `path`.
DATASET_HANDLE = "hj5992/restaurantreviews"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/restaurantreviews


In [78]:
import pandas as pd
import numpy as np

In [58]:
from pathlib import Path

# Build the file location from the directory kagglehub returned instead of a
# hard-coded cache path: the download location differs between environments
# (local cache, Colab, Kaggle), so an absolute path only works on one machine.
reviews_file = Path(path) / "Restaurant_Reviews.tsv"

# The file is tab-separated.
dataset = pd.read_csv(reviews_file, sep="\t")

In [60]:
# Sanity check: (rows, columns) of the loaded frame.
dataset.shape

(1000, 2)

In [61]:
# Count the reviews per `Liked` label to check how balanced the classes are.
dataset.groupby("Liked").agg({"Liked":"count"})

Unnamed: 0_level_0,Liked
Liked,Unnamed: 1_level_1
0,500
1,500


# Data preparation

In [79]:
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    """Normalise a raw review string.

    The text is lowercased, then every character that is neither an ASCII
    letter nor whitespace (digits, punctuation, apostrophes, ...) is removed.

    Args:
        text: The raw review as a ``str``.

    Returns:
        The lowercased string containing only letters and whitespace.
    """
    lowered = text.lower()
    # Keep letters and whitespace only; everything else is deleted outright,
    # so "it's" becomes "its" rather than "it s".
    return re.sub(r'[^a-zA-Z\s]', '', lowered)

def tokenizeTxt(text):
  """Tokenize ``text`` with NLTK's ``word_tokenize`` and return the result."""
  return word_tokenize(text)

def remove_stopwords(tokens):
  """Drop English stop words from ``tokens`` while keeping negation words.

  Args:
      tokens: Iterable of string tokens.

  Returns:
      A list of the tokens whose case-folded form is not in NLTK's English
      stop-word list. Negation words are excluded from that list so they
      survive the filter.
  """
  # Negations carry sentiment ("not good"), so they must not be filtered out.
  kept_negations = {"no", "not", "never", "none", "nobody", "neither", "nor"}
  blocked = set(stopwords.words('english')).difference(kept_negations)

  return [token for token in tokens if token.casefold() not in blocked]

def lemmatize(tokens):
  """Lemmatize every token with WordNet, treating each one as a verb.

  Args:
      tokens: Iterable of string tokens.

  Returns:
      A list with one lemma per input token, in the same order.
  """
  to_lemma = WordNetLemmatizer().lemmatize
  # pos='v' applies verb lemmatization to every token, e.g. 'was' -> 'be'.
  return [to_lemma(token, pos='v') for token in tokens]

def merge_negative_tokens(tokens):
  """Glue each negation word to the token that follows it.

  A negation followed by another token becomes a single
  ``"<negation>_<next>"`` token (``["not", "good"]`` -> ``["not_good"]``) and
  the following token is consumed. A negation in the last position has
  nothing to attach to and is kept unchanged.

  Args:
      tokens: Sequence of string tokens.

  Returns:
      A new list of tokens with negation pairs merged.
  """
  negation_words = {"no", "not", "never", "none", "nobody", "neither", "nor"}
  merged = []
  total = len(tokens)
  pos = 0

  while pos < total:
    current = tokens[pos]
    if current in negation_words and pos + 1 < total:
      # Consume the negation together with its successor.
      merged.append(f"{current}_{tokens[pos + 1]}")
      pos += 2
    else:
      merged.append(current)
      pos += 1
  return merged

In [84]:
def preprocessText(text):
  """Run the full preprocessing pipeline on one review.

  Steps, in order: clean the raw text, tokenize it, remove stop words,
  lemmatize, and merge negations with the token that follows them.

  Args:
      text: The raw review string.

  Returns:
      The processed tokens joined into a single space-separated string.
  """
  tokens = tokenizeTxt(clean_text(text))
  # Token-level stages; each takes a token list and returns a new one.
  for stage in (remove_stopwords, lemmatize, merge_negative_tokens):
    tokens = stage(tokens)
  return ' '.join(tokens)

# Make sure the NLTK resources used by the preprocessing helpers are present:
# tokenizer models for word_tokenize ("punkt", or "punkt_tab" on newer NLTK
# releases), the stop-word lists, and WordNet for the lemmatizer. Without them
# a fresh runtime fails with LookupError. The calls are cheap no-ops when the
# data is already installed.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet", "omw-1.4"):
  nltk.download(resource, quiet=True)

# Store the preprocessed text of every review in a new `tokens` column.
dataset["tokens"] = dataset["Review"].apply(preprocessText)



In [85]:
# Preview the first rows, including the `tokens` column.
dataset.head()

Unnamed: 0,Review,Liked,tokens
0,Wow... Loved this place.,1,wow love place
1,Crust is not good.,0,crust not_good
2,Not tasty and the texture was just nasty.,0,not_tasty texture nasty
3,Stopped by during the late May bank holiday of...,1,stop late may bank holiday rick steve recommen...
4,The selection on the menu was great and so wer...,1,selection menu great price


# Model creation

In [86]:
from sklearn.model_selection import train_test_split

# Разделение на train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Logistic regression

In [87]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# TF-IDF vectorization
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),  # Check with bigrams
    max_features=5000,
    stop_words='english'  # Adiitional filtration
)
X = tfidf.fit_transform(dataset['tokens'])
y = dataset['Liked']

Baseline model predictions

In [88]:
# Hyper-parameters of the baseline classifier.
logreg_params = dict(
    C=1.0,                    # regularization strength
    penalty='l2',             # L2 regularization
    solver='liblinear',       # optimizer suited to small datasets
    class_weight='balanced',  # class balancing
)
# `fit` returns the estimator itself, so `model` is the trained classifier.
model = LogisticRegression(**logreg_params).fit(X_train, y_train)

# Model evaluation on the held-out split.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.89      0.80        96
           1       0.87      0.70      0.78       104

    accuracy                           0.79       200
   macro avg       0.80      0.79      0.79       200
weighted avg       0.80      0.79      0.79       200

