In [1]:
# Импортируем все необходимые библиотеки и задаем сид для рандомизатора
import pandas as pd
import numpy as np
import string
import pickle
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import recall_score, precision_recall_curve, confusion_matrix, accuracy_score, classification_report, precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
nltk.download('punkt_tab')

from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Загрузка данных
df = pd.read_csv('geo-reviews-dataset-2023.csv')
df_1 = df.loc[df['label'] == 1]
df_2 = df.loc[df['label'] == 2]
df_3 = df.loc[df['label'] == 3]
df_4 = df.loc[df['label'] == 4]
df_5 = df.loc[df['label'] == 5]

data = pd.concat([df_1[0:5000], df_2[0:5000], df_3[0:5000], df_4[0:5000], df_5[0:5000]], ignore_index=True)
data = data.sample(frac=1).reset_index(drop=True) #перемешать
data.shape

(25000, 2)

In [3]:
data.head()

Unnamed: 0,review,label
0,"Хорошая гостиница, брали 2 номера блочных, общ...",4
1,- Из окна был шикарный вид. - Нормальный тихо ...,4
2,Останавливались в этом отеле с 9.07 по 11.07. ...,2
3,"Территория красивая, здание как будто теремок,...",1
4,При открытии счёта картавая сотрудница ввела в...,1


In [4]:
X_train, X_test, y_train, y_test = train_test_split(data['review'], data['label'], test_size = 0.25, random_state = 1)
y_train.value_counts()

label
2    3781
5    3772
3    3752
1    3750
4    3695
Name: count, dtype: int64

In [5]:
y_test.value_counts()

label
4    1305
1    1250
3    1248
5    1228
2    1219
Name: count, dtype: int64

In [6]:
#Предобработка текста
snowball = SnowballStemmer(language = "russian")
russian_stop_words = stopwords.words("russian")

def tokenize_sentence(sentence: str, remove_stop_words: bool = True):
    tokens = word_tokenize(sentence, language = "russian")
    tokens = [i for i in tokens if i not in string.punctuation]
    if remove_stop_words:
        tokens = [i for i in tokens if i not in russian_stop_words]
    tokens = [snowball.stem(i) for i in tokens]
    return tokens

In [7]:
# Создаем словарь с наиболее часто встречаемыми словами
processed = data["review"]
processed = processed.apply(lambda x: " ".join(tokenize_sentence(x,  remove_stop_words = True)))
processed

0        хорош гостиниц брал 2 номер блочн общ санузел ...
1        из окн шикарн вид нормальн тих работа конд хор...
2        останавлива отел 9.07 11.07 отел нов новострой...
3        территор красив здан теремок рек гор беседк от...
4        при открыт счет картав сотрудниц ввел заблужде...
                               ...                        
24995    мест красив интересн ест минус огроооомн очере...
24996    классн демонстрац хорош пленк очен понрав хват...
24997    рестора нрав уютн детская,2х уровневая.вкусн с...
24998    очен хорош санатор лучш крым питан уровн кухн ...
24999    ребенк стриг стригут отличн сам главн эт небол...
Name: review, Length: 25000, dtype: object

In [8]:
all_words = []
for text in processed:
    words = word_tokenize(text)
    for w in words:
        all_words.append(w)

all_words = nltk.FreqDist(all_words)

# Print the result
print("Number of words: {}".format(len(all_words)))
print("Most common words: {}".format(all_words.most_common(15)))
word_features = [x[0] for x in all_words.most_common(2000)]

Number of words: 55678
Most common words: [('очен', 12394), ('эт', 11315), ('в', 7062), ('``', 6718), ('мест', 5537), ('так', 5371), ('хорош', 5365), ('котор', 5227), ('все', 5177), ('сам', 4795), ('номер', 4767), ('магазин', 4550), ('цен', 4476), ('прост', 4323), ('вкусн', 4215)]


In [9]:
# Функция для нахождения фич в тексте
def find_features(text):
    words = word_tokenize(text)
    features = {}
    for word in word_features:
        features[word] = word in words

    return features

In [10]:
# Обучение модели логистической регрессии
vectorizer = TfidfVectorizer(tokenizer = lambda x: tokenize_sentence(x,  remove_stop_words = True), token_pattern=None)
features = vectorizer.fit_transform(X_train)
logreg_model = LogisticRegression(random_state = 0)
logreg_model.fit(features, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
X = vectorizer.fit_transform(X_train)
y_pred = logreg_model.predict(X)

In [12]:
# Проверка правильности модели на конкретном примере
logreg_model.predict(features[40])

array([4], dtype=int64)

In [13]:
X_train.iloc[40]

'Все соответствует цене. Обслуживание, питание, номера. Удобно что рядом пляж и остановка.'

In [14]:
logreg_model_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer = lambda x: tokenize_sentence(x, remove_stop_words=True), token_pattern=None)),
    ("model", LogisticRegression(random_state = 0 ))])
logreg_model_pipeline.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
# Получаем метрики точности
y_pred = logreg_model_pipeline.predict(X_test)
pd.DataFrame(
    confusion_matrix(y_test, y_pred)
)

Unnamed: 0,0,1,2,3,4
0,849,270,96,15,20
1,346,445,263,117,48
2,172,315,389,316,56
3,42,97,279,651,236
4,18,19,47,199,945


In [16]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.52464
Classification Report:
              precision    recall  f1-score   support

           1       0.59      0.68      0.63      1250
           2       0.39      0.37      0.38      1219
           3       0.36      0.31      0.34      1248
           4       0.50      0.50      0.50      1305
           5       0.72      0.77      0.75      1228

    accuracy                           0.52      6250
   macro avg       0.51      0.52      0.52      6250
weighted avg       0.51      0.52      0.52      6250



In [18]:
# Обучение байесовского классификатора
mulnb_model = MultinomialNB()
mulnb_model.fit(features, y_train)
mulnb_model_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer = lambda x: tokenize_sentence(x, remove_stop_words=True), token_pattern=None)),
    ("model", MultinomialNB())])

In [19]:
mulnb_model_pipeline.fit(X_train, y_train)

In [20]:
y_pred_B = mulnb_model_pipeline.predict(X_test)
pd.DataFrame(
    confusion_matrix(y_test, y_pred_B),
    #index=[["actual", "actual"], ["negative", "positive"]],
    #columns=[["predicted", "predicted"], ["negative", "positive"]],
)

Unnamed: 0,0,1,2,3,4
0,845,306,59,22,18
1,389,515,184,101,30
2,210,399,299,301,39
3,63,190,234,636,182
4,40,26,47,255,860


In [21]:
accuracy = accuracy_score(y_test, y_pred_B)
report = classification_report(y_test, y_pred_B)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.5048
Classification Report:
              precision    recall  f1-score   support

           1       0.55      0.68      0.60      1250
           2       0.36      0.42      0.39      1219
           3       0.36      0.24      0.29      1248
           4       0.48      0.49      0.49      1305
           5       0.76      0.70      0.73      1228

    accuracy                           0.50      6250
   macro avg       0.50      0.51      0.50      6250
weighted avg       0.50      0.50      0.50      6250

