In [495]:
# Импортируем все необходимые библиотеки и задаем сид для рандомизатора
import pandas as pd
import numpy as np
import string
import pickle
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import recall_score, precision_recall_curve, confusion_matrix, accuracy_score, classification_report, precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
nltk.download('punkt_tab')

from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [496]:
# Загрузка данных
data = pd.read_csv('data/general_good_BG.csv')
data = data.sort_values(by='assessment')
data = data[0:300]
data.shape

(300, 2)

In [497]:
data.head(5)

Unnamed: 0,review,assessment
18,Ужасное качество исполнения. Отвратительное. Н...,0
19,"Бочонки маленькие, фишки обычный картон. Не по...",0
28,"Детям игра понравилась, но от любопытства реши...",0
30,"Игра длится минут 15-20, никакого азарта. Зака...",0
31,"Вопросов по доставке, получению заказа, и даже...",0


In [498]:
# Количество позитивных (1) и отрицательных отзывов (0)
data['assessment'].value_counts()

assessment
1    154
0    146
Name: count, dtype: int64

In [499]:
X_train, X_test, y_train, y_test = train_test_split(data['review'], data['assessment'], test_size = 0.25, random_state = 15)
y_train.value_counts()

assessment
1    117
0    108
Name: count, dtype: int64

In [500]:
y_test.value_counts()

assessment
0    38
1    37
Name: count, dtype: int64

In [501]:
#Предобработка текста
snowball = SnowballStemmer(language = "russian")
russian_stop_words = stopwords.words("russian")

def tokenize_sentence(sentence: str, remove_stop_words: bool = True):
    tokens = word_tokenize(sentence, language = "russian")
    tokens = [i for i in tokens if i not in string.punctuation]
    if remove_stop_words:
        tokens = [i for i in tokens if i not in russian_stop_words]
    tokens = [snowball.stem(i) for i in tokens]
    return tokens

In [502]:
# Создаем словарь с наиболее часто встречаемыми словами
processed = data["review"]
processed = processed.apply(lambda x: " ".join(tokenize_sentence(x,  remove_stop_words = True)))
processed

18     ужасн качеств исполнен отвратительн нельз испо...
19     бочонк маленьк фишк обычн картон не понрав пок...
28     дет игр понрав любопытств реш вручн откр глаз ...
30     игр длит минут 15-20 никак азарт заканчива вык...
31     вопрос доставк получен заказ упаковк возникл и...
                             ...                        
274      игр отличн игра все сем дета крепк совет покупк
273    игр отличн все сем игра качеств хорош тольк ма...
272    игр отличн коробочк хотел получш картон потолщ...
303    игр понрав игра вмест внучк по очеред собира р...
302              игр понрав шайб очен лета дет довольны.
Name: review, Length: 300, dtype: object

In [503]:
all_words = []
for text in processed:
    words = word_tokenize(text)
    for w in words:
        all_words.append(w)

all_words = nltk.FreqDist(all_words)

# Print the result
print("Number of words: {}".format(len(all_words)))
print("Most common words: {}".format(all_words.most_common(15)))
word_features = [x[0] for x in all_words.most_common(2000)]

Number of words: 2398
Most common words: [('игр', 265), ('.', 107), ('очен', 94), ('игра', 91), ('эт', 86), ('сам', 71), ('интересн', 67), ('``', 66), ('ребенк', 64), ('прост', 58), ('качеств', 54), ('так', 53), ('куп', 47), ('дет', 47), ('котор', 44)]


In [504]:
# Функция для нахождения фич в тексте
def find_features(text):
    words = word_tokenize(text)
    features = {}
    for word in word_features:
        features[word] = word in words

    return features

In [None]:
# Обучение модели логистической регрессии
vectorizer = TfidfVectorizer(tokenizer = lambda x: tokenize_sentence(x,  remove_stop_words = True), token_pattern=None)
features = vectorizer.fit_transform(X_train)
logreg_model = LogisticRegression(random_state = 0 )
logreg_model.fit(features, y_train)

In [506]:
X = vectorizer.fit_transform(X_train)
y_pred = logreg_model.predict(X)

In [507]:
# Проверка правильности модели на конкретном примере
logreg_model.predict(features[40])

array([0])

In [508]:
X_train.iloc[40]

'Ужасное качество фишек , древесина (загатовка ) из такого дешёвого материала , что аж сыпятся сами цифры. А самое обидно что не хватало фишки под номер 8'

In [509]:
logreg_model_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer = lambda x: tokenize_sentence(x, remove_stop_words=True), token_pattern=None)),
    ("model", LogisticRegression(random_state = 0 ))])
logreg_model_pipeline.fit(X_train, y_train)

In [510]:
# Получаем метрики точности
y_pred = logreg_model_pipeline.predict(X_test)
pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=[["actual", "actual"], ["negative", "positive"]],
    columns=[["predicted", "predicted"], ["negative", "positive"]],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,negative,positive
actual,negative,30,8
actual,positive,5,32


In [511]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.8266666666666667
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.79      0.82        38
           1       0.80      0.86      0.83        37

    accuracy                           0.83        75
   macro avg       0.83      0.83      0.83        75
weighted avg       0.83      0.83      0.83        75



In [512]:
# Сохранение модели со словарем фич и функцие нахождения их в тексте
path = 'models\\naive_bayes_classifier.pickle'

with open(path, 'wb') as classifier_file:
    data_for_save = {
        'model': logreg_model,
        'features': word_features,
        'function': find_features,
        }
    pickle.dump(data_for_save, classifier_file)

In [513]:
#f = open('models\\naive_bayes_classifier.pickle', 'rb')
#sd = pickle.load(f)
#print(sd)

In [514]:
# Обучение байесовского классификатора
mulnb_model = MultinomialNB()
mulnb_model.fit(features, y_train)
mulnb_model_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(tokenizer = lambda x: tokenize_sentence(x, remove_stop_words=True), token_pattern=None)),
    ("model", MultinomialNB())])

In [515]:
mulnb_model_pipeline.fit(X_train, y_train)

In [516]:
y_pred_B = mulnb_model_pipeline.predict(X_test)
pd.DataFrame(
    confusion_matrix(y_test, y_pred_B),
    index=[["actual", "actual"], ["negative", "positive"]],
    columns=[["predicted", "predicted"], ["negative", "positive"]],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,negative,positive
actual,negative,30,8
actual,positive,7,30


In [517]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.8266666666666667
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.79      0.82        38
           1       0.80      0.86      0.83        37

    accuracy                           0.83        75
   macro avg       0.83      0.83      0.83        75
weighted avg       0.83      0.83      0.83        75



In [518]:
# Сохранение модели со словарем фич и функцие нахождения их в тексте
path = 'models\\logistic_regression_classifier.pickle'

with open(path, 'wb') as classifier_file:
    data_for_save = {
        'model': mulnb_model,
        'features': word_features,
        'function': find_features,
        }
    pickle.dump(data_for_save, classifier_file)

In [519]:
# Если мы хотим найти 95% негативных комментариев, то...
precision_score(y_test, y_pred)

0.8

In [520]:
recall_score(y_test, y_pred)

0.8648648648648649

In [521]:
prec, rec, thresholds = precision_recall_curve(y_test, probas_pred=mulnb_model_pipeline.predict_proba(X_test)[:, 1])



In [522]:
np.where(prec > 0.95)

(array([69, 70, 71, 72, 73, 74]),)

In [523]:
thresholds[72]

np.float64(0.8205245200025932)

In [524]:
pd.DataFrame(
    confusion_matrix(y_test, mulnb_model_pipeline.predict_proba(X_test)[:, 1] > thresholds[36]),
    index=[["actual", "actual"], ["negative", "positive"]],
    columns=[["predicted", "predicted"], ["negative", "positive"]],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,negative,positive
actual,negative,31,7
actual,positive,7,30
