In [1]:
import requests
import re
import pymorphy2
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from tqdm import tqdm
from fake_useragent import UserAgent
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from sklearn.model_selection import train_test_split

In [2]:
ua = UserAgent(verify_ssl=False)
headers = {'User-Agent': ua.random}

In [3]:
pages = [
    'https://www.turpravda.com/tn/monastir/One_Resort_Aqua_Park___Spa-h47466.html',
    'https://www.turpravda.com/tn/monastir/magic_caribbean_monastir-h17529.html#reviews',
    'https://www.turpravda.com/tn/monastir/El_Mouradi_Skanes-h17330.html',
    'https://www.turpravda.com/tn/monastir/Delphin_El_Habib_Resort-h23475.html',
    'https://www.turpravda.com/tn/monastir/Liberty_Hotel-h12058.html',
    'https://www.turpravda.com/tn/monastir/Jokey_Club_Palm_Garden-h11867.html',
    'https://www.turpravda.com/tn/monastir/Le_Soleil_Bella_Vista_Resort_Hotel-h11735.html',
    'https://www.turpravda.com/tn/monastir/Thalassa_Village_Skanes-h15773.html',
    'https://www.turpravda.com/tn/monastir/palmyra_holiday_resort_spa-h12610.html',
    'https://www.turpravda.com/tn/port_el_kantaui/lti_bellevue_park-h9490.html',
    'https://www.turpravda.com/tn/mahdija/Iberostar_Royal_El_Mansour_Hotel-h25863.html',
    'https://www.turpravda.com/tn/suss/Royal_Kenz_Hotel_Thalasso___Spa-h16856.html',
    'https://www.turpravda.com/tn/hammamet/Club_President_Hotel-h33254.html',
    'https://www.turpravda.com/tn/port_el_kantaui/El_Hana_Hannibal_Palace-h11766.html',
    'https://www.turpravda.com/tn/monastir/Skanes_El_Hana-h17555.html',
    'https://www.turpravda.com/tn/nabeul/Dessole_Royal_Lido_Resort___Spa-h20035.html',
    'https://www.turpravda.com/tn/hammamet/Safa-h27759.html',
    'https://www.turpravda.com/tn/port_el_kantaui/Residence_Kantaoui-h26203.html',
    'https://www.turpravda.com/tn/nabeul/Dessole_Royal_Lido_Resort___Spa-h20035.html'
]

In [4]:
def get_comments(page_url):
    global df_comments
    
    session = requests.session()
    req = session.get(page_url, headers=headers)
    page = req.text
    soup = BeautifulSoup(page)
    
    for comment in soup.find_all('div', {'class': 'ans_body'}):
        mark = comment.find('span', {'class': 'value'})
        if mark:
            comment_text = comment.find('span', {'class': 'all-text'}).text
            mark = float(mark.text[-4:])
            df_comments = df_comments.append({'comment': comment_text, 
                                              'mark': mark,
                                              'url': page_url}, ignore_index=True)
          
    # return all_marks, all_comments

In [5]:
df_comments = pd.DataFrame(columns=['comment', 'mark', 'url'])
for page in pages:
    get_comments(page) 

In [6]:
df_comments['sentiment'] = df_comments['mark'].apply(lambda x: 1 if x > 5 else 0)
df_comments['sentiment'].value_counts()

1    85
0    72
Name: sentiment, dtype: int64

In [7]:
m = pymorphy2.MorphAnalyzer()
ru_words = re.compile(r'\b[А-Яа-я]+?\b')
sw = stopwords.words('russian')
def clean_text(text):
    tokens = []
    for word in word_tokenize(text):
        if ru_words.search(word):
            if word not in sw:
                tokens.append(m.parse(word.lower())[0].normal_form)
    return tokens

In [8]:
df_comments['tokens'] = df_comments['comment'].apply(clean_text)
df_comments['clean_comment'] = df_comments['tokens'].apply(lambda x: ' '.join(x))

In [24]:
X = df_comments['tokens']
y = df_comments['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=1)

In [10]:
mask_pos = y==1
positive_comments = X_train[mask_pos].tolist()
negative_comments = X_train[~mask_pos].tolist()

positive_corpus = []
negative_corpus = []

for t in positive_comments:
    positive_corpus.extend(t)
for t in negative_comments:
    negative_corpus.extend(t)

In [12]:
cnt_pos = Counter(positive_corpus).most_common(250)
cnt_neg = Counter(negative_corpus).most_common(250)

set_pos = set(dict(cnt_pos).keys())
set_neg = set(dict(cnt_neg).keys())
print('Only positive:')
print(set_pos-set_neg)
print('Only negative:')
print(set_neg-set_pos)

Only positive:
{'довольно', 'тёплый', 'минус', 'отличный', 'готовить', 'выбор', 'сусс', 'супер', 'менять', 'хотеться', 'але', 'що', 'этаж', 'дуже', 'интересный', 'ваш', 'вкусно', 'ждать', 'возле', 'египет', 'магазин', 'для', 'найти', 'внимание', 'з', 'песок', 'что-то', 'чаевой', 'прекрасный', 'достаточно', 'турция', 'шоу', 'вкусный', 'что', 'аэропорт', 'детский', 'туроператор', 'замечательный', 'взрослый', 'отдельный', 'блюдо', 'красивый', 'любой', 'кормить', 'це', 'сыр', 'обычный', 'кстати', 'главное', 'два', 'разный', 'вопрос', 'такси', 'ми', 'сильно', 'душа', 'креветка', 'дорога', 'написать', 'отношение', 'цена', 'бесплатный', 'конечно', 'готель', 'неделя', 'немного', 'там', 'настроение', 'рядом', 'английский', 'приятный', 'разнообразный', 'покупать', 'буть', 'оставить', 'кто', 'монастир', 'качество', 'тунисский', 'впечатление', 'небольшой', 'приятно', 'убрать', 'центр', 'постель', 'целое', 'чисто', 'язык', 'як'}
Only negative:
{'ужас', 'он', 'цвет', 'араб', 'понимать', 'к', 'искать

In [13]:
only_pos = set_pos - set_neg
only_neg = set_neg - set_pos
def sentiment(comments):
    cnt_pos = 0
    cnt_neg = 0
    result = []
    # comment_tokens = clean_text(comment)
    for i, comment in enumerate(comments):
        for token in comment:
            if token in only_pos:
                cnt_pos += 1
            elif token in only_neg:
                cnt_neg += 1
        if cnt_neg > cnt_pos:
            result.append(0)
        else:
            result.append(1)
        cnt_pos = 0
        cnt_neg = 0
    return result

y_pred = sentiment(X_test)

In [14]:
def accuracy_count(y_pred, y_test):
    cnt = 0
    for p, t in zip(y_pred, y_test):
        if p == t:
            cnt += 1
    return cnt / len(y_pred)

accuracy_count(y_pred, y_test)

0.875

In [15]:
X_comments = df_comments['clean_comment']
y_comments = df_comments['sentiment']
X_comments_train, X_comments_test, y_comments_train, y_comments_test = train_test_split(X_comments, y_comments, train_size=.8, random_state=1)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier

vec = CountVectorizer()
X_train_vec = vec.fit_transform(X_comments_train)
X_test_vec = vec.transform(X_comments_test)

knn = KNeighborsClassifier()
knn.fit(X_train_vec, y_comments_train)
y_preds = knn.predict(X_test_vec)

In [17]:
accuracy_count(y_preds, y_comments_test)

0.6875

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

vec = CountVectorizer()
X_train_vec = vec.fit_transform(X_comments_train)
X_test_vec = vec.transform(X_comments_test)

clf = LogisticRegression(solver='newton-cg')
clf.fit(X_train_vec, y_comments_train)
y_preds = clf.predict(X_test_vec)

In [19]:
accuracy_count(y_preds, y_comments_test)

0.8125

In [26]:
from gensim.models import Word2Vec

model = Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=1, workers=4)
X_train_wv = []
y_train_wv = []
empts = []
for i, c in enumerate(X_train):
    new_c = []
    for word in c:
        try:
            emb = model.wv[word]
            new_c.append(emb)
        except:
            empts.append(word)
    if len(new_c) != 0:
        y_train_wv.append(y_train.tolist()[i])
        X_train_wv.append(np.array(new_c).mean(axis=0))
    new_c = []

In [27]:
X_test_wv = []
y_test_wv = []
empts = []
for i, c in enumerate(X_test):
    new_c = []
    for word in c:
        try:
            emb = model.wv[word]
            new_c.append(emb)
        except:
            empts.append(word)
    if len(new_c) != 0:
        y_test_wv.append(y_test.tolist()[i])
        X_test_wv.append(np.array(new_c).mean(axis=0))
    new_c = []

In [28]:
accs = []
for i in range(1, 50):
    clf = KNeighborsClassifier(n_neighbors=i, weights='distance', p=2)
    clf.fit(X_train_wv, y_train_wv)
    y_preds = clf.predict(X_test_wv)
    accs.append(accuracy_count(y_preds, y_test))

In [29]:
max(accs)

0.6129032258064516

In [31]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

accs = []

for i in range(1, 10):
    vec = TfidfVectorizer(ngram_range=(1, i))
    X_train_vec = vec.fit_transform(X_comments_train)
    X_test_vec = vec.transform(X_comments_test)

    clf = GradientBoostingClassifier(n_estimators=70, max_depth=5)
    clf.fit(X_train_vec, y_train)
    y_preds = clf.predict(X_test_vec)
    accs.append(accuracy_count(y_preds, y_test))

In [32]:
np.argmax(accs), max(accs)

(2, 0.8125)

In [33]:
accs = []

for i in range(10, 100, 10):
    vec = TfidfVectorizer(ngram_range=(1, 6))
    X_train_vec = vec.fit_transform(X_comments_train)
    X_test_vec = vec.transform(X_comments_test)

    clf = GradientBoostingClassifier(n_estimators=i, max_depth=5)
    clf.fit(X_train_vec, y_train)
    y_preds = clf.predict(X_test_vec)
    accs.append(accuracy_count(y_preds, y_test))

In [34]:
np.argmax(accs), max(accs)

(4, 0.71875)

In [35]:
from catboost import CatBoostClassifier

accs = []
for i in range(1, 5):
    '''vec = CountVectorizer(ngram_range=(1, i))
    X_train_vec = vec.fit_transform(X_comments_train)
    X_test_vec = vec.transform(X_comments_test)'''

    vec = TfidfVectorizer(ngram_range=(1, i))
    X_train_vec = vec.fit_transform(X_comments_train)
    X_test_vec = vec.transform(X_comments_test)

    clf = CatBoostClassifier(iterations=70, max_depth=10, loss_function='Logloss', learning_rate=.1, verbose=False)
    clf.fit(X_train_vec, y_train)
    y_preds = clf.predict(X_test_vec)
    accs.append(accuracy_count(y_preds, y_test))

In [36]:
max(accs)

0.75

In [52]:
# 0.7307692307692307 при CountVectorizer
# iterations=70, max_depth=10, loss_function='Logloss', learning_rate=.1, verbose=False