In [1]:
import pandas as pd
import numpy as np
import string 
import json
import re
from spellchecker import SpellChecker
from nltk.corpus import stopwords as nltk_stopwords
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras.layers import Dense, Flatten, Dropout, Conv2D, MaxPooling2D
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import ndcg_score

import seaborn as sns
from catboost import CatBoostClassifier, Pool

In [2]:
# Read the training set from a CSV file located in the working directory.
df = pd.read_csv(
    'dataset_train.csv',
    encoding='utf-8',
)

In [3]:
def create_score(x):
    """Binarise a score: return 1 when `x` equals 0, 1 or 2, otherwise 0."""
    return 1 if x in (0, 1, 2) else 0

In [4]:
# Build the binary score column from the raw score, element by element.
df['score_bin'] = df['score'].map(create_score)

In [5]:
# Tag every row with the id of its block of 5 consecutive rows:
# rows 0-4 get group 0, rows 5-9 get group 1, and so on.
# NOTE(review): presumably each block is one post with its 5 comments — confirm
# against the dataset description.
n_rows = df.shape[0]
if n_rows % 5 != 0:
    # Fail with an explicit message instead of pandas' generic length-mismatch error.
    raise ValueError(f'Expected the row count to be a multiple of 5, got {n_rows}')
df['group'] = np.arange(n_rows) // 5


In [6]:
# Pick 50000 distinct group ids for the CatBoost training split.
# A seeded generator makes the split identical on every Restart & Run All.
# With replace=False the call raises ValueError if fewer than 50000 groups exist.
split_rng = np.random.default_rng(42)
index = split_rng.choice(df.shape[0] // 5, size=50000, replace=False)

In [7]:
# The raw text columns are not among the model features selected below, so remove them.
df = df.drop(columns=['text', 'comments'])

In [8]:
# Training rows: every row whose group id was drawn into `index`.
train = df.loc[df['group'].isin(index)]

In [9]:
# Hold-out rows: all groups that were not drawn into `index`.
test = df.loc[~df['group'].isin(index)]

In [10]:
# Shuffle the training rows; the fixed random_state makes the order reproducible.
train = train.sample(frac=1, random_state=42)

In [11]:
# Columns fed to the CatBoost model, declared once and reused for both splits.
feature_columns = [
    'words_num', 'mistakes_num', 'letters_num', 'stopwords_num',
    'stopwords_percent_num', 'upper_words_num', 'unique_num',
    'unique_percent_num', 'words_mean_len', 'punct_num',
    'first_capital_num', 'url_num', 'cos_sim', 'positive',
]
# Selecting with a list keeps the target as a one-column DataFrame.
target_columns = ['score_bin']

features_train = train[feature_columns]
target_train = train[target_columns]

features_test = test[feature_columns]
target_test = test[target_columns]

In [12]:
# Binary classifier trained with log-loss; verbose=True prints the loss per iteration.
model_CatBoost = CatBoostClassifier(
    verbose=True,
    loss_function='Logloss',
)

In [13]:
# Train the classifier on the shuffled training split.
model_CatBoost.fit(X=features_train, y=target_train)

Learning rate set to 0.108857
0:	learn: 0.6810153	total: 185ms	remaining: 3m 4s
1:	learn: 0.6713432	total: 215ms	remaining: 1m 47s
2:	learn: 0.6636056	total: 243ms	remaining: 1m 20s
3:	learn: 0.6576373	total: 271ms	remaining: 1m 7s
4:	learn: 0.6528487	total: 303ms	remaining: 1m
5:	learn: 0.6491569	total: 335ms	remaining: 55.5s
6:	learn: 0.6461628	total: 369ms	remaining: 52.4s
7:	learn: 0.6436639	total: 403ms	remaining: 49.9s
8:	learn: 0.6417200	total: 437ms	remaining: 48.1s
9:	learn: 0.6400326	total: 468ms	remaining: 46.3s
10:	learn: 0.6387608	total: 499ms	remaining: 44.9s
11:	learn: 0.6377254	total: 530ms	remaining: 43.6s
12:	learn: 0.6367831	total: 561ms	remaining: 42.6s
13:	learn: 0.6360336	total: 592ms	remaining: 41.7s
14:	learn: 0.6355187	total: 623ms	remaining: 40.9s
15:	learn: 0.6350397	total: 655ms	remaining: 40.3s
16:	learn: 0.6346448	total: 682ms	remaining: 39.4s
17:	learn: 0.6343324	total: 713ms	remaining: 38.9s
18:	learn: 0.6340781	total: 742ms	remaining: 38.3s
19:	learn: 0

<catboost.core.CatBoostClassifier at 0x1e8bacbe1f0>

In [14]:
# Show the per-feature importances of the fitted model as a table.
feature_importance = model_CatBoost.get_feature_importance(prettified=True)
feature_importance

Unnamed: 0,Feature Id,Importances
0,letters_num,12.381422
1,punct_num,10.745053
2,positive,9.067429
3,cos_sim,9.000296
4,words_mean_len,7.898503
5,stopwords_percent_num,7.352241
6,unique_num,7.075839
7,first_capital_num,6.527184
8,unique_percent_num,6.025194
9,stopwords_num,5.734346


In [15]:
# Class probabilities for the hold-out rows. Column 1 is the one used in later
# cells — presumably the probability of score_bin == 1; confirm the class order.
preds_prob = model_CatBoost.predict_proba(features_test)

In [16]:
# Keep the second probability column and lay it out as one row per group of 5.
probabilities = preds_prob[:, 1].reshape(-1, 5)


In [17]:
# Convert each row of 5 probabilities into ranks 0..4, where 0 marks the
# highest probability in the row and 4 the lowest.
# The stable sort keeps the tie rule of the pairwise comparison it replaces:
# among equal probabilities the earlier column gets the smaller rank.
# NOTE(review): assumes the probabilities contain no NaN — confirm.
num = len(probabilities)
descending_order = np.argsort(-probabilities, axis=1, kind='stable')
# The argsort of a permutation is its inverse: the position of each column in
# the descending order, i.e. its rank.
k = np.argsort(descending_order, axis=1)

In [18]:
# Hold the rank table as a NumPy array for the metric call.
res_rank = np.asarray(k)

In [19]:
# True scores of the hold-out rows, one row per group of 5 like the rank matrix.
y_true = test['score'].to_numpy().reshape(-1, 5)

In [20]:
# NDCG of the predicted ranks against the true scores.
# NOTE(review): ndcg_score orders items by descending y_score, so rank 4 (the
# lowest probability) comes first — confirm this matches the meaning of `score`.
ndcg_score(y_true=y_true, y_score=res_rank)

0.8830129125102807

# Обучение нейросети с добавленным признаком


In [21]:
# Add the CatBoost column-1 probability of every hold-out row as a new feature.
# Dropping a column left over from a previous run keeps the cell re-runnable
# (DataFrame.insert raises ValueError when the column already exists). drop()
# also returns a new frame, so insert no longer mutates a frame filtered from `df`.
test = test.drop(columns='cat_boost_res', errors='ignore')
test.insert(loc=13, column='cat_boost_res', value=preds_prob[:, 1])

In [22]:
# Independent copy of the hold-out frame without the old group ids.
df_for_nn = test.drop(columns=['group']).copy()

In [23]:
# Renumber the groups of the neural-network frame from 0: every block of 5
# consecutive rows shares one id.
n_rows_nn = df_for_nn.shape[0]
if n_rows_nn % 5 != 0:
    # Fail with an explicit message instead of pandas' generic length-mismatch error.
    raise ValueError(f'Expected the row count to be a multiple of 5, got {n_rows_nn}')
df_for_nn['group'] = np.arange(n_rows_nn) // 5

In [24]:
# Number of 5-row groups available for the neural-network split.
num_train = len(df_for_nn) // 5

In [25]:
# Draw 70% of the group ids (without repeats) for neural-network training.
# A seeded generator makes the split identical on every Restart & Run All.
nn_split_rng = np.random.default_rng(42)
index_nn = nn_split_rng.choice(num_train, size=int(0.7 * num_train), replace=False)

In [26]:
# True for rows whose group was drawn at random into `index_nn`.
in_train_nn = df_for_nn['group'].isin(index_nn)
# Training sample (random groups).
train_nn = df_for_nn.loc[in_train_nn]
# Validation sample (the remaining groups).
test_nn = df_for_nn.loc[~in_train_nn]

In [27]:
# Report the shape (rows, columns) of the training and validation frames.
print(f'train_nn size: {train_nn.shape}\ntest_nn size: {test_nn.shape}')

train_nn size: (133370, 18)
test_nn size: (57165, 18)


In [28]:
# Network inputs: the 14 hand-crafted features plus the CatBoost probability.
feature_columns_nn = [
    'words_num', 'mistakes_num', 'letters_num', 'stopwords_num',
    'stopwords_percent_num', 'upper_words_num', 'unique_num',
    'unique_percent_num', 'words_mean_len', 'punct_num',
    'first_capital_num', 'url_num', 'cos_sim', 'positive', 'cat_boost_res',
]
# Selecting with a list keeps the target as a one-column DataFrame.
target_columns_nn = ['score_bin']

features_train_nn = train_nn[feature_columns_nn]
target_train_nn = train_nn[target_columns_nn]

features_test_nn = test_nn[feature_columns_nn]
target_test_nn = test_nn[target_columns_nn]

# Создание модели

In [29]:
# Fully connected binary classifier: 128 -> 64 -> 64 -> 1 units, ReLU in the
# hidden layers and a sigmoid on the output.
Nnum1 = 128
actfun = 'relu'
model_NN = keras.Sequential([
    Dense(Nnum1, activation=actfun),
    # Integer division: `Nnum1/2` is the float 64.0, while Dense documents
    # `units` as a positive integer.
    Dense(Nnum1 // 2, activation=actfun),
    Dense(Nnum1 // 2, activation=actfun),
    Dense(1, activation='sigmoid')
])

In [30]:
# Binary cross-entropy matches the single sigmoid output; Adam is the optimiser.
model_NN.compile(
    loss='binary_crossentropy',
    optimizer='adam',
)

In [31]:
# Train for 20 epochs in batches of 500 rows, validating on the held-out groups.
# NOTE(review): the original note said the model input is a 5x15 matrix (a block
# of one post and its 5 comments). As written, fit() receives the 2-D feature
# frame, so each row of 15 features is passed as a separate sample — confirm intent.
his = model_NN.fit(
    x=features_train_nn,
    y=target_train_nn,
    batch_size=500,
    epochs=20,
    validation_data=(features_test_nn, target_test_nn),
)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [32]:
# Sigmoid outputs of the network for every validation row.
res = model_NN.predict(x=features_test_nn)



# Перевести вероятности в числа (0, 1, 2, 3, 4)

In [33]:
# Lay the predictions out as one row per group of 5.
res = res.reshape(-1, 5)

In [34]:
# Convert each row of 5 network outputs into ranks 0..4, where 0 marks the
# highest output in the row and 4 the lowest.
# The stable sort keeps the tie rule of the pairwise comparison it replaces:
# among equal outputs the earlier column gets the smaller rank.
# NOTE(review): assumes the predictions contain no NaN — confirm.
num = len(res)
descending_order_nn = np.argsort(-res, axis=1, kind='stable')
# The argsort of a permutation is its inverse: the position of each column in
# the descending order, i.e. its rank.
k = np.argsort(descending_order_nn, axis=1)


In [35]:
# Hold the network's rank table as a NumPy array for the metric call.
res_rank = np.asarray(k)

In [36]:
# True scores of the validation rows, one row per group of 5 like the rank matrix.
y_true_nn = test_nn['score'].to_numpy().reshape(-1, 5)

In [37]:
# NDCG of the network's predicted ranks against the true scores.
# NOTE(review): ndcg_score orders items by descending y_score, so rank 4 (the
# lowest output) comes first — confirm this matches the meaning of `score`.
ndcg_score(y_true=y_true_nn, y_score=res_rank)

0.8861578659017847