In [102]:
import pandas as pd
import numpy as np
import string 
import json
import re
from spellchecker import SpellChecker
from nltk.corpus import stopwords as nltk_stopwords
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras.layers import Dense, Flatten, Dropout, Conv2D, MaxPooling2D
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import ndcg_score
from sklearn.model_selection import GroupShuffleSplit
import time

# Создание обучающей и валидационной выборки

In [103]:
# Load the training dataset from a CSV file located relative to the current working directory.
df = pd.read_csv('dataset_train.csv', encoding='utf-8')

In [104]:
# Show the column labels of the loaded frame.
df.columns


Index(['text', 'comments', 'score', 'words_num', 'mistakes_num', 'letters_num',
       'stopwords_num', 'stopwords_percent_num', 'upper_words_num',
       'unique_num', 'unique_percent_num', 'words_mean_len', 'punct_num',
       'first_capital_num', 'url_num', 'cos_sim', 'toxic', 'insult', 'obscene',
       'threat', 'identity_hate', 'positive'],
      dtype='object')

In [105]:
def create_score(x):
    """Binarize a score value.

    Returns 1 when ``x`` compares equal to 0, 1 or 2, and 0 for anything else.
    """
    return int(x in (0, 1, 2))

In [106]:
# Binary target: 1 for scores 0, 1 or 2 and 0 otherwise (see create_score).
df['score_bin'] = df['score'].map(create_score)

In [107]:
# Build group ids: every run of GROUP_SIZE consecutive rows forms one group
# (rows 0-4 -> group 0, rows 5-9 -> group 1, ...).
GROUP_SIZE = 5
n_groups, leftover = divmod(len(df), GROUP_SIZE)
if leftover:
    # Fail early with a clear message rather than with a length-mismatch
    # error raised by the column assignment.
    raise ValueError(
        f"Row count {len(df)} is not a multiple of {GROUP_SIZE}; "
        "cannot assign complete groups."
    )
# Vectorised equivalent of concatenating [k] * GROUP_SIZE for each group k.
group = np.repeat(np.arange(n_groups), GROUP_SIZE)
df['group'] = group

In [108]:
# Columns carried through without scaling: the group id, the raw and binarized
# targets, and two columns ('cos_sim', 'positive') that are later used as
# model inputs as they are.
df_cat = df.loc[:, ['group', 'score', 'score_bin', 'cos_sim', 'positive']]

# Numeric columns that are standardized in a later cell.
df_numerics = df.loc[:, [
    'words_num', 'mistakes_num', 'letters_num', 'stopwords_num',
    'stopwords_percent_num', 'upper_words_num', 'unique_num',
    'unique_percent_num', 'words_mean_len', 'punct_num',
    'first_capital_num', 'url_num',
]]

In [109]:
# Standardize the numeric columns with a StandardScaler.
# NOTE(review): the scaler is fitted on every row of df_numerics, i.e. before
# the train/test split performed further down, so held-out rows contribute to
# the scaling statistics. Consider fitting on the training rows only.
numeric_scaler = StandardScaler()
scaled_values = numeric_scaler.fit_transform(df_numerics)
df_numerics_scaled = pd.DataFrame(
    scaled_values,
    index=df_numerics.index,
    columns=df_numerics.columns,
)
# Put the unscaled columns and the scaled ones side by side in one frame.
df_scaled = pd.concat([df_cat, df_numerics_scaled], axis=1)

In [110]:
# Single group-aware shuffle split with a 0.30 test share; the 'group' column
# supplies the group labels. `gss` is the generator returned by split().
group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.30, random_state=7)
gss = group_splitter.split(df_scaled, groups=df_scaled['group'])

In [111]:
# Take the (train, test) index pair from the split generator; the indices are
# used positionally (via .iloc) below.
# Gotcha: `gss` was created with n_splits=1, so running this cell a second time
# without recreating `gss` raises StopIteration.
X_train_inds, X_test_inds = next(gss)

In [112]:
# Columns that are identifiers or targets and must not be fed to the model.
non_feature_cols = ['group', 'score', 'score_bin']

# Training rows in shuffled order. The shuffle is seeded (same seed as the
# group split) so that a re-run reproduces the same row order.
df_train = df_scaled.iloc[X_train_inds].sample(frac=1, random_state=7)
features_train = df_train.drop(columns=non_feature_cols)
target_train = df_train[['score_bin']]

# Held-out rows are left in their original order.
# NOTE(review): later cells reshape predictions and targets to (-1, 5), which
# presumably relies on the rows of each group staying adjacent, so do not
# shuffle this frame.
df_test = df_scaled.iloc[X_test_inds]
features_test = df_test.drop(columns=non_feature_cols)
target_test = df_test[['score_bin']]
# Raw (non-binarized) scores of the held-out rows, used for the ranking metric.
target_test_rank = df_test[['score']]

In [113]:
# Sanity check: report the shapes of the training features and training target.
print(f'Size features_train:{features_train.shape}\nSize target_train:{target_train.shape}')

Size features_train:(308370, 14)
Size target_train:(308370, 1)


# Создание модели

In [114]:
# Fully connected binary classifier: three tanh hidden layers followed by a
# single sigmoid output unit.
Nnum1 = 128       # width of the first hidden layer
actfun = 'tanh'   # activation shared by the hidden layers
model_NN = keras.Sequential([
    Dense(Nnum1, activation=actfun),
    # Integer division: `units` must be an integer, whereas `/` yields 64.0.
    Dense(Nnum1 // 2, activation=actfun),
    Dense(Nnum1 // 2, activation=actfun),
    Dense(1, activation='sigmoid')
])

In [115]:
# Configure training: binary cross-entropy loss with the Adam optimizer.
model_NN.compile(loss='binary_crossentropy', optimizer='adam')

In [116]:
# Train the network and report how long fit() took.
# perf_counter() is used because it is a monotonic clock intended for
# measuring elapsed intervals; time.time() can jump if the system clock is
# adjusted.
# NOTE(review): the held-out split is also passed as validation data here.
# NOTE(review): the saved output of this cell shows 5 epochs while the code
# requests 10, so the output looks stale; re-run to refresh it.
start = time.perf_counter()
his = model_NN.fit(
    features_train, target_train,
    batch_size=500,
    epochs=10,
    validation_data=(features_test, target_test),
)
end = time.perf_counter()
print("The time of execution of above program is :", (end-start))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
The time of execution of above program is : 11.850647926330566


In [117]:
# Run the trained model on the held-out features; the network ends in a single
# sigmoid unit, so there is one predicted value per input row.
res = model_NN.predict(features_test)



In [118]:
# Regroup the flat predictions into rows of 5 values.
# NOTE(review): assumes every 5 consecutive held-out rows belong to the same
# group. Confirm against how the test frame is built.
res = np.reshape(res, (-1, 5))

In [120]:
# Raw scores of the held-out rows, laid out as rows of 5 values
# (the same layout as the reshaped predictions in `res`).
y_true = np.array(target_test_rank).reshape(-1, 5)

array([[0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       ...,
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4]], dtype=int64)

# Перевести вероятности в числа (0, 1, 2, 3, 4)

In [121]:
# Convert each row of predicted probabilities into ranks 0..width-1:
# rank 0 goes to the largest value in the row, and equal values are ranked by
# position (the earlier column gets the smaller rank).
# A stable double argsort replaces the pure-Python pairwise comparison loops
# and works for any row width instead of a hard-coded 5.
# Assumes the predictions contain no NaN values.
num = len(res)
# Column indices of each row ordered from largest to smallest value.
descending_order = np.argsort(-np.asarray(res), axis=1, kind='stable')
# Inverting that permutation gives, for every column, its position in the order.
k = np.argsort(descending_order, axis=1).tolist()


In [122]:
# Turn the per-row rank lists into a NumPy array for the metric computed below.
res_rank = np.array(k)

In [124]:
# NDCG between the true scores and the predicted ranks of the held-out groups.
# NOTE(review): ndcg_score treats larger values as "more relevant" (y_true) and
# "ranked first" (y_score). Confirm that this orientation matches the intended
# meaning of `score`, where create_score maps 0-2 to the positive class.
ndcg_score(y_true, res_rank)

0.8848911531481145