In [192]:
import pandas as pd
import numpy as np
import string 
import json
import re
import pickle
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras.layers import Dense, Flatten, Dropout, Conv2D, MaxPooling2D
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import ndcg_score
import time
import seaborn as sns
from xgboost import XGBRanker
from xgboost import plot_importance
from sklearn.model_selection import GroupShuffleSplit

In [193]:
df = pd.read_csv('dataset_toxic_cos_pn.csv', encoding='utf-8')

In [194]:
df = df.drop(['text', 'comments'], axis=1)

In [195]:
group = []
for k in range(df.shape[0]//5):
    group += [k]*5
df['group'] = np.array(group)

In [196]:
df_cat = df[['group', 'score', 'cos_sim', 'positive']]
df_numerics = df[['words_num', 'mistakes_num', 'letters_num', 'stopwords_num',
       'stopwords_percent_num', 'upper_words_num', 'unique_num',
       'unique_percent_num', 'words_mean_len', 'punct_num',
       'first_capital_num', 'url_num']]

In [197]:
numeric_scaler = StandardScaler()
# Fit & transform train set
df_numerics_scaled = numeric_scaler.fit_transform(df_numerics)
df_numerics_scaled = pd.DataFrame(df_numerics_scaled, columns=df_numerics.columns, index=df_numerics.index)

In [198]:
df_scaled = pd.concat([df_cat, df_numerics_scaled], axis=1)

In [199]:
gss = GroupShuffleSplit(test_size=0.30, n_splits=1, random_state = 7).split(df_scaled, groups=df_scaled['group'])

In [200]:
X_train_inds, X_test_inds = next(gss)

In [201]:
df_train= df_scaled.iloc[X_train_inds]
features_train = df_train.loc[:, ~df_train.columns.isin(['group','score'])]
target_train = df_train.loc[:, df_train.columns.isin(['score'])]

df_test= df_scaled.iloc[X_test_inds]
# сохренение идентификаторов групп для предсказаний
features_test = df_test.loc[:, ~df_test.columns.isin(['score'])]
target_test = df_test.loc[:, df_test.columns.isin(['score'])]

In [202]:
groups = df_train.groupby('group').size().to_frame('size')['size'].to_numpy()

In [203]:
model_ranker = XGBRanker(objective='rank:ndcg', 
                  n_estimators=150,
                  learning_rate=0.1,
                  importance_type='weight', 
                  random_state=42)
             

In [204]:
print(f'Size features_train:{features_train.shape}\nSize target_train:{target_train.shape}')

Size features_train:(308370, 14)
Size target_train:(308370, 1)


In [205]:
start = time.time()
model_ranker.fit(features_train, target_train, group=groups, verbose=True)
end = time.time()
print("The time of execution of above program is :", (end-start))

The time of execution of above program is : 17.668813467025757


In [206]:
feature = np.reshape(np.array(features_train.columns), (features_train.shape[1], -1))
importance = np.reshape(model_ranker.feature_importances_, (features_train.shape[1], -1))*100
feature_importance = np.concatenate((feature, importance), axis=1)

In [207]:
importance = pd.DataFrame(feature_importance, columns=['feature', 'importance']).sort_values('importance', ascending=False)

In [208]:
def predict(model, df):
    return model.predict(df.loc[:, ~df.columns.isin(['group'])])
  

In [209]:
start = time.time()
predictions = (features_test.groupby('group')
               .apply(lambda x: predict(model_ranker, x)))
end = time.time()
print("The time of execution of above program is :", (end-start))

The time of execution of above program is : 81.96629738807678


In [210]:
res = np.array(predictions)

In [211]:
num = len(res)
k = [[0]*5 for i in range(num)]
for i in range(num):
    for j in range(4):
        for l in range(1, 5-j):
            if res[i][j] > res[i][j+l]:
                k[i][j] += 1
            else:
                k[i][j+l] += 1

In [212]:
y_res = np.array(k)

In [213]:
y = np.reshape(np.array(target_test), (-1, 5))

In [214]:
y

array([[0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       ...,
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4]], dtype=int64)

In [215]:
from sklearn.metrics import ndcg_score
ndcg_score(y, y_res)

0.8855334694966525