In [0]:
# Не трожь, оно тебя сожрет!
!pip install catboost

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import catboost
sns.set()

from preprocessing import preprocess_customer, preprocess_reactions
from answering import choose_from_proba

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [0]:
path = '/content/'

cust = pd.read_csv(path + 'customer_test.csv')
transactions = pd.read_csv(path + 'transactions.csv')
react_train = pd.read_csv(path + 'stories_reaction_train.csv')
react_test = pd.read_csv(path + 'stories_reaction_test.csv')
descr = pd.read_csv(path + 'stories_description.csv')
sample = pd.read_csv(path + 'sample_submit.csv')

In [3]:
new_cust = preprocess_customer(cust, transactions, react_train, encodings=['mean', 'frequency', 'std'])
train, test = preprocess_reactions(react_train, react_test, encodings = ['mean', 'frequency', 'std'])

train = train.join(new_cust.set_index('customer_id'), on = 'customer_id')
test = test.join(new_cust.set_index('customer_id'), on = 'customer_id')

like, view, skip, dislike =  [1 3 2 0]


In [0]:
from sklearn.utils import shuffle

train = shuffle(train, random_state=42).reset_index(drop=True)

In [0]:
y = train['event']
train.drop('event', axis=1, inplace=True)

In [6]:
train.shape

(473141, 244)

In [0]:
num_all = train.shape[0]
first_level = int(0.6 * num_all)
second_level = int(0.8 * num_all)

X_first = train[:first_level]
X_second = train[first_level:second_level]
X_val = train[second_level:]

y_first = y[:first_level]
y_second = y[first_level:second_level]
y_val = y[second_level:]

In [0]:
train_meta = pd.DataFrame()
val_meta = pd.DataFrame()
test_meta = pd.DataFrame()

In [0]:
def add_preds(name, train_probas, val_probas, test_probas):
  for i, cl in enumerate(['_dislike', '_like', '_skip', '_view']):
    train_meta[name + cl] = train_probas[:,i]
    val_meta[name + cl] = val_probas[:,i]    
    test_meta[name + cl] = test_probas[:,i]    

In [0]:
def custom_metric(y_test, y_true):
  p_dislike = y_test[:, 0]
  p_like = y_test[:, 1]
  p_skip = y_test[:, 2]
  p_view = y_test[:, 3]
  exp = -10 * p_dislike - 0.1 * p_skip + 0.1 * p_view + 0.5 * p_like
  res = pd.Series(exp).apply(lambda x: 1 if x > 0 else -1)
  
  s = 0
  real = 0
  for i in range(res.shape[0]):
    if y_true[i] == 0:
      s -= res[i] * 10
      real += 10
    elif y_true[i] == 1:
      s += res[i] * 0.5
      real += 0.5
    elif y_true[i] == 2:
      s -= res[i] * 0.1
      real += 0.1
    else:
      s += res[i] * 0.1
      real += 0.1
  return s / real

## Catboost ##

In [12]:
model = catboost.CatBoostClassifier(
    loss_function='MultiClass',
    task_type='GPU',
    custom_loss=['Accuracy']
)

train_pool = catboost.Pool(data=X_first.values, label=y_first.values)

model.fit(train_pool, logging_level='Silent')

<catboost.core.CatBoostClassifier at 0x7fd28fbea7b8>

In [13]:
y_val_pred = model.predict(X_second)
accuracy_score(y_second, y_val_pred)

0.7149258147694129

In [14]:
custom_metric(model.predict_proba(X_second), y_second.values)

0.6787344884111665

In [0]:
train_probas = model.predict_proba(X_second)
val_probas = model.predict_proba(X_val)
test_probas = model.predict_proba(test)

add_preds('catboost', train_probas, val_probas, test_probas)

## xgboost ##

In [0]:
import xgboost as xgb

In [0]:
xgb_first = xgb.DMatrix(X_first.values, label=y_first.values)
xgb_second = xgb.DMatrix(X_second.values, label=y_second.values)
xgb_val = xgb.DMatrix(X_val.values, label=y_val.values)
xgb_test = xgb.DMatrix(test.values)

In [0]:
param = {'gpu_id' : 0, 'tree_method' : 'gpu_hist', 'max_bin' : 1024, 'objective' : 'multi:softmax', 'seed' : 42,
        'num_class' : 4}

tree_boosting = xgb.train(param, xgb_first, num_boost_round = 50)

In [19]:
y_val_pred = tree_boosting.predict(xgb_second)
accuracy_score(y_second, y_val_pred)

0.7141649406095447

In [20]:
from scipy.special import softmax

margins = tree_boosting.predict(xgb_second, output_margin=True)
probas = softmax(margins, axis=1)
custom_metric(probas, y_second.values)

0.6756394872404831

In [0]:
train_margins = tree_boosting.predict(xgb_second, output_margin=True)
train_probas = softmax(train_margins, axis=1)

val_margins = tree_boosting.predict(xgb_val, output_margin=True)
val_probas = softmax(val_margins, axis=1)

test_margins = tree_boosting.predict(xgb_test, output_margin=True)
test_probas = softmax(test_margins, axis=1)

add_preds('xgboost_boosting', train_probas, val_probas, test_probas)

## NN ##

In [0]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(np.concatenate([X_first, X_second, X_val, test.values]))

In [0]:
X_first_scaled = scaler.transform(X_first)
X_second_scaled = scaler.transform(X_second)
X_val_scaled = scaler.transform(X_val)
test_scaled = scaler.transform(test)

In [0]:
import tensorflow.keras as keras

In [0]:
y_first_cat = keras.utils.to_categorical(y_first)
y_second_cat = keras.utils.to_categorical(y_second)
y_val_cat = keras.utils.to_categorical(y_val)

In [26]:
model = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=(244)),
    keras.layers.Dense(256, activation='tanh'),
    keras.layers.Dense(128, activation='tanh'),
    keras.layers.Dense(64, activation='tanh'),
    keras.layers.Dense(16, activation='tanh'),
    keras.layers.Dense(4, activation='softmax'),
])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [27]:
model.compile('adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(X_first_scaled, y_first_cat, epochs=5, validation_data=(X_second_scaled, y_second_cat))

Train on 283884 samples, validate on 94628 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fd277251668>

In [28]:
custom_metric(model.predict(X_second_scaled), y_second.values)

0.6537257082660539

In [0]:
train_probas = model.predict(X_second_scaled)
val_probas = model.predict(X_val_scaled)
test_probas = model.predict(test_scaled)

add_preds('NN', train_probas, val_probas, test_probas)

In [0]:
train_meta.to_csv('train_meta.csv', index=False)
val_meta.to_csv('val_meta.csv', index=False)
test_meta.to_csv('test_meta.csv', index=False)

## Random Forest ##

In [0]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
model = RandomForestClassifier(100)
model.fit(X_first, y_first)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [16]:
y_val_pred = model.predict(X_second)
accuracy_score(y_second, y_val_pred)

0.7037663270913471

In [19]:
custom_metric(model.predict_proba(X_second), y_second.values)

0.5905525638034356

In [0]:
train_probas = model.predict_proba(X_second)
val_probas = model.predict_proba(X_val)
test_probas = model.predict_proba(test)

add_preds('random_forest', train_probas, val_probas, test_probas)

## Финальный бленд ##

In [24]:
model = catboost.CatBoostClassifier(
    loss_function='MultiClass',
    task_type='GPU',
    custom_loss=['Accuracy']
)

train_pool = catboost.Pool(data=train_meta.values, label=y_second.values)

model.fit(train_pool, logging_level='Silent')

<catboost.core.CatBoostClassifier at 0x7f10613a5048>

In [26]:
y_val_pred = model.predict(val_meta.values)
accuracy_score(y_val, y_val_pred)

0.7122552283126737

In [27]:
custom_metric(model.predict_proba(val_meta.values), y_val.values)

0.6751179508560535

In [0]:
sample['score'] = choose_from_proba(model, test_meta.values)

In [0]:
sample.to_csv('try_stacking.csv', index=False)

In [35]:
imp = zip(model.feature_importances_, test_meta.columns)
print(*sorted(imp, reverse=True), sep='\n')

(13.306554480773437, 'xgboost_boosting_dislike')
(12.109943534104733, 'xgboost_boosting_like')
(9.797578929818108, 'catboost_like')
(8.932699264562501, 'catboost_view')
(7.814698148453407, 'catboost_dislike')
(6.978581114058881, 'NN_dislike')
(6.7845021985819844, 'xgboost_boosting_view')
(6.546300247986658, 'NN_like')
(5.223641213063292, 'catboost_skip')
(4.309676719217337, 'random_forest_like')
(4.276191236511381, 'xgboost_boosting_skip')
(3.739239803386922, 'NN_view')
(3.1889435319115593, 'random_forest_view')
(2.8613245952725292, 'random_forest_skip')
(2.6498210473495676, 'NN_skip')
(1.4803039349477292, 'random_forest_dislike')
