In [33]:
import pandas as pd
import dask.dataframe as dd

In [116]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.sparse import csr_matrix
from sklearn.metrics import f1_score

from implicit import als

from random import choice

from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.feature_selection import SelectFromModel
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LogisticRegression

import warnings

warnings.filterwarnings("ignore")

### Подготовка данных:

In [35]:
# Read both CSVs with dask.dataframe and discard the 'Unnamed: 0' column
# (the earlier pandas variant consumed it via index_col='Unnamed: 0').
data_train = dd.read_csv('data_train.csv').drop(columns='Unnamed: 0')
data_test = dd.read_csv('data_test.csv').drop(columns='Unnamed: 0')

In [36]:
# Preview the first five training rows.
data_train.head(n=5)

Unnamed: 0,id,vas_id,buy_time,target
0,540968,8.0,1537131600,0.0
1,1454121,4.0,1531688400,0.0
2,2458816,1.0,1534107600,0.0
3,3535012,5.0,1535922000,0.0
4,1693214,1.0,1535922000,0.0


In [37]:
# shape[0] of a dask frame is deferred, so it has to be computed explicitly.
train_row_count = data_train.shape[0]
train_row_count.compute()

831653

In [38]:
# Materialise every id of the training set (duplicates are kept).
train_id = [customer_id for customer_id in data_train['id']]

In [39]:
# Sanity check: number of ids collected in train_id.
len(train_id)

831653

In [40]:
# Preview the first five test rows.
data_test.head(n=5)

Unnamed: 0,id,vas_id,buy_time
0,3130519,2.0,1548018000
1,2000860,4.0,1548018000
2,1099444,2.0,1546808400
3,1343255,5.0,1547413200
4,1277040,2.0,1546808400


In [41]:
# Print the test-set shape; the row count is deferred and must be computed.
test_row_count = data_test.shape[0].compute()
test_col_count = data_test.shape[1]
print('(', test_row_count, ',', test_col_count, ')')

( 71231 , 3 )


In [42]:
# Distinct ids present in the test set.
test_id = [customer_id for customer_id in data_test['id'].unique()]

In [43]:
# Tab-separated feature table; the 'Unnamed: 0' column is discarded.
features = dd.read_csv('features.csv', sep='\t').drop(columns='Unnamed: 0')

In [44]:
# pd_features = pd.DataFrame(features.compute())

In [45]:
# Preview the first five feature rows.
features.head(n=5)

Unnamed: 0,id,buy_time,0,1,2,3,4,5,6,7,...,243,244,245,246,247,248,249,250,251,252
0,2013026,1531688400,18.910029,46.980888,4.969214,-1.386798,3.791754,-14.01179,-16.08618,-65.076097,...,-977.373846,-613.770792,-25.996269,-37.630448,-301.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0
1,2014722,1539550800,36.690029,152.400888,448.069214,563.833202,463.841754,568.99821,-16.08618,-53.216097,...,-891.373846,-544.770792,-20.996269,48.369552,80.252276,-13.832889,-0.694428,-1.175933,-0.45614,0.0
2,2015199,1545598800,-67.019971,157.050888,-63.180786,178.103202,-68.598246,156.99821,3.51382,25.183903,...,-977.373846,-613.770792,-12.996269,-37.630448,10829.252276,-25.832889,-0.694428,-12.175933,-0.45614,0.0
3,2021765,1534107600,7.010029,150.200888,-6.930786,216.213202,76.621754,351.84821,-16.08618,-65.076097,...,-973.373846,-613.770792,-23.996269,-37.630448,-205.747724,-24.832889,-0.694428,-11.175933,-0.45614,1.0
4,2027465,1533502800,-90.439971,134.220888,-104.380786,153.643202,-109.798246,132.53821,-16.08618,-65.076097,...,1643.626154,2007.229208,206.003731,-21.630448,6667.252276,92.167111,-0.694428,49.824067,47.54386,0.0


In [14]:
# train_features = pd_features[pd_features['id'].isin(train_id)]

In [46]:
# Keep only the feature rows whose id occurs in the training set.
in_train = features['id'].isin(train_id)
train_features = features[in_train].reset_index(drop=True)

In [47]:
# Preview the feature rows retained for training ids.
train_features.head(n=5)

Unnamed: 0,id,buy_time,0,1,2,3,4,5,6,7,...,243,244,245,246,247,248,249,250,251,252
0,2046132,1534712400,300.820029,1599.480888,286.879214,1585.013202,281.461754,1563.90821,-16.08618,654.013903,...,-977.373846,-613.770792,-25.996269,-35.630448,-295.747724,-17.832889,-0.694428,-4.175933,-0.45614,0.0
1,2050810,1540760400,-86.209971,91.820888,-84.480786,110.333202,-89.898246,89.22821,-16.08618,-65.076097,...,-977.373846,-613.770792,-23.996269,190.369552,-286.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0
2,2070757,1540760400,-96.799971,-408.179112,-110.740786,-460.786798,-114.038246,-479.77179,-16.08618,-65.076097,...,-925.373846,-561.770792,-21.996269,-37.630448,-151.747724,-24.832889,0.305572,-12.175933,-0.45614,1.0
3,2071522,1544994000,-94.939971,-363.699112,-108.880786,-411.226798,-114.298246,-432.33179,-16.08618,-65.076097,...,-977.373846,-613.770792,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0
4,2075318,1533502800,-75.639971,669.690888,-89.580786,732.343202,-94.998246,736.65821,-16.08618,782.383903,...,-501.373846,-242.770792,-25.996269,-37.630448,-167.747724,-14.832889,2.305572,-4.175933,-0.45614,0.0


In [48]:
# Keep only the feature rows whose id occurs in the test set.
in_test = features['id'].isin(test_id)
test_features = features[in_test].reset_index(drop=True)

In [49]:
# Preview the (id, vas_id) pairs of the training set.
train_keys = data_train[['id', 'vas_id']].reset_index(drop=True)
train_keys.head(n=5)

Unnamed: 0,id,vas_id
0,540968,8.0
1,1454121,4.0
2,2458816,1.0
3,3535012,5.0
4,1693214,1.0


In [50]:
# Attach the offered service (vas_id) and the label (target) to each feature row.
# NOTE(review): the join key is `id` alone, so an id occurring more than once on
# either side produces every pairing of its rows, and the offer's own buy_time is
# not carried over — confirm whether buy_time should take part in the join.
train_labels = data_train[['id', 'vas_id', 'target']].reset_index(drop=True)
data_train_with_features = train_features.merge(train_labels, on='id')

In [51]:
# Preview the merged training frame.
data_train_with_features.head(n=5)

Unnamed: 0,id,buy_time,0,1,2,3,4,5,6,7,...,245,246,247,248,249,250,251,252,vas_id,target
0,2046132,1534712400,300.820029,1599.480888,286.879214,1585.013202,281.461754,1563.90821,-16.08618,654.013903,...,-25.996269,-35.630448,-295.747724,-17.832889,-0.694428,-4.175933,-0.45614,0.0,2.0,0.0
1,2050810,1540760400,-86.209971,91.820888,-84.480786,110.333202,-89.898246,89.22821,-16.08618,-65.076097,...,-23.996269,190.369552,-286.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0,1.0,0.0
2,2070757,1540760400,-96.799971,-408.179112,-110.740786,-460.786798,-114.038246,-479.77179,-16.08618,-65.076097,...,-21.996269,-37.630448,-151.747724,-24.832889,0.305572,-12.175933,-0.45614,1.0,1.0,0.0
3,2071522,1544994000,-94.939971,-363.699112,-108.880786,-411.226798,-114.298246,-432.33179,-16.08618,-65.076097,...,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0,1.0,0.0
4,2075318,1533502800,-75.639971,669.690888,-89.580786,732.343202,-94.998246,736.65821,-16.08618,782.383903,...,-25.996269,-37.630448,-167.747724,-14.832889,2.305572,-4.175933,-0.45614,0.0,2.0,0.0


In [167]:
# data_train_with_features[data_train_with_features['buy_time'] > 1543502800].shape[0].compute()

# #result: 303045

In [114]:
# data_train_with_features[data_train_with_features['buy_time'] < 1543502800].shape[0].compute()

# #result: 557007

In [174]:
# data_train_with_features.loc[(data_train_with_features['buy_time'] < 1543502800) & (data_train_with_features['buy_time'] > 1523502800)].shape[0].compute()

# Display the leading 1000 rows of the merged training frame.
data_train_with_features.head(n=1000)

Unnamed: 0,id,buy_time,0,1,2,3,4,5,6,7,...,245,246,247,248,249,250,251,252,vas_id,target
0,2046132,1534712400,300.820029,1599.480888,286.879214,1585.013202,281.461754,1563.90821,-16.08618,654.013903,...,-25.996269,-35.630448,-295.747724,-17.832889,-0.694428,-4.175933,-0.45614,0.0,2.0,0.0
1,2050810,1540760400,-86.209971,91.820888,-84.480786,110.333202,-89.898246,89.22821,-16.08618,-65.076097,...,-23.996269,190.369552,-286.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0,1.0,0.0
2,2070757,1540760400,-96.799971,-408.179112,-110.740786,-460.786798,-114.038246,-479.77179,-16.08618,-65.076097,...,-21.996269,-37.630448,-151.747724,-24.832889,0.305572,-12.175933,-0.45614,1.0,1.0,0.0
3,2071522,1544994000,-94.939971,-363.699112,-108.880786,-411.226798,-114.298246,-432.33179,-16.08618,-65.076097,...,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0,1.0,0.0
4,2075318,1533502800,-75.639971,669.690888,-89.580786,732.343202,-94.998246,736.65821,-16.08618,782.383903,...,-25.996269,-37.630448,-167.747724,-14.832889,2.305572,-4.175933,-0.45614,0.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,3946175,1544994000,-79.289971,-365.479112,-93.230786,-418.086798,-98.648246,-439.19179,-16.08618,-65.076097,...,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0,1.0,0.0
1496,3946310,1533502800,-96.799971,-103.949112,-110.740786,-156.556798,-116.158246,-177.66179,-16.08618,-65.076097,...,-25.996269,-37.630448,275.252276,-21.832889,-0.694428,-8.175933,-0.45614,0.0,2.0,0.0
1497,3946954,1535317200,-74.229971,-302.189112,-88.170786,-354.796798,-93.588246,-375.90179,-16.08618,-65.076097,...,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,1.0,2.0,0.0
1498,3947390,1546203600,-84.149971,-145.519112,-98.090786,-172.706798,-103.508246,-193.81179,-16.08618,-65.076097,...,-25.996269,-36.630448,-279.747724,-24.832889,-0.694428,-12.175933,-0.45614,0.0,5.0,0.0


In [177]:
# Display the trailing 1500 rows of the merged training frame.
data_train_with_features.tail(n=1500)

Unnamed: 0,id,buy_time,0,1,2,3,4,5,6,7,...,245,246,247,248,249,250,251,252,vas_id,target
909,1708105,1544994000,386.250029,74.870888,372.309214,22.263202,366.891754,1.15821,466.96382,417.973903,...,-25.996269,-29.630448,942.252276,-25.832889,-0.694428,-12.175933,-0.45614,0.0,1.0,0.0
910,1709964,1546808400,129.010029,502.710888,122.569214,515.893202,117.151754,494.78821,-16.08618,-65.076097,...,-24.996269,-25.630448,-186.747724,-25.832889,-0.694428,-12.175933,-0.45614,1.0,2.0,0.0
911,1716742,1533502800,78.470029,-25.289112,64.529214,-60.946798,59.111754,-82.05179,-16.08618,-65.076097,...,-25.996269,-37.630448,-110.747724,2.167111,-0.694428,15.824067,-0.45614,0.0,2.0,0.0
912,1718697,1543784400,51.530029,191.070888,37.589214,168.123202,32.171754,147.01821,61.05382,249.413903,...,119.003731,84.369552,1622.252276,235.167111,-0.694428,-12.175933,-0.45614,1.0,5.0,0.0
913,1719033,1537131600,-62.899971,-27.699112,-76.840786,-76.066798,-82.258246,-97.17179,-16.08618,181.903903,...,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2404,3513825,1546808400,-9.299971,-231.429112,26.779214,-183.566798,21.361754,-204.67179,-16.08618,-65.076097,...,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,1.0,1.0,0.0
2405,3513869,1548018000,82.370029,-123.429112,155.939214,-88.526798,150.521754,-109.63179,-16.08618,-65.076097,...,-23.996269,-37.630448,-271.747724,-22.832889,-0.694428,-12.175933,-0.45614,0.0,1.0,0.0
2406,3517434,1548018000,-96.799971,-284.349112,-100.740786,-274.796798,-106.158246,-295.90179,-16.08618,-65.076097,...,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0,2.0,0.0
2407,3519714,1546808400,167.280029,110.140888,153.339214,57.533202,147.921754,36.42821,-2.00618,-50.996097,...,-14.996269,7.369552,-180.747724,-19.832889,0.305572,-12.175933,-0.45614,0.0,2.0,0.0


In [None]:
# X_train = data_train_with_features[data_train_with_features['buy_time'] < 1543502800].drop(columns='target')
# y_train = data_train_with_features[data_train_with_features['buy_time'] < 1543502800]['target']

# X_train_test = data_train_with_features[data_train_with_features['buy_time'] > 1543502800].drop(columns='target')
# y_train_test = data_train_with_features[data_train_with_features['buy_time'] > 1543502800]['target']

In [211]:
# Small development split used only for feature selection: the leading rows act
# as the fitting sample and the trailing rows as the hold-out sample.
# head()/tail() on the dask frame are each evaluated once here instead of twice,
# so the merge behind data_train_with_features is not recomputed per column set.
TRAIN_SAMPLE_ROWS = 1000
HOLDOUT_SAMPLE_ROWS = 1500

train_sample = data_train_with_features.head(TRAIN_SAMPLE_ROWS)
holdout_sample = data_train_with_features.tail(HOLDOUT_SAMPLE_ROWS)

X_train = train_sample.drop(columns='target')
y_train = train_sample['target']

X_train_test = holdout_sample.drop(columns='target')
y_train_test = holdout_sample['target']

In [213]:
# All candidate column names of the fitting sample, and the midpoint of that list.
feature_names = X_train.columns.tolist()
lim = len(feature_names) // 2
print(feature_names)

['id', 'buy_time', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155',

In [182]:
# a_choice = feature_names[2:lim]
# b_choice = feature_names[lim+2:]
# a = choice(a_choice)
# b = choice(b_choice)

In [208]:
# ['id'] + feature_names[lim+2:]

In [214]:
# SGD-trained multilayer perceptron with a fixed seed.
model = MLPClassifier(
    solver='sgd',
    alpha=1e-6,
    max_iter=50,
    random_state=42,
)

In [219]:
def features_selector(model, X_train, y_train, X_train_test, y_true,
                      n_iter=150, scorer=None):
    """Pick the contiguous window of columns that scores best on a hold-out set.

    Iteration ``i`` evaluates the column slice
    ``feature_names[2 + i : int(len(feature_names) - (i - 1) / 2)]``: the window
    starts after the first two columns and shrinks from both ends, one column
    per step on the left and one column every second step on the right.  For
    each window the model is re-fitted on ``X_train`` and scored on
    ``X_train_test``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Re-fitted in place for every window; after the call it holds the fit
        of the LAST evaluated window, not of the best one.
    X_train, y_train : DataFrame, array-like
        Fitting sample; the columns of ``X_train`` define the candidates.
    X_train_test, y_true : DataFrame, array-like
        Hold-out sample and its true labels.
    n_iter : int, default 150
        Maximum number of windows to try.
    scorer : callable ``(y_true, y_pred) -> float``, optional
        Higher is better.  Defaults to macro-averaged ``f1_score``.

    Returns
    -------
    (best_score, best_features) : tuple
        Best score seen and the column names of the window that achieved it.
        ``(0, [])`` when no window could be evaluated.
    """
    if scorer is None:
        score_fn = lambda y_ref, y_hat: f1_score(y_ref, y_hat, average='macro')
    else:
        score_fn = scorer

    feature_names = list(X_train.columns)
    n_features = len(feature_names)
    best_features = []
    best_score = 0

    for i in range(n_iter):
        a = 2 + i
        b = int(n_features - (i - 1) / 2)
        window = feature_names[a:b]
        if not window:
            # The start only moves right and the end only moves left, so every
            # later window is empty as well.
            break

        model.fit(X_train[window], y_train)
        y_pred = model.predict(X_train_test[window])
        score = score_fn(y_true, y_pred)

        # Record the window together with its score.  Previously the window
        # was overwritten on every iteration, so the function always returned
        # the last window regardless of how it scored.  The first evaluated
        # window is always accepted so a usable column list is returned even
        # when every score is 0.
        if not best_features or score > best_score:
            best_score = score
            best_features = window

    return best_score, best_features

In [197]:
# print(features_selector(model, X_train, y_train, X_train_test, y_train_test))

In [216]:
# Run the window search on the development split and keep the returned column list.
selection = features_selector(model, X_train, y_train, X_train_test, y_train_test)
feature_names = selection[1]

In [220]:
# Rebuild the training inputs from the FULL merged frame, restricted to the
# selected columns (this replaces the small development sample).
y_train = data_train_with_features['target']
X_train = data_train_with_features[feature_names]

In [223]:
# Final fit on the full training data with the selected columns.
model.fit(X=X_train, y=y_train)

MLPClassifier(alpha=1e-06, max_iter=50, random_state=42, solver='sgd')

In [224]:
# Attach the offered service (vas_id) to each test feature row.
# NOTE(review): joined on `id` alone, so the resulting row order and row count
# need not match data_test — confirm before aligning predictions by position.
test_offers = data_test[['id', 'vas_id']].reset_index(drop=True)
data_test_with_features = test_features.merge(test_offers, on='id')

In [226]:
# Preview the merged test frame.
data_test_with_features.head(n=5)

Unnamed: 0,id,buy_time,0,1,2,3,4,5,6,7,...,244,245,246,247,248,249,250,251,252,vas_id
0,2522732,1539550800,-93.749971,-64.949112,-99.210786,-100.596798,-104.628246,-103.92179,-16.08618,-65.076097,...,-613.770792,-17.996269,-36.630448,-212.747724,2.167111,-0.694428,-12.175933,-0.45614,1.0,5.0
1,3504271,1548018000,386.530029,75.150888,372.589214,39.133202,367.171754,18.02821,-16.08618,-65.076097,...,-545.770792,-25.996269,-37.630448,-231.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0,5.0
2,3556316,1539550800,-96.799971,-183.599112,-110.740786,-236.206798,-116.158246,-257.31179,-16.08618,-65.076097,...,-534.770792,-25.996269,-37.630448,-47.747724,-22.832889,0.305572,-11.175933,-0.45614,0.0,2.0
3,3578987,1532898000,81.140029,32.440888,70.589214,-13.386798,65.171754,-34.49179,-16.08618,-59.146097,...,-578.770792,-25.996269,-37.630448,-111.747724,41.167111,-0.694428,54.824067,-0.45614,0.0,2.0
4,3662585,1533502800,-96.799971,-372.579112,-110.740786,-425.186798,-116.158246,-446.29179,-16.08618,-65.076097,...,-607.770792,-25.996269,-29.630448,-255.747724,-22.832889,-0.694428,-9.175933,-0.45614,1.0,2.0


In [227]:
# X_test already holds exactly the selected columns, so it is passed to the
# model as is.
X_test = data_test_with_features[feature_names]
y_pred = model.predict(X_test)

In [230]:
# Wrap the raw predictions in a single-column DataFrame (column label 0).
y_pred = pd.DataFrame(data=y_pred)

In [238]:
# Give the prediction column its final name.
y_pred = y_pred.rename(columns={0: "target"})

In [239]:
# The predictions were produced from data_test_with_features, so y_pred follows
# THAT frame's row order and row count, not data_test's.  Merging y_pred into
# data_test without a key pairs rows by position and assigns predictions to the
# wrong customers.  Re-attach the (id, vas_id) keys to the predictions and join
# on them instead.
prediction_keys = (
    data_test_with_features[['id', 'vas_id']]
    .compute()
    .reset_index(drop=True)
)
prediction_keys['target'] = y_pred['target'].to_numpy()

# NOTE(review): when one (id, vas_id) pair received several predictions (several
# feature rows for the same id) the first one is kept — confirm this choice.
prediction_keys = prediction_keys.drop_duplicates(subset=['id', 'vas_id'])

# Left join keeps every test row; rows without a feature record get NaN target.
result = data_test.merge(prediction_keys, on=['id', 'vas_id'], how='left')

In [241]:
# Preview the submission frame.
result.head(n=5)

Unnamed: 0,id,vas_id,buy_time,target
0,3130519,2.0,1548018000,0.0
1,2000860,4.0,1548018000,0.0
2,1099444,2.0,1546808400,0.0
3,1343255,5.0,1547413200,0.0
4,1277040,2.0,1546808400,0.0


In [251]:
# Materialise the dask result and write it out without the index column.
answers = result.compute()
answers.to_csv('answers_test.csv', index=False)