In [1]:
import pandas as pd
import dask.dataframe as dd

import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.metrics import f1_score  # (average=’macro’)
from sklearn.model_selection import train_test_split

import datetime

### Read data

In [2]:
data_train = pd.read_csv('../data/data_train.csv')
data_test = pd.read_csv('../data/data_test.csv')

In [3]:
%%time
data_features = dd.read_csv('../data/features.csv', sep='\t')

CPU times: total: 15.6 ms
Wall time: 34.5 ms


In [4]:
data_features.head()

Unnamed: 0.1,Unnamed: 0,id,buy_time,0,1,2,3,4,5,6,...,243,244,245,246,247,248,249,250,251,252
0,0,2013026,1531688400,18.910029,46.980888,4.969214,-1.386798,3.791754,-14.01179,-16.08618,...,-977.373846,-613.770792,-25.996269,-37.630448,-301.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0
1,1,2014722,1539550800,36.690029,152.400888,448.069214,563.833202,463.841754,568.99821,-16.08618,...,-891.373846,-544.770792,-20.996269,48.369552,80.252276,-13.832889,-0.694428,-1.175933,-0.45614,0.0
2,2,2015199,1545598800,-67.019971,157.050888,-63.180786,178.103202,-68.598246,156.99821,3.51382,...,-977.373846,-613.770792,-12.996269,-37.630448,10829.252276,-25.832889,-0.694428,-12.175933,-0.45614,0.0
3,3,2021765,1534107600,7.010029,150.200888,-6.930786,216.213202,76.621754,351.84821,-16.08618,...,-973.373846,-613.770792,-23.996269,-37.630448,-205.747724,-24.832889,-0.694428,-11.175933,-0.45614,1.0
4,4,2027465,1533502800,-90.439971,134.220888,-104.380786,153.643202,-109.798246,132.53821,-16.08618,...,1643.626154,2007.229208,206.003731,-21.630448,6667.252276,92.167111,-0.694428,49.824067,47.54386,0.0


In [5]:
data_train.head()

Unnamed: 0.1,Unnamed: 0,id,vas_id,buy_time,target
0,0,540968,8.0,1537131600,0.0
1,1,1454121,4.0,1531688400,0.0
2,2,2458816,1.0,1534107600,0.0
3,3,3535012,5.0,1535922000,0.0
4,4,1693214,1.0,1535922000,0.0


### Merge datasets

In [6]:
train_ids = list(data_train.id)
test_ids = list(data_test.id)

In [7]:
%%time
data_features_train = data_features[data_features.id.isin(train_ids)].compute()
data_features_test = data_features[data_features.id.isin(test_ids)].compute()

CPU times: total: 3min 25s
Wall time: 3min 29s


In [8]:
# sorting datasets to merge correctly
data_train = data_train.sort_values(by='id')
data_features_train = data_features_train.sort_values(by='id')

data_test = data_test.sort_values(by='id')
data_features_test = data_features_test.sort_values(by='id')

In [9]:
data_train = pd.merge_asof(data_train, data_features_train, on='id', by='buy_time', direction='nearest')
data_test = pd.merge_asof(data_test, data_features_test, on='id', by='buy_time', direction='nearest')

In [10]:
data_train.shape

(831653, 259)

In [11]:
data_test.shape

(71231, 258)

In [12]:
data_train.head(3)

Unnamed: 0,Unnamed: 0_x,id,vas_id,buy_time,target,Unnamed: 0_y,0,1,2,3,...,243,244,245,246,247,248,249,250,251,252
0,116,2,2.0,1545598800,0.0,2966746,-96.799971,229.530888,-110.740786,305.723202,...,2300.626154,1492.229208,-21.996269,-35.630448,368.252276,11.167111,7.305572,-12.175933,-0.45614,0.0
1,213,4,1.0,1533502800,0.0,1950967,-19.599971,-177.419112,-25.910786,-206.286798,...,-977.373846,-613.770792,-25.996269,-37.630448,-299.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0
2,499,15,1.0,1534107600,0.0,3079915,-96.799971,-336.159112,-110.740786,-329.456798,...,-977.373846,-613.770792,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0


### Explore datasets

In [13]:
# Check missing values
train_nan_val = 0
test_nan_val = 0

for col in data_train.columns:
    if data_train.shape[0] - data_train[col].isna().count() != 0:
        train_nan_val += 1

for col in data_test.columns:
    if data_test.shape[0] - data_test[col].isna().count() != 0:
        test_nan_val += 1
print(f"Missing values in train dataset: {train_nan_val}")
print(f"Missing values in test dataset: {test_nan_val}")

Missing values in train dataset: 0
Missing values in test dataset: 0


In [14]:
# distribution of the target variable
data_train.target.value_counts()

0.0    771467
1.0     60186
Name: target, dtype: int64

Сlasses are disbalanced.

### Prepare datasets

In [15]:
# drop "Unnamed: ..." columns
data_train.drop(columns=['Unnamed: 0_x', 'Unnamed: 0_y'], inplace=True)
data_test.drop(columns=['Unnamed: 0_x', 'Unnamed: 0_y'], inplace=True)

In [16]:
data_train.sort_values(by='buy_time', inplace=True)
data_test.sort_values(by='buy_time', inplace=True)

In [25]:
# choose date to split dataset
timestamp_sep = int(datetime.datetime.strptime('01.12.2018 00:00:00', '%d.%m.%Y %H:%M:%S').timestamp())
print(data_train[data_train.buy_time < timestamp_sep].shape[0] / data_train.shape[0], "% in train.")
print("timestamp val:", timestamp_sep)

0.7023037252315569 % in train.
timestamp val: 1543611600


In [29]:
df_train = data_train[data_train.buy_time < timestamp_sep]
df_test = data_train[data_train.buy_time >= timestamp_sep]

### Split datasets into train and test subsets 

In [30]:
X_train = df_train.drop(columns=['target'])
y_train = df_train.target
X_test = df_test.drop(columns=['target'])
y_test = df_test.target