In [1]:
import feather
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors
import dask.dataframe as dd
import os
import seaborn as sns


In [2]:
# Read the raw train and test tables from parquet. The paths are relative,
# so they resolve against the notebook's working directory.
train_path = "../data/raw/train.parquet"
test_path = "../data/raw/test.parquet"
train = pd.read_parquet(train_path)
test = pd.read_parquet(test_path)


In [3]:
# Load the training labels table; it is joined onto `train` by customer_ID.
labels_path = "../data/raw/train_labels.csv"
labels = pd.read_csv(labels_path)

In [4]:
# Attach the label column(s) from `labels` to every row of `train`.
# Both frames use the same key name, so `on=` replaces the redundant
# left_on/right_on pair. `validate="many_to_one"` makes pandas raise
# MergeError if `labels` ever holds a repeated customer_ID, which would
# otherwise silently duplicate training rows. `how="inner"` spells out the
# default: rows without a matching label are dropped.
# NOTE: this rebinds `train`; re-running the cell without reloading the data
# merges the labels a second time and yields suffixed duplicate columns.
train = train.merge(labels, on='customer_ID', how='inner', validate='many_to_one')

In [5]:
# Sanity check: (rows, columns) of `train` after the labels were merged in.
train.shape

(5531451, 191)

In [6]:
# Preview the first rows of the merged, statement-level training frame.
train.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0,0.008724,1.006838,0.009228,0.124035,0.0,0.004709,...,-1,-1,0,0,0.0,,0,0.00061,0,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0,0.004923,1.000653,0.006151,0.12675,0.0,0.002714,...,-1,-1,0,0,0.0,,0,0.005492,0,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,3,0.021655,1.009672,0.006815,0.123977,0.0,0.009423,...,-1,-1,0,0,0.0,,0,0.006986,0,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0,0.013683,1.0027,0.001373,0.117169,0.0,0.005531,...,-1,-1,0,0,0.0,,0,0.006527,0,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0,0.015193,1.000727,0.007605,0.117325,0.0,0.009312,...,-1,-1,0,0,0.0,,0,0.008126,0,0


In [7]:
# Columns that identify a row or hold the label; they are not model inputs.
non_feature_cols = ['customer_ID', 'S_2', 'target']

# Drop on the column Index rather than on the DataFrame: DataFrame.drop would
# build a new frame from the whole table just to read its column labels.
# Index.drop still raises KeyError if any of the names is missing.
features = train.columns.drop(non_feature_cols).to_list()

# Features handled as categoricals (aggregated with count / last / nunique).
cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

# Every remaining feature is treated as numeric. The set makes each
# membership test constant-time while `cat_cols` itself stays a list.
_cat_lookup = set(cat_cols)
num_features = [col for col in features if col not in _cat_lookup]

In [8]:
# Collapse the statement-level rows into one row per customer_ID.
by_customer = train.groupby("customer_ID")

# Numeric features: five summary statistics each. The resulting two-level
# column index is flattened to "<feature>_<stat>".
num_stats = ['mean', 'std', 'min', 'max', 'last']
train_num_agg = by_customer[num_features].agg(num_stats)
train_num_agg.columns = train_num_agg.columns.map("_".join)

# Categorical features: non-null count, last value, and number of distinct values.
cat_stats = ['count', 'last', 'nunique']
train_cat_agg = by_customer[cat_cols].agg(cat_stats)
train_cat_agg.columns = train_cat_agg.columns.map("_".join)

# Label: read from the final row of each customer, then indexed and sorted by
# customer_ID. `last` and `tail(1)` both depend on the current row order;
# no sort is applied here.
last_rows = by_customer.tail(1)
train_target = last_rows.set_index('customer_ID', drop=True).sort_index()["target"]

# NOTE: `train` is rebound to the aggregated frame, so the statement-level
# rows are no longer reachable under that name after this cell.
train = pd.concat([train_num_agg, train_cat_agg, train_target], axis=1)
train.to_pickle("../data/raw/train_agg.pkl", compression="gzip")

In [9]:
# Shape after aggregation: one row per customer_ID (the groupby key).
train.shape

(458913, 919)

In [11]:
# Per-customer aggregation of the test set. No label column is attached here.
test_by_customer = test.groupby("customer_ID")

# Numeric features: five summary statistics each, flattened to "<feature>_<stat>".
test_num_stats = ['mean', 'std', 'min', 'max', 'last']
test_num_agg = test_by_customer[num_features].agg(test_num_stats)
test_num_agg.columns = test_num_agg.columns.map('_'.join)

# Categorical features: non-null count, last value, and number of distinct values.
test_cat_stats = ['count', 'last', 'nunique']
test_cat_agg = test_by_customer[cat_cols].agg(test_cat_stats)
test_cat_agg.columns = test_cat_agg.columns.map('_'.join)

# NOTE: `test` is rebound to the aggregated frame before it is written out.
test = pd.concat([test_num_agg, test_cat_agg], axis=1)

test.to_pickle("../data/raw/test_agg.pkl", compression="gzip")

In [12]:
# Preview the aggregated training frame (indexed by customer_ID).
train.head()

Unnamed: 0_level_0,P_2_mean,P_2_std,P_2_min,P_2_max,P_2_last,D_39_mean,D_39_std,D_39_min,D_39_max,D_39_last,...,D_64_count,D_64_last,D_64_nunique,D_66_count,D_66_last,D_66_nunique,D_68_count,D_68_last,D_68_nunique,target
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0.933824,0.024194,0.86858,0.960384,0.934745,0.230769,0.83205,0,3,0,...,13,0,1,13,-1,1,13,6,1,0
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,0.89982,0.022119,0.861109,0.929122,0.880519,7.153846,6.743468,0,19,6,...,13,0,1,13,-1,1,13,6,1,0
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,0.878454,0.028911,0.79767,0.904482,0.880875,0.0,0.0,0,0,0,...,13,2,1,13,-1,1,13,6,1,0
000041bdba6ecadd89a52d11886e8eaaec9325906c9723355abb5ca523658edc,0.598969,0.020107,0.567442,0.623392,0.621776,1.538462,3.017046,0,9,0,...,13,0,1,13,-1,1,13,3,3,0
00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8ad51ca8b8c4a24cefed,0.891679,0.042325,0.805045,0.940382,0.8719,0.0,0.0,0,0,0,...,13,0,1,13,1,1,13,6,1,0
