In [1]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Read the prepared train and test tables (paths are relative to the working
# directory), then preview the first training rows.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_prepared_basic.csv', 'data/test_prepared_basic.csv')
)
train_data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Id,Week,V1,V2,V3,V4,V9,V11,...,V19,V20,V22,P8,P9,P10,P11,P12,P23,target
0,0,0,-6536978109522202983,0,-1.214669,-1.229582,-1.304097,-0.384279,1.61884,1.689373,...,-0.610111,-1.62784,-0.607143,-0.488522,-0.364388,0.614911,-1.841831,-1.681824,-0.402846,0.0
1,1,1,-6536978109522202983,1,-1.214669,-1.229582,-1.304097,-0.384279,1.61884,1.689373,...,-0.610111,-1.62784,-0.607143,-0.488522,-0.364388,0.233876,-0.237328,-1.681824,-0.402846,0.0
2,2,2,-6536978109522202983,2,-1.214669,-1.229582,-1.304097,-0.384279,1.61884,1.689373,...,-0.610111,-1.62784,-0.607143,-0.488522,-0.364388,0.760227,0.614045,-1.681824,-0.402846,0.0
3,3,3,-6536978109522202983,3,-1.214669,-1.229582,-1.304097,-0.384279,1.61884,1.689373,...,-0.610111,-1.62784,-0.607143,1.025732,2.235298,-0.062077,-0.499621,-1.878605,2.481332,0.0
4,4,4,-1744017237843019509,0,0.261222,0.15075,-0.159125,-0.067517,2.039992,-0.419042,...,-0.610111,-0.837594,-0.607143,-0.488522,-0.364388,0.468424,-0.025675,-1.381967,2.365924,0.0


In [3]:
# Remove the two "Unnamed" columns from the training table.
# NOTE(review): presumably row indices written by earlier to_csv calls - confirm.
train_data = train_data.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

In [4]:
# Remove the "Unnamed: 0" column from the test table.
# NOTE(review): presumably a row index written by an earlier to_csv call - confirm.
test_data = test_data.drop(columns=['Unnamed: 0'])

In [5]:
def id_group_transform(data, weeks=range(4)):
    """Reshape a long table (one row per Id and Week) into one row per Id.

    Columns whose name starts with "P" are treated as week-specific: for every
    week in ``weeks`` they are copied into new columns suffixed ``_W<week>``.
    All remaining columns (including "Id", "Week" and, when present, "target")
    are taken from the rows of the first week listed in ``weeks``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain "Id" and "Week" columns; column labels must be strings.
    weeks : iterable of int, optional
        Week values to spread into columns, in output order. Defaults to
        ``range(4)`` (weeks 0, 1, 2, 3).

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh default index; ``data`` is not modified.
        The per-week tables are combined with inner joins on "Id", so an Id
        that is missing from any requested week is dropped from the result.

    Raises
    ------
    ValueError
        If ``weeks`` is empty.
    """
    weeks = list(weeks)
    if not weeks:
        raise ValueError("weeks must contain at least one week")

    p_columns = [name for name in data.columns if name.startswith("P")]
    # Keep the original column order: the join key plus every P column.
    weekly_columns = [name for name in data.columns if name == "Id" or name.startswith("P")]

    # Week-independent part: rows of the first week without the P columns.
    res = data.loc[data["Week"] == weeks[0]].drop(columns=p_columns)

    for week in weeks:
        weekly = (
            data.loc[data["Week"] == week, weekly_columns]
            .rename(columns={name: f"{name}_W{week}" for name in p_columns})
        )
        res = res.merge(weekly, how="inner", on="Id")
    return res

In [6]:
# Reshape both tables to one row per Id, split the target off the training
# frame, and discard the 'Week' column that the reshaped frames still carry.
X_train, X_test = id_group_transform(train_data), id_group_transform(test_data)
y_train = X_train['target']
X_train = X_train.drop(columns=['target', 'Week'])
X_test = X_test.drop(columns=['Week'])
X_train.head()

Unnamed: 0,Id,V1,V2,V3,V4,V9,V11,V14,V15,V16,...,P24_W3,P25_W3,P26_W3,P27_W3,P8_W3,P9_W3,P10_W3,P11_W3,P12_W3,P23_W3
0,-6536978109522202983,-1.214669,-1.229582,-1.304097,-0.384279,1.61884,1.689373,0.844507,0.540733,1.68224,...,-1.830169,0.039483,0.720327,0.387509,1.025732,2.235298,-0.062077,-0.499621,-1.878605,2.481332
1,-1744017237843019509,0.261222,0.15075,-0.159125,-0.067517,2.039992,-0.419042,-1.029798,-1.380197,0.948993,...,-0.962976,0.698748,1.13325,1.017659,-0.488522,-0.364388,-0.039009,-0.114774,-0.926357,2.597932
2,-9187108666132882725,1.466966,1.713691,-1.304097,-0.946387,0.885337,-0.962336,-0.303839,-0.937923,-0.184209,...,1.081511,0.977721,-0.220856,-0.47012,-0.488522,-0.364388,0.11452,-0.783241,1.141776,-0.402846
3,-1259418257712246678,0.732093,0.632798,-0.67816,0.855476,0.436725,0.082047,-0.02103,-0.679664,-1.148976,...,1.454927,-1.721148,0.666257,1.265662,-0.488522,-0.364388,-1.021933,-0.063885,1.512694,-0.402846
4,7132677487852039192,-1.214669,-1.229582,-1.304097,-1.354,0.750737,1.297364,-1.029798,-1.380197,1.976676,...,-0.099432,0.880689,-1.221282,-1.96596,-0.488522,-0.364388,-1.257932,-1.513413,-0.014069,-0.402846


In [7]:
# Use 'Id' as the row index of both feature frames.
X_train, X_test = (frame.set_index('Id') for frame in (X_train, X_test))

In [8]:
def generate_deltas(df, n_params=27, n_weeks=4):
    """Add week-over-week and first-to-last differences of the P features.

    For every feature ``P1`` .. ``P<n_params>`` the following columns are
    added, in this order:

    * ``P<i>_W<w+1>-<w>`` = ``P<i>_W<w+1>`` - ``P<i>_W<w>`` for each pair of
      consecutive weeks ``w``, ``w+1``;
    * ``P<i>_W<last>-0`` = ``P<i>_W<last>`` - ``P<i>_W0`` with
      ``last = n_weeks - 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a column ``P<i>_W<w>`` for every feature ``i`` in
        ``1..n_params`` and every week ``w`` in ``0..n_weeks-1``.
    n_params : int, optional
        Number of P features (default 27).
    n_weeks : int, optional
        Number of weekly snapshots per feature (default 4, i.e. weeks 0-3).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; the new columns are added in place.

    Raises
    ------
    ValueError
        If ``n_weeks`` is smaller than 2 (no pair of weeks to compare).
    KeyError
        If an expected ``P<i>_W<w>`` column is missing.
    """
    if n_weeks < 2:
        raise ValueError("n_weeks must be at least 2")

    last_week = n_weeks - 1
    for param in range(1, n_params + 1):
        prefix = f'P{param}_W'
        # Consecutive-week changes.
        for week in range(last_week):
            df[f'{prefix}{week + 1}-{week}'] = df[f'{prefix}{week + 1}'] - df[f'{prefix}{week}']
        # Overall change from the first to the last week.
        df[f'{prefix}{last_week}-0'] = df[f'{prefix}{last_week}'] - df[f'{prefix}0']
    return df

In [9]:
# Append the week-to-week difference features to both frames and preview the result.
X_train, X_test = generate_deltas(X_train), generate_deltas(X_test)
X_train.head()

Unnamed: 0_level_0,V1,V2,V3,V4,V9,V11,V14,V15,V16,V17,...,P25_W3-2,P25_W3-0,P26_W1-0,P26_W2-1,P26_W3-2,P26_W3-0,P27_W1-0,P27_W2-1,P27_W3-2,P27_W3-0
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-6536978109522202983,-1.214669,-1.229582,-1.304097,-0.384279,1.61884,1.689373,0.844507,0.540733,1.68224,0.014685,...,-0.139155,0.323217,-0.399568,-1.078469,0.851133,-0.626903,0.202123,-0.837158,0.848628,0.213592
-1744017237843019509,0.261222,0.15075,-0.159125,-0.067517,2.039992,-0.419042,-1.029798,-1.380197,0.948993,1.2502,...,-0.131692,0.093221,0.63513,0.291939,1.012301,1.93937,0.565416,0.726801,-0.07708,1.215138
-9187108666132882725,1.466966,1.713691,-1.304097,-0.946387,0.885337,-0.962336,-0.303839,-0.937923,-0.184209,-0.481364,...,0.58306,0.019954,-0.27679,0.361082,0.56052,0.644812,-0.222499,0.425951,-0.212453,-0.009001
-1259418257712246678,0.732093,0.632798,-0.67816,0.855476,0.436725,0.082047,-0.02103,-0.679664,-1.148976,1.673578,...,0.0,0.0,-0.153241,0.589407,0.293758,0.729925,0.053212,0.09098,0.200677,0.344869
7132677487852039192,-1.214669,-1.229582,-1.304097,-1.354,0.750737,1.297364,-1.029798,-1.380197,1.976676,-1.193582,...,-0.066926,0.030087,0.310174,0.25277,-0.732799,-0.169856,-0.30738,0.0,0.0,-0.30738


In [10]:
def generate_divs(df, n_params=27, n_weeks=4, reference=None):
    """Add week-over-week and last-to-first ratios of the P features.

    Each weekly column is first shifted as ``x - min + 1`` and the ratios are
    taken between the shifted columns. For every feature ``P1`` ..
    ``P<n_params>`` the following columns are added, in this order:

    * ``P<i>_W<w+1>/<w>`` for each pair of consecutive weeks ``w``, ``w+1``;
    * ``P<i>_W<last>/0`` with ``last = n_weeks - 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a column ``P<i>_W<w>`` for every feature ``i`` in
        ``1..n_params`` and every week ``w`` in ``0..n_weeks-1``.
    n_params : int, optional
        Number of P features (default 27).
    n_weeks : int, optional
        Number of weekly snapshots per feature (default 4, i.e. weeks 0-3).
    reference : pandas.DataFrame, optional
        Frame whose column minima define the shift. Defaults to ``df`` itself,
        in which case every shifted value is >= 1. Passing the training frame
        when transforming another frame applies the same shift to both; values
        more than 1 below the reference minimum then produce non-positive
        shifted values (and therefore zero/negative denominators).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; the new columns are added in place.

    Raises
    ------
    ValueError
        If ``n_weeks`` is smaller than 2 (no pair of weeks to compare).
    KeyError
        If an expected ``P<i>_W<w>`` column is missing.
    """
    if n_weeks < 2:
        raise ValueError("n_weeks must be at least 2")

    minima_source = df if reference is None else reference
    last_week = n_weeks - 1
    for param in range(1, n_params + 1):
        prefix = f'P{param}_W'
        # Shift each weekly column once; every column is reused in several ratios.
        shifted = [
            df[f'{prefix}{week}'] - minima_source[f'{prefix}{week}'].min() + 1
            for week in range(n_weeks)
        ]
        for week in range(last_week):
            df[f'{prefix}{week + 1}/{week}'] = shifted[week + 1] / shifted[week]
        df[f'{prefix}{last_week}/0'] = shifted[last_week] / shifted[0]
    return df

In [11]:
# Append the week-to-week ratio features to both frames and display the training frame.
# NOTE(review): generate_divs shifts each column by the minimum of the frame it
# receives, so the train and test ratios are shifted by different amounts.
X_train, X_test = generate_divs(X_train), generate_divs(X_test)
X_train

Unnamed: 0_level_0,V1,V2,V3,V4,V9,V11,V14,V15,V16,V17,...,P25_W3/2,P25_W3/0,P26_W1/0,P26_W2/1,P26_W3/2,P26_W3/0,P27_W1/0,P27_W2/1,P27_W3/2,P27_W3/0
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-6536978109522202983,-1.214669,-1.229582,-1.304097,-0.384279,1.618840,1.689373,0.844507,0.540733,1.682240,0.014685,...,0.952012,1.132607,0.923548,0.776568,1.227068,0.880051,1.064373,0.810207,1.238488,1.068026
-1744017237843019509,0.261222,0.150750,-0.159125,-0.067517,2.039992,-0.419042,-1.029798,-1.380197,0.948993,1.250200,...,0.962920,1.028022,1.206677,1.078729,1.253068,1.631090,1.204233,1.278854,0.934339,1.438919
-9187108666132882725,1.466966,1.713691,-1.304097,-0.946387,0.885337,-0.962336,-0.303839,-0.937923,-0.184209,-0.481364,...,1.187130,1.005424,0.908150,1.131940,1.180942,1.213974,0.911172,1.275516,0.857334,0.996406
-1259418257712246678,0.732093,0.632798,-0.678160,0.855476,0.436725,0.082047,-0.021030,-0.679664,-1.148976,1.673578,...,1.000000,1.000000,0.959837,1.160941,1.069093,1.191305,1.013691,1.074582,0.999482,1.088729
7132677487852039192,-1.214669,-1.229582,-1.304097,-1.354000,0.750737,1.297364,-1.029798,-1.380197,1.976676,-1.193582,...,0.981758,1.008424,1.109690,1.080553,0.783879,0.939932,0.764889,1.202870,0.831345,0.764889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1725230195494199693,1.484947,0.632798,0.847232,0.784340,-1.137487,0.267028,-1.029798,0.212677,-0.551541,0.355025,...,1.000000,1.000000,0.943814,1.040430,0.793157,0.778859,1.043927,0.928384,1.232346,1.194347
-3951248822562299280,-0.154476,0.150750,-0.639985,-0.800899,-0.610984,-0.135951,-0.019277,-0.064148,-0.196778,-0.277898,...,1.341894,0.998789,1.302065,1.271565,0.744624,1.232844,0.614939,3.196576,0.511961,1.006362
5107134433121260865,-0.154476,0.150750,1.328129,-0.126426,1.131934,0.566143,-1.029798,-0.240007,0.327260,0.304397,...,1.062363,1.100758,1.052223,1.104892,1.112482,1.293364,1.098864,1.029986,1.163398,1.316750
2010738301694279951,0.768499,0.632798,-0.639233,1.893963,-0.420111,1.577818,0.237095,-0.277931,1.462554,-1.193582,...,0.936976,0.878815,1.242141,0.905193,0.945741,1.063370,1.347877,1.023204,1.105656,1.524868


In [None]:
# Assemble and persist the final tables.
#
# X_train is indexed by 'Id', while y_train still carries the 0..n-1 index of the
# reshaped frame it was taken from. pandas aligns a Series on index labels when it
# is assigned as a column, so attaching y_train before resetting the index would
# fill 'target' with NaN. Reset the index first and attach the target by position
# (both objects keep the row order of the reshaped training frame).
assert len(y_train) == len(X_train), "features and target must have the same number of rows"
df_train = X_train.reset_index(drop=True)
df_train['target'] = y_train.to_numpy()

# to_csv writes the index too: the positional index for the training table
# (Id is intentionally dropped above) and the 'Id' index for the test table.
df_train.to_csv('data/train_prepared_oneliner_w_deltas_n_divs.csv')
X_test.to_csv('data/test_prepared_oneliner_w_deltas_n_divs.csv')