In [98]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import KFold
import warnings
import time
import sys
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import BayesianRidge
warnings.simplefilter(action='ignore', category=FutureWarning)
import gc

In [99]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [100]:
new_transactions = pd.read_csv('new_merchant_transactions.csv', parse_dates=['purchase_date'])
historical_transactions = pd.read_csv('historical_transactions.csv', parse_dates=['purchase_date'])

def binarize(df):
    for col in ['authorized_flag', 'category_1']:
        df[col] = df[col].map({'Y':1, 'N':0})
    return df

historical_transactions = binarize(historical_transactions)
new_transactions = binarize(new_transactions)

In [101]:
def read_data(input_file):
    df = pd.read_csv(input_file)
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])
    df['elapsed_time'] = (datetime.date(2018, 2, 1) - df['first_active_month'].dt.date).dt.days
    return df

train = read_data('train.csv')
test = read_data('test.csv')

target = train['target']
del train['target']
gc.collect()

779

In [102]:
new_transactions = pd.read_csv('new_merchant_transactions.csv', parse_dates=['purchase_date'])
historical_transactions = pd.read_csv('historical_transactions.csv', parse_dates=['purchase_date'])

def binarize(df):
    for col in ['authorized_flag', 'category_1']:
        df[col] = df[col].map({'Y':1, 'N':0})
    return df

historical_transactions = binarize(historical_transactions)
new_transactions = binarize(new_transactions)

In [103]:
historical_transactions = pd.get_dummies(historical_transactions, columns=['category_2', 'category_3'])
new_transactions = pd.get_dummies(new_transactions, columns=['category_2', 'category_3'])

historical_transactions = reduce_mem_usage(historical_transactions)
new_transactions = reduce_mem_usage(new_transactions)

agg_fun = {'authorized_flag': ['sum', 'mean']}
auth_mean = historical_transactions.groupby(['card_id']).agg(agg_fun)
auth_mean.columns = ['_'.join(col).strip() for col in auth_mean.columns.values]
auth_mean.reset_index(inplace=True)

authorized_transactions = historical_transactions[historical_transactions['authorized_flag'] == 1]
historical_transactions = historical_transactions[historical_transactions['authorized_flag'] == 0]
gc.collect()

Mem. usage decreased to 1304.89 Mb (54.8% reduction)
Mem. usage decreased to 84.24 Mb (56.7% reduction)


50

In [104]:
historical_transactions['purchase_month'] = historical_transactions['purchase_date'].dt.month
authorized_transactions['purchase_month'] = authorized_transactions['purchase_date'].dt.month
new_transactions['purchase_month'] = new_transactions['purchase_date'].dt.month
gc.collect()

19

In [165]:
agg_fun = {'state_id': ['count']}
hist=historical_transactions.groupby(['card_id','state_id'])['state_id'].agg(agg_fun)
hist.columns = ['_'.join(col).strip() for col in hist.columns.values]

In [174]:
hist.reset_index(inplace=True)

In [163]:
hist=pd.read_csv('city_id/hist.csv')

In [168]:
hist.head()

Unnamed: 0,card_id,state_id,state_id_count
0,C_ID_00007093c1,-1,4
1,C_ID_00007093c1,2,31
2,C_ID_0001238066,9,1
3,C_ID_0001238066,20,2
4,C_ID_0001506ef0,19,4


In [169]:
agg_func = {'state_id_count': ['max','min','mean']}

In [170]:
hist=hist.groupby(['card_id'])['state_id_count'].agg(agg_func)

In [172]:
hist.columns = ['_'.join(col).strip() for col in hist.columns.values]

In [178]:
hist.head()

Unnamed: 0,card_id,hist_state_id_count_max,hist_state_id_count_min,hist_state_id_count_mean
0,C_ID_00007093c1,31,4,17.5
1,C_ID_0001238066,2,1,1.5
2,C_ID_0001506ef0,4,4,4.0
3,C_ID_0001793786,13,3,6.75
4,C_ID_000183fdda,7,7,7.0


In [177]:
hist.columns = ['hist_' + c if c != 'card_id' else c for c in hist.columns]

In [181]:
agg_fun = {'city_id': ['count']}
hist_city=historical_transactions.groupby(['card_id','city_id'])['city_id'].agg(agg_fun)
hist_city.columns = ['_'.join(col).strip() for col in hist_city.columns.values]

In [182]:
hist_city.reset_index(inplace=True)

In [183]:
hist_city.head()

Unnamed: 0,card_id,city_id,city_id_count
0,C_ID_00007093c1,-1,4
1,C_ID_00007093c1,76,5
2,C_ID_00007093c1,244,26
3,C_ID_0001238066,149,2
4,C_ID_0001238066,213,1


In [184]:
agg_func = {'city_id_count': ['max','min','mean']}

In [185]:
hist_city=hist_city.groupby(['card_id'])['city_id_count'].agg(agg_func)

In [186]:
hist_city.columns = ['_'.join(col).strip() for col in hist_city.columns.values]

In [187]:
hist_city.reset_index(inplace=True)

In [188]:
hist_city.columns = ['hist_' + c if c != 'card_id' else c for c in hist_city.columns]

In [228]:
hist_city.head()

Unnamed: 0,card_id,hist_city_id_count_max,hist_city_id_count_min,hist_city_id_count_mean
0,C_ID_00007093c1,26,4,11.666667
1,C_ID_0001238066,2,1,1.5
2,C_ID_0001506ef0,4,4,4.0
3,C_ID_0001793786,11,1,3.375
4,C_ID_000183fdda,7,7,7.0


In [190]:
hist.to_csv('city_id/hist.csv',index=False)

In [191]:
hist_city.to_csv('city_id/hist_city.csv',index=False)

In [246]:
agg_fun = {'state_id': ['count']}
auth=authorized_transactions.groupby(['card_id','state_id'])['state_id'].agg(agg_fun)
auth.columns = ['_'.join(col).strip() for col in auth.columns.values]

In [247]:
auth.reset_index(inplace=True)

In [248]:
agg_func = {'state_id_count': ['max','min','mean']}

In [249]:
auth=auth.groupby(['card_id'])['state_id_count'].agg(agg_func)

In [250]:
auth.reset_index(inplace=True)

In [252]:
auth.columns = ['_'.join(col).strip() for col in auth.columns.values]

In [256]:
auth.head()

Unnamed: 0,card_id,auth_state_id_count_max,auth_state_id_count_min,auth_state_id_count_mean
0,C_ID_00007093c1,89,1,38.0
1,C_ID_0001238066,92,1,20.0
2,C_ID_0001506ef0,60,2,31.0
3,C_ID_0001793786,101,8,47.25
4,C_ID_000183fdda,121,1,19.571429


In [254]:
auth.rename(columns={'card_id_':'card_id'},inplace=True)

In [255]:
auth.columns = ['auth_' + c if c != 'card_id' else c for c in auth.columns]

In [257]:
agg_fun = {'city_id': ['count']}
auth_city=authorized_transactions.groupby(['card_id','city_id'])['city_id'].agg(agg_fun)
auth_city.columns = ['_'.join(col).strip() for col in auth_city.columns.values]

In [258]:
auth_city.reset_index(inplace=True)

In [259]:
agg_func = {'city_id_count': ['max','min','mean']}

In [260]:
auth_city=auth_city.groupby(['card_id'])['city_id_count'].agg(agg_func)

In [261]:
auth_city.reset_index(inplace=True)

In [262]:
auth_city.columns = ['_'.join(col).strip() for col in auth_city.columns.values]

In [280]:
auth_city.head()

Unnamed: 0,card_id,auth_city_id_count_max,auth_city_id_count_min,auth_city_id_count_mean
0,C_ID_00007093c1,74,1,28.5
1,C_ID_0001238066,40,1,6.666667
2,C_ID_0001506ef0,59,1,20.666667
3,C_ID_0001793786,101,4,21.0
4,C_ID_000183fdda,116,1,15.222222


In [264]:
auth_city.rename(columns={'card_id_':'card_id'},inplace=True)

In [266]:
auth_city.columns = ['auth_' + c if c != 'card_id' else c for c in auth_city.columns]

In [268]:
auth.to_csv('city_id/auth.csv',index=False)

In [269]:
auth_city.to_csv('city_id/auth_city.csv',index=False)

In [298]:
agg_fun = {'state_id': ['count']}
new=new_transactions.groupby(['card_id','state_id'])['state_id'].agg(agg_fun)
new.columns = ['_'.join(col).strip() for col in new.columns.values]

In [299]:
new.reset_index(inplace=True)

In [300]:
agg_func = {'state_id_count': ['max','min','mean']}

In [301]:
new=new.groupby(['card_id'])['state_id_count'].agg(agg_func)

In [159]:
hist.head()

Unnamed: 0_level_0,state_id_count,state_id_count,state_id_count
Unnamed: 0_level_1,max,min,mean
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
C_ID_00007093c1,31,4,17.5
C_ID_0001238066,2,1,1.5
C_ID_0001506ef0,4,4,4.0
C_ID_0001793786,13,3,6.75
C_ID_000183fdda,7,7,7.0


In [302]:
new.reset_index(inplace=True)

In [153]:
hist.head()

Unnamed: 0_level_0,state_id_count_max,state_id_count_min,state_id_count_mean
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C_ID_00007093c1,31,4,17.5
C_ID_0001238066,2,1,1.5
C_ID_0001506ef0,4,4,4.0
C_ID_0001793786,13,3,6.75
C_ID_000183fdda,7,7,7.0


In [306]:
new.head()

Unnamed: 0,card_id,state_id_count_max,state_id_count_min,state_id_count_mean
0,C_ID_00007093c1,1,1,1.0
1,C_ID_0001238066,19,1,6.5
2,C_ID_0001506ef0,2,2,2.0
3,C_ID_0001793786,15,1,6.2
4,C_ID_000183fdda,10,1,5.5


In [304]:
new.columns = ['_'.join(col).strip() for col in new.columns.values]

In [305]:
new.rename(columns={'card_id_':'card_id'},inplace=True)

In [307]:
new.columns = ['new_' + c if c != 'card_id' else c for c in new.columns]

In [308]:
new.head()

Unnamed: 0,card_id,new_state_id_count_max,new_state_id_count_min,new_state_id_count_mean
0,C_ID_00007093c1,1,1,1.0
1,C_ID_0001238066,19,1,6.5
2,C_ID_0001506ef0,2,2,2.0
3,C_ID_0001793786,15,1,6.2
4,C_ID_000183fdda,10,1,5.5


In [284]:
agg_fun = {'city_id': ['count']}
new_city=authorized_transactions.groupby(['card_id','city_id'])['city_id'].agg(agg_fun)
new_city.columns = ['_'.join(col).strip() for col in new_city.columns.values]

In [285]:
new_city.reset_index(inplace=True)

In [286]:
agg_func = {'city_id_count': ['max','min','mean']}

In [287]:
new_city=new_city.groupby(['card_id'])['city_id_count'].agg(agg_func)

In [288]:
new_city.reset_index(inplace=True)

In [289]:
new_city.columns = ['_'.join(col).strip() for col in new_city.columns.values]

In [294]:
new_city.rename(columns={'card_id_':'card_id'},inplace=True)

In [296]:
new_city.columns = ['new_' + c if c != 'card_id' else c for c in new_city.columns]

In [310]:
new_city.head()

Unnamed: 0,card_id,new_city_id_count_max,new_city_id_count_min,new_city_id_count_mean
0,C_ID_00007093c1,74,1,28.5
1,C_ID_0001238066,40,1,6.666667
2,C_ID_0001506ef0,59,1,20.666667
3,C_ID_0001793786,101,4,21.0
4,C_ID_000183fdda,116,1,15.222222


In [311]:
new.to_csv('city_id/new.csv',index=False)
new_city.to_csv('city_id/new_city.csv',index=False)

In [312]:
new.head()

Unnamed: 0,card_id,new_state_id_count_max,new_state_id_count_min,new_state_id_count_mean
0,C_ID_00007093c1,1,1,1.0
1,C_ID_0001238066,19,1,6.5
2,C_ID_0001506ef0,2,2,2.0
3,C_ID_0001793786,15,1,6.2
4,C_ID_000183fdda,10,1,5.5


In [198]:
auth.head()

Unnamed: 0,card_id_,state_id_count_max,state_id_count_min,state_id_count_mean
0,C_ID_00007093c1,89,1,38.0
1,C_ID_0001238066,92,1,20.0
2,C_ID_0001506ef0,60,2,31.0
3,C_ID_0001793786,101,8,47.25
4,C_ID_000183fdda,121,1,19.571429


In [97]:
authorized_transactions

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,...,subsector_id,category_2_1.0,category_2_2.0,category_2_3.0,category_2_4.0,category_2_5.0,category_3_A,category_3_B,category_3_C,purchase_month
0,1,C_ID_4e6213e9bc,88,0,0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,...,37,1,0,0,0,0,1,0,0,6
1,1,C_ID_4e6213e9bc,88,0,0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,...,16,1,0,0,0,0,1,0,0,7
2,1,C_ID_4e6213e9bc,88,0,0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,...,37,1,0,0,0,0,1,0,0,8
3,1,C_ID_4e6213e9bc,88,0,0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,...,34,1,0,0,0,0,1,0,0,9
4,1,C_ID_4e6213e9bc,88,0,0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,...,37,1,0,0,0,0,1,0,0,3
5,1,C_ID_4e6213e9bc,333,0,0,80,M_ID_50af771f8d,0,-0.734887,2018-02-24 08:45:05,...,37,1,0,0,0,0,1,0,0,2
6,1,C_ID_4e6213e9bc,88,0,0,278,M_ID_5e8220e564,-11,-0.716855,2017-03-21 00:10:51,...,37,1,0,0,0,0,1,0,0,3
7,1,C_ID_4e6213e9bc,3,0,0,80,M_ID_9d41786a50,-3,-0.657049,2017-11-18 20:05:55,...,37,1,0,0,0,0,1,0,0,11
8,1,C_ID_4e6213e9bc,88,0,0,80,M_ID_979ed661fc,-8,-0.737967,2017-06-01 22:02:56,...,37,1,0,0,0,0,1,0,0,6
9,1,C_ID_4e6213e9bc,88,0,0,80,M_ID_74ba14b5fc,-11,-0.715352,2017-03-16 15:41:22,...,37,1,0,0,0,0,1,0,0,3


In [136]:
new_transactions.columns

Index(['authorized_flag', 'card_id', 'city_id', 'category_1', 'installments',
       'merchant_category_id', 'merchant_id', 'month_lag', 'purchase_amount',
       'purchase_date', 'state_id', 'subsector_id', 'category_2_1.0',
       'category_2_2.0', 'category_2_3.0', 'category_2_4.0', 'category_2_5.0',
       'category_3_A', 'category_3_B', 'category_3_C', 'purchase_month'],
      dtype='object')

In [75]:
auth_city

Unnamed: 0,card_id_,city_id_count_max,city_id_count_min,city_id_count_mean
0,C_ID_00007093c1,89,1,38.000000
1,C_ID_0001238066,92,1,20.000000
2,C_ID_0001506ef0,60,2,31.000000
3,C_ID_0001793786,101,8,47.250000
4,C_ID_000183fdda,121,1,19.571429
5,C_ID_00024e244b,50,3,26.500000
6,C_ID_0002709b5a,49,1,16.750000
7,C_ID_00027503e2,22,2,12.000000
8,C_ID_000298032a,27,1,14.000000
9,C_ID_0002ba3c2e,54,1,27.500000


In [67]:
auth

Unnamed: 0,card_id_,state_id_count_max,state_id_count_min,state_id_count_mean
0,C_ID_00007093c1,89,1,38.000000
1,C_ID_0001238066,92,1,20.000000
2,C_ID_0001506ef0,60,2,31.000000
3,C_ID_0001793786,101,8,47.250000
4,C_ID_000183fdda,121,1,19.571429
5,C_ID_00024e244b,50,3,26.500000
6,C_ID_0002709b5a,49,1,16.750000
7,C_ID_00027503e2,22,2,12.000000
8,C_ID_000298032a,27,1,14.000000
9,C_ID_0002ba3c2e,54,1,27.500000


In [59]:
auth

Unnamed: 0_level_0,Unnamed: 1_level_0,state_id
Unnamed: 0_level_1,Unnamed: 1_level_1,count
card_id,state_id,Unnamed: 2_level_2
C_ID_00007093c1,-1,24
C_ID_00007093c1,2,89
C_ID_00007093c1,21,1
C_ID_0001238066,-1,8
C_ID_0001238066,5,1
C_ID_0001238066,9,92
C_ID_0001238066,15,2
C_ID_0001238066,20,16
C_ID_0001238066,21,1
C_ID_0001506ef0,9,2
