In [1]:
import os
import pandas as pd
import numpy as np
from collections import Counter

# Plot libraries
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

# Random seed for reproducibility
seed = 202
np.random.seed(seed)

# Ignore warnings
import warnings
warnings.simplefilter('ignore')

# Garbage collector
import gc
gc.enable()

### Import the dataframes

#### Train and test

In [2]:
# Train and test
train_df = pd.read_csv("(1)train.csv")
test_df = pd.read_csv("(1)test.csv")
train_df.shape, test_df.shape

((201917, 6), (123623, 5))

In [3]:
train_df.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06-01,C_ID_92a2005557,5,2,1,-0.820283
1,2017-01-01,C_ID_3d0044924f,4,1,0,0.392913
2,2016-08-01,C_ID_d639edf6cd,2,2,0,0.688056
3,2017-09-01,C_ID_186d6a6901,4,3,0,0.142495
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,-0.159749


#### Transactions

In [4]:
transactions_df = pd.read_csv("(1)transactions.csv")
transactions_df.shape

(30910695, 19)

In [5]:
transactions_df.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,day_of_week,day_of_month,week_of_year,month,weekend
0,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25,1.0,16,37,6,25,25,6,1
1,True,C_ID_4e6213e9bc,88,False,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15,1.0,16,16,5,15,28,7,1
2,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09,1.0,16,37,2,9,32,8,0
3,True,C_ID_4e6213e9bc,88,False,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02,1.0,16,34,5,2,35,9,1
4,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10,1.0,16,37,4,10,10,3,0


#### Merchants

In [6]:
merchants_df = pd.read_csv("(1)merchants.csv")
merchants_df.shape

(334620, 22)

In [7]:
merchants_df.head()

Unnamed: 0,merchant_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
0,M_ID_838061e48c,8353,792,9,-0.057471,-0.057471,False,E,E,-0.4,...,-2.25,18.666667,6,-2.32,13.916667,12,False,242,9,1.0
1,M_ID_9339d880ad,3184,840,20,-0.057471,-0.057471,False,E,E,-0.72,...,-0.74,1.291667,6,-0.57,1.6875,12,False,22,16,1.0
2,M_ID_e726bbae1e,447,690,1,-0.057471,-0.057471,False,E,E,-82.13,...,-82.13,260.0,2,-82.13,260.0,2,False,-1,5,5.0
3,M_ID_c929bb59af,9514,2,20,-0.057471,-0.057471,False,E,E,69667.0,...,69667.0,1.0,3,69667.0,1.0,3,False,-1,20,5.0
4,M_ID_dd3ae3de10,30534,278,37,-0.057471,-0.057471,False,E,E,5180.0,...,9830.0,4.166667,6,15826.0,5.6,10,True,11,23,2.0


In [8]:
"""#substitute inf values with 2*max_value_in_column
for i in ['3', '6', '12']:
    max_val = max(merchants_df[merchants_df != np.inf]['avg_purchases_lag' + i])
    merchants_df['avg_purchases_lag' + i].replace([np.inf, -np.inf], 2*max_val)"""

"#substitute inf values with 2*max_value_in_column\nfor i in ['3', '6', '12']:\n    max_val = max(merchants_df[merchants_df != np.inf]['avg_purchases_lag' + i])\n    merchants_df['avg_purchases_lag' + i].replace([np.inf, -np.inf], 2*max_val)"

### Transactions and Merchants dataframes merging

In [9]:
print("Number of common merchant_id in merchants_df and transactions_df: %d" % (len(set(transactions_df.merchant_id.unique()).intersection(set(merchants_df.merchant_id.unique())))))
print("merchants_df samples: %d, transactions_df samples: %d" % (merchants_df.shape[0], transactions_df.shape[0]))

Number of common merchant_id in merchants_df and transactions_df: 334620
merchants_df samples: 334620, transactions_df samples: 30910695


In [10]:
for c in merchants_df.columns:
    if c not in transactions_df.columns:
        print(c)

merchant_group_id
numerical_1
numerical_2
most_recent_sales_range
most_recent_purchases_range
avg_sales_lag3
avg_purchases_lag3
active_months_lag3
avg_sales_lag6
avg_purchases_lag6
active_months_lag6
avg_sales_lag12
avg_purchases_lag12
active_months_lag12
category_4


Actually, the only meaningfull features to be merged are merchant_group_id, most_recent_sales_range and most_recent_purchases_range

In [11]:
# here we can include more features

In [12]:
merchants_df = merchants_df[['merchant_id', "merchant_group_id", "most_recent_sales_range", "most_recent_purchases_range"]]
merchants_df.columns = ['id', "merchant_group_id", "most_recent_sales_range", "most_recent_purchases_range"]
merchants_df.head()

Unnamed: 0,id,merchant_group_id,most_recent_sales_range,most_recent_purchases_range
0,M_ID_838061e48c,8353,E,E
1,M_ID_9339d880ad,3184,E,E
2,M_ID_e726bbae1e,447,E,E
3,M_ID_c929bb59af,9514,E,E
4,M_ID_dd3ae3de10,30534,E,E


In [13]:
trans_merch_df = transactions_df.merge(merchants_df, how='left', left_on='merchant_id', right_on='id')
trans_merch_df = trans_merch_df.drop(['id'], axis=1)
trans_merch_df.shape

(30910695, 22)

In [14]:
trans_merch_df.head(10)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,...,state_id,subsector_id,day_of_week,day_of_month,week_of_year,month,weekend,merchant_group_id,most_recent_sales_range,most_recent_purchases_range
0,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-8,-0.703331,...,16,37,6,25,25,6,1,35.0,A,A
1,True,C_ID_4e6213e9bc,88,False,0,A,367,M_ID_86ec983688,-7,-0.733128,...,16,16,5,15,28,7,1,2084.0,A,A
2,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_979ed661fc,-6,-0.720386,...,16,37,2,9,32,8,0,27369.0,C,C
3,True,C_ID_4e6213e9bc,88,False,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,...,16,34,5,2,35,9,1,24104.0,D,C
4,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-11,-0.722865,...,16,37,4,10,10,3,0,35.0,A,A
5,True,C_ID_4e6213e9bc,333,False,0,A,80,M_ID_50af771f8d,0,-0.734887,...,9,37,5,24,8,2,1,35.0,B,A
6,True,C_ID_4e6213e9bc,88,False,0,A,278,M_ID_5e8220e564,-11,-0.716855,...,16,37,1,21,12,3,0,35.0,A,A
7,True,C_ID_4e6213e9bc,3,False,0,A,80,M_ID_9d41786a50,-3,-0.657049,...,16,37,5,18,46,11,1,1348.0,C,C
8,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_979ed661fc,-8,-0.737967,...,16,37,3,1,22,6,0,27369.0,C,C
9,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_74ba14b5fc,-11,-0.715352,...,16,37,3,16,11,3,0,16731.0,D,C


In [15]:
trans_merch_df.isnull().sum()

authorized_flag                   0
card_id                           0
city_id                           0
category_1                        0
installments                      0
category_3                        0
merchant_category_id              0
merchant_id                       0
month_lag                         0
purchase_amount                   0
purchase_date                     0
category_2                        0
state_id                          0
subsector_id                      0
day_of_week                       0
day_of_month                      0
week_of_year                      0
month                             0
weekend                           0
merchant_group_id              9401
most_recent_sales_range        9401
most_recent_purchases_range    9401
dtype: int64

In [16]:
trans_merch_df.dropna(inplace=True)
trans_merch_df.shape

(30901294, 22)

In [17]:
%%time
# This function can be used to compute the mode with groupby()
# prova = trans_merch_df.groupby('card_id')['city_id'].apply(lambda x: x.mode())

Wall time: 0 ns


## Merge transactions in train and test

In [18]:
print("Number of common card_id in train_df and trans_merch_df: %d" % (len(set(trans_merch_df.card_id.unique()).intersection(set(train_df.card_id.unique())))))
print("train_df samples: %d, trans_merch_df samples: %d" % (train_df.shape[0], trans_merch_df.shape[0]))

print("Number of common card_id in test_df and trans_merch_df: %d" % (len(set(trans_merch_df.card_id.unique()).intersection(set(test_df.card_id.unique())))))
print("test_df samples: %d, trans_merch_df samples: %d" % (test_df.shape[0], trans_merch_df.shape[0]))

Number of common card_id in train_df and trans_merch_df: 201917
train_df samples: 201917, trans_merch_df samples: 30901294
Number of common card_id in test_df and trans_merch_df: 123623
test_df samples: 123623, trans_merch_df samples: 30901294


In [19]:
def transaction_merchant_card_id_group(df):
    
    new_df = pd.DataFrame()
    
    # card_id
    new_df['transactions_count'] = df.groupby("card_id")['card_id'].count()
    
    # authorized_flag
    new_df['authorized_transactions_sum'] = df.groupby("card_id")['authorized_flag'].sum()
    new_df['authorized_transactions_mean'] = df.groupby("card_id")['authorized_flag'].mean()
    # city_id
    new_df['city_id_nunique'] = df.groupby('card_id')['city_id'].nunique()
    # category_1
    new_df['category_1_sum'] = df.groupby("card_id")['category_1'].sum()
    new_df['category_1_mean'] = df.groupby("card_id")['category_1'].mean()
    # installments
    new_df['installments_nunique'] = df.groupby('card_id')['installments'].nunique()
    new_df['installments_sum'] = df.groupby('card_id')['installments'].sum()
    new_df['installments_mean'] = df.groupby('card_id')['installments'].mean()
    new_df['installments_max'] = df.groupby('card_id')['installments'].max()
    new_df['installments_min'] = df.groupby('card_id')['installments'].min()
    # category_3
    new_df['category_3_nunique'] = df.groupby('card_id')['category_3'].nunique()
    # merchant_category_id
    new_df['merchant_category_id_nunique'] = df.groupby('card_id')['merchant_category_id'].nunique()    
    # merchant_id
    new_df['merchant_id_nunique'] = df.groupby('card_id')['merchant_id'].nunique()    
    # month_lag
    new_df['month_lag_nunique'] = df.groupby('card_id')['month_lag'].nunique()
    new_df['month_lag_sum'] = df.groupby('card_id')['month_lag'].sum()
    new_df['month_lag_mean'] = df.groupby('card_id')['month_lag'].mean()
    new_df['month_lag_max'] = df.groupby('card_id')['month_lag'].max()
    new_df['month_lag_min'] = df.groupby('card_id')['month_lag'].min()
    # purchase_amount
    new_df['month_lag_nunique'] = df.groupby('card_id')['month_lag'].nunique()
    new_df['month_lag_sum'] = df.groupby('card_id')['month_lag'].sum()
    new_df['month_lag_mean'] = df.groupby('card_id')['month_lag'].mean()
    new_df['month_lag_max'] = df.groupby('card_id')['month_lag'].max()
    new_df['month_lag_min'] = df.groupby('card_id')['month_lag'].min()
    # purchase_date
    # NOTE: This must be replaced with datetime time steps
    new_df['purchase_date_nunique'] = df.groupby('card_id')['purchase_date'].nunique()
    # category_2
    new_df['category_2_nunique'] = df.groupby('card_id')['category_2'].nunique()
    # state_id
    new_df['state_id_nunique'] = df.groupby('card_id')['state_id'].nunique()
    # subsector_id
    new_df['subsector_id_nunique'] = df.groupby('card_id')['subsector_id'].nunique()
    # merchant_group_id
    new_df['merchant_group_id_nunique'] = df.groupby('card_id')['merchant_group_id'].nunique()
    # most_recent_sales_range
    new_df['most_recent_sales_range_nunique'] = df.groupby('card_id')['most_recent_sales_range'].nunique()
    # most_recent_purchases_range
    new_df['most_recent_purchases_range_nunique'] = df.groupby('card_id')['most_recent_purchases_range'].nunique()
    
    new_df['trans_card_id'] = new_df.index
    new_df = new_df.reset_index(drop=True)
    return new_df

In [20]:
grouped_df = transaction_merchant_card_id_group(trans_merch_df)
grouped_df.head()

Unnamed: 0,transactions_count,authorized_transactions_sum,authorized_transactions_mean,city_id_nunique,category_1_sum,category_1_mean,installments_nunique,installments_sum,installments_mean,installments_max,...,month_lag_max,month_lag_min,purchase_date_nunique,category_2_nunique,state_id_nunique,subsector_id_nunique,merchant_group_id_nunique,most_recent_sales_range_nunique,most_recent_purchases_range_nunique,trans_card_id
0,151,116.0,0.768212,5,28.0,0.18543,5,194,1.284768,6,...,2,-12,83,4,4,13,15,5,5,C_ID_00007093c1
1,148,145.0,0.97973,19,4.0,0.027027,9,239,1.614865,10,...,2,-5,87,3,6,19,66,5,5,C_ID_0001238066
2,67,63.0,0.940299,3,0.0,0.0,2,1,0.014925,1,...,1,-13,45,2,2,12,19,5,5,C_ID_0001506ef0
3,247,220.0,0.890688,11,2.0,0.008097,2,5,0.020243,1,...,2,-9,98,5,5,25,104,5,5,C_ID_0001793786
4,155,148.0,0.954839,10,4.0,0.025806,8,280,1.806452,10,...,2,-6,87,5,7,21,64,5,5,C_ID_000183fdda


#### Train

In [21]:
train_small_df = train_df.merge(grouped_df, how='left', left_on='card_id', right_on='trans_card_id')
train_small_df = train_small_df.drop(['trans_card_id'], axis=1)
train_small_df.shape

(201917, 32)

In [22]:
train_small_df.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,transactions_count,authorized_transactions_sum,authorized_transactions_mean,city_id_nunique,...,month_lag_mean,month_lag_max,month_lag_min,purchase_date_nunique,category_2_nunique,state_id_nunique,subsector_id_nunique,merchant_group_id_nunique,most_recent_sales_range_nunique,most_recent_purchases_range_nunique
0,2017-06-01,C_ID_92a2005557,5,2,1,-0.820283,277,265.0,0.956679,9,...,-3.530686,2,-8,170,2,3,21,88,5,5
1,2017-01-01,C_ID_3d0044924f,4,1,0,0.392913,356,345.0,0.969101,9,...,-4.921348,2,-12,207,2,3,24,107,5,5
2,2016-08-01,C_ID_d639edf6cd,2,2,0,0.688056,44,42.0,0.954545,5,...,-8.363636,2,-13,35,2,2,8,10,4,5
3,2017-09-01,C_ID_186d6a6901,4,3,0,0.142495,84,84.0,1.0,7,...,-2.452381,2,-5,54,3,5,15,43,5,5
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,-0.159749,168,163.0,0.970238,7,...,-0.678571,2,-3,104,5,7,18,80,5,5


In [23]:
train_small_df.isnull().sum()

first_active_month                     0
card_id                                0
feature_1                              0
feature_2                              0
feature_3                              0
target                                 0
transactions_count                     0
authorized_transactions_sum            0
authorized_transactions_mean           0
city_id_nunique                        0
category_1_sum                         0
category_1_mean                        0
installments_nunique                   0
installments_sum                       0
installments_mean                      0
installments_max                       0
installments_min                       0
category_3_nunique                     0
merchant_category_id_nunique           0
merchant_id_nunique                    0
month_lag_nunique                      0
month_lag_sum                          0
month_lag_mean                         0
month_lag_max                          0
month_lag_min   

#### Test

In [24]:
test_small_df = test_df.merge(grouped_df, how='left', left_on='card_id', right_on='trans_card_id')
test_small_df = test_small_df.drop(['trans_card_id'], axis=1)
test_small_df.shape

(123623, 31)

In [25]:
test_small_df.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,transactions_count,authorized_transactions_sum,authorized_transactions_mean,city_id_nunique,category_1_sum,...,month_lag_mean,month_lag_max,month_lag_min,purchase_date_nunique,category_2_nunique,state_id_nunique,subsector_id_nunique,merchant_group_id_nunique,most_recent_sales_range_nunique,most_recent_purchases_range_nunique
0,2017-04-01,C_ID_0ab67a22ab,3,3,1,71,47.0,0.661972,7,23.0,...,-3.394366,2,-8,43,2,3,13,15,5,5
1,2017-01-01,C_ID_130fd0cbdd,2,3,0,87,86.0,0.988506,4,4.0,...,-9.183908,2,-13,66,3,3,15,26,5,5
2,2017-08-01,C_ID_b709037bc5,5,1,1,15,11.0,0.733333,4,2.0,...,-1.666667,1,-6,9,3,5,7,10,4,3
3,2017-12-01,C_ID_d27d835a9f,2,1,0,36,36.0,1.0,3,1.0,...,-0.527778,2,-2,21,3,3,14,27,4,4
4,2015-12-01,C_ID_2b5e3df5c2,5,1,1,116,93.0,0.801724,5,0.0,...,-5.844828,2,-13,75,3,4,15,38,5,5


In [26]:
test_small_df.isnull().sum()

first_active_month                     0
card_id                                0
feature_1                              0
feature_2                              0
feature_3                              0
transactions_count                     0
authorized_transactions_sum            0
authorized_transactions_mean           0
city_id_nunique                        0
category_1_sum                         0
category_1_mean                        0
installments_nunique                   0
installments_sum                       0
installments_mean                      0
installments_max                       0
installments_min                       0
category_3_nunique                     0
merchant_category_id_nunique           0
merchant_id_nunique                    0
month_lag_nunique                      0
month_lag_sum                          0
month_lag_mean                         0
month_lag_max                          0
month_lag_min                          0
purchase_date_nu

### Export small dataframes

In [27]:
train_small_df.to_csv('(2)small_train.csv', index = False)
test_small_df.to_csv('(2)small_test.csv', index = False)