In [1]:
import os
import pandas as pd
import numpy as np
from collections import Counter

# Plot libraries
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

# Random seed for reproducibility
seed = 202
np.random.seed(seed)

# Ignore warnings
import warnings
warnings.simplefilter('ignore')

# Garbage collector
import gc
gc.enable()

### Import the dataframes

#### Train and test

In [2]:
# Train and test
train_df = pd.read_csv("(1)train.csv")
test_df = pd.read_csv("(1)test.csv")
train_df.shape, test_df.shape

((201917, 6), (123623, 5))

In [3]:
train_df.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06-01,C_ID_92a2005557,5,2,1,-0.820283
1,2017-01-01,C_ID_3d0044924f,4,1,0,0.392913
2,2016-08-01,C_ID_d639edf6cd,2,2,0,0.688056
3,2017-09-01,C_ID_186d6a6901,4,3,0,0.142495
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,-0.159749


#### Transactions

In [None]:
transactions_df = pd.read_csv("(1)transactions.csv")
transactions_df.shape

In [None]:
transactions_df.head()

#### Merchants

In [None]:
merchants_df = pd.read_csv("(1)merchants.csv")
merchants_df.shape

In [None]:
merchants_df.head()

In [8]:
"""#substitute inf values with 2*max_value_in_column
for i in ['3', '6', '12']:
    max_val = max(merchants_df[merchants_df != np.inf]['avg_purchases_lag' + i])
    merchants_df['avg_purchases_lag' + i].replace([np.inf, -np.inf], 2*max_val)"""

"#substitute inf values with 2*max_value_in_column\nfor i in ['3', '6', '12']:\n    max_val = max(merchants_df[merchants_df != np.inf]['avg_purchases_lag' + i])\n    merchants_df['avg_purchases_lag' + i].replace([np.inf, -np.inf], 2*max_val)"

### Transactions and Merchants dataframes merging

In [None]:
print("Number of common merchant_id in merchants_df and transactions_df: %d" % (len(set(transactions_df.merchant_id.unique()).intersection(set(merchants_df.merchant_id.unique())))))
print("merchants_df samples: %d, transactions_df samples: %d" % (merchants_df.shape[0], transactions_df.shape[0]))

In [None]:
for c in merchants_df.columns:
    if c not in transactions_df.columns:
        print(c)

Actually, the only meaningfull features to be merged are merchant_group_id, most_recent_sales_range and most_recent_purchases_range

In [11]:
# here we can include more features

In [None]:
merchants_df = merchants_df[['merchant_id', "merchant_group_id", "most_recent_sales_range", "most_recent_purchases_range"]]
merchants_df.columns = ['id', "merchant_group_id", "most_recent_sales_range", "most_recent_purchases_range"]
merchants_df.head()

In [None]:
trans_merch_df = transactions_df.merge(merchants_df, how='left', left_on='merchant_id', right_on='id')
trans_merch_df = trans_merch_df.drop(['id'], axis=1)
trans_merch_df.shape

In [24]:
trans_merch_df.head(10)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,merchant_group_id,most_recent_sales_range,most_recent_purchases_range
0,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25,1.0,16,37,35.0,A,A
1,True,C_ID_4e6213e9bc,88,False,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15,1.0,16,16,2084.0,A,A
2,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09,1.0,16,37,27369.0,C,C
3,True,C_ID_4e6213e9bc,88,False,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02,1.0,16,34,24104.0,D,C
4,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10,1.0,16,37,35.0,A,A
5,True,C_ID_4e6213e9bc,333,False,0,A,80,M_ID_50af771f8d,0,-0.734887,2018-02-24,1.0,9,37,35.0,B,A
6,True,C_ID_4e6213e9bc,88,False,0,A,278,M_ID_5e8220e564,-11,-0.716855,2017-03-21,1.0,16,37,35.0,A,A
7,True,C_ID_4e6213e9bc,3,False,0,A,80,M_ID_9d41786a50,-3,-0.657049,2017-11-18,1.0,16,37,1348.0,C,C
8,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_979ed661fc,-8,-0.737967,2017-06-01,1.0,16,37,27369.0,C,C
9,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_74ba14b5fc,-11,-0.715352,2017-03-16,1.0,16,37,16731.0,D,C


In [15]:
trans_merch_df.isnull().sum()

authorized_flag                   0
card_id                           0
city_id                           0
category_1                        0
installments                      0
category_3                        0
merchant_category_id              0
merchant_id                       0
month_lag                         0
purchase_amount                   0
purchase_date                     0
category_2                        0
state_id                          0
subsector_id                      0
merchant_group_id              9401
most_recent_sales_range        9401
most_recent_purchases_range    9401
dtype: int64

In [16]:
trans_merch_df.dropna(inplace=True)
trans_merch_df.shape

(30901294, 17)

In [25]:
%%time
# This function can be used to compute the mode with groupby()
trans_merch_df.groupby('card_id')['city_id'].apply(lambda x: x.mode())

Wall time: 1min 5s


card_id           
C_ID_00007093c1  0    244
C_ID_0001238066  0    314
C_ID_0001506ef0  0    137
C_ID_0001793786  0    179
C_ID_000183fdda  0    161
C_ID_00024e244b  0    156
C_ID_0002709b5a  0    103
C_ID_00027503e2  0    146
C_ID_000298032a  0    233
C_ID_0002ba3c2e  0    344
C_ID_0002c7c2c1  0     69
C_ID_00032df08f  0     69
C_ID_0003754056  0    303
C_ID_000377f6a0  0     76
C_ID_0003be3c83  0    279
C_ID_0003f41435  0      8
C_ID_00042d509c  0     -1
C_ID_0004587331  0     87
C_ID_0004725b87  0    179
C_ID_0004888ddd  0    286
C_ID_0004b68c49  0     69
C_ID_0004c2a5ab  0     -1
C_ID_00057b99fe  0    209
C_ID_000599daf9  0    246
C_ID_0005b2f279  0     79
C_ID_0005b5804f  0     20
C_ID_0005f16cc8  0    244
C_ID_0006152db8  0    117
C_ID_000616f4a8  0    168
C_ID_000664aa02  0    314
                     ... 
C_ID_fffb79fb56  0     21
C_ID_fffb9ea3f6  0     76
C_ID_fffba72dc5  0     19
C_ID_fffbdf036b  0     69
C_ID_fffbee5c24  0      4
C_ID_fffc96bf24  0     17
C_ID_fffcb74f49  0 

In [18]:
temp = trans_merch_df.groupby("card_id")
gdf = transactions_df.groupby("card_id")
temp

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x000002E0EA973CC0>

In [76]:
trans_merch_df.groupby("card_id")

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x00000267F8FA0F60>

In [73]:
prova

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x00000267F8FA0C18>

## Merge transactions in train and test

In [None]:
def transaction_merchant_card_id_group(df):
    
    # card_id
    df['transactions_count'] = df.groupby("card_id")['card_id'].count()
    
    # authorized_flag
    df['authorized_transactions_sum'] = df.groupby("card_id")['authorized_flag'].sum()
    df['authorized_transactions_mean'] = df.groupby("card_id")['authorized_flag'].mean()
    # city_id
    df['city_id_nunique'] = df.groupby('card_id')['city_id'].nunique()
    # category_1
    df['category_1_sum'] = df.groupby("card_id")['category_1'].sum()
    df['category_1_mean'] = df.groupby("card_id")['category_1'].mean()
    # installments
    df['installments_nunique'] = df.groupby('card_id')['installments'].nunique()
    df['installments_sum'] = df.groupby('card_id')['installments'].sum()
    df['installments_mean'] = df.groupby('card_id')['installments'].mean()
    df['installments_max'] = df.groupby('card_id')['installments'].max()
    df['installments_min'] = df.groupby('card_id')['installments'].min()
    # category_3
    df['category_3_nunique'] = df.groupby('card_id')['category_3'].nunique()
    # merchant_category_id
    df['merchant_category_id_nunique'] = df.groupby('card_id')['merchant_category_id'].nunique()    
    # merchant_id
    df['merchant_id_nunique'] = df.groupby('card_id')['merchant_id'].nunique()    
    # month_lag
    df['month_lag_nunique'] = df.groupby('card_id')['month_lag'].nunique()
    df['month_lag_sum'] = df.groupby('card_id')['month_lag'].sum()
    df['month_lag_mean'] = df.groupby('card_id')['month_lag'].mean()
    df['month_lag_max'] = df.groupby('card_id')['month_lag'].max()
    df['month_lag_min'] = df.groupby('card_id')['month_lag'].min()
    # purchase_amount
    df['month_lag_nunique'] = df.groupby('card_id')['month_lag'].nunique()
    df['month_lag_sum'] = df.groupby('card_id')['month_lag'].sum()
    df['month_lag_mean'] = df.groupby('card_id')['month_lag'].mean()
    df['month_lag_max'] = df.groupby('card_id')['month_lag'].max()
    df['month_lag_min'] = df.groupby('card_id')['month_lag'].min()
    # purchase_date
    # NOTE: This must be replaced with datetime time steps
    df['purchase_date_nunique'] = df.groupby('card_id')['purchase_date'].nunique()
    # category_2
    df['category_2_nunique'] = df.groupby('card_id')['category_2'].nunique()
    # state_id
    df['state_id_nunique'] = df.groupby('card_id')['state_id'].nunique()
    # subsector_id
    df['subsector_id_nunique'] = df.groupby('card_id')['subsector_id'].nunique()
    # merchant_group_id
    df['merchant_group_id_nunique'] = df.groupby('card_id')['merchant_group_id'].nunique()
    # most_recent_sales_range
    df['most_recent_sales_range_nunique'] = df.groupby('card_id')['most_recent_sales_range'].nunique()
    # most_recent_purchases_range
    df['most_recent_purchases_range_nunique'] = df.groupby('card_id')['most_recent_purchases_range'].nunique()
    
    return df

In [19]:
gdf = transactions_df.groupby("card_id")
gdf.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25,1.0,16,37
1,True,C_ID_4e6213e9bc,88,False,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15,1.0,16,16
2,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09,1.0,16,37
3,True,C_ID_4e6213e9bc,88,False,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02,1.0,16,34
4,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10,1.0,16,37
394,False,C_ID_5037ff576e,322,False,1,B,278,M_ID_b61c7d1be0,-3,-0.595260,2017-09-07,3.0,11,37
395,True,C_ID_5037ff576e,138,False,1,B,307,M_ID_fe69229f24,-4,1.189469,2017-08-14,1.0,15,19
396,True,C_ID_5037ff576e,138,False,1,B,705,M_ID_efc106141c,-9,-0.640069,2017-03-05,1.0,15,33
397,True,C_ID_5037ff576e,226,False,1,B,307,M_ID_708022307c,-4,-0.652256,2017-08-15,1.0,16,19
398,True,C_ID_5037ff576e,330,False,1,B,705,M_ID_393b4b8cec,-9,-0.674210,2017-03-26,3.0,17,33


In [None]:
aggs = {}
for col in ['month','hour','weekofyear','dayofweek','year','subsector_id','merchant_id','merchant_category_id']:
    aggs[col] = ['nunique']
aggs['purchase_amount'] = ['sum','max','min','mean','var']
aggs['installments'] = ['sum','max','min','mean','var']
aggs['purchase_date'] = ['max','min']
aggs['month_lag'] = ['max','min','mean','var']
aggs['month_diff'] = ['mean']
aggs['weekend'] = ['sum', 'mean']
aggs['category_1'] = ['sum', 'mean']
aggs['card_id'] = ['size']

for col in ['category_2','category_3']:
    new_merchant_trans[col+'_mean'] = new_merchant_trans.groupby([col])['purchase_amount'].transform('mean')
    aggs[col+'_mean'] = ['mean']
    
new_columns = get_new_columns('new_hist',aggs)
hist_trans_group = new_merchant_trans.groupby('card_id').agg(aggs)
hist_trans_group.columns = new_columns
hist_trans_group.reset_index(drop=False,inplace=True)
hist_trans_group['new_hist_purchase_date_diff'] = (hist_trans_group['new_hist_purchase_date_max'] - hist_trans_group['new_hist_purchase_date_min']).dt.days
hist_trans_group['new_hist_purchase_date_average'] = hist_trans_group['new_hist_purchase_date_diff']/hist_trans_group['new_hist_card_id_size']
hist_trans_group['new_hist_purchase_date_uptonow'] = (datetime.datetime.today() - hist_trans_group['new_hist_purchase_date_max']).dt.days
train = train.merge(hist_trans_group,on='card_id',how='left')
test = test.merge(hist_trans_group,on='card_id',how='left')

In [16]:
gdf = transactions_df.groupby("card_id")
gdf.head()
#gdf = gdf["purchase_amount"].size().reset_index()
#gdf.columns = ["card_id", "num_hist_transactions"]

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25,1.0,16,37
1,True,C_ID_4e6213e9bc,88,False,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15,1.0,16,16
2,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09,1.0,16,37
3,True,C_ID_4e6213e9bc,88,False,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02,1.0,16,34
4,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10,1.0,16,37
394,False,C_ID_5037ff576e,322,False,1,B,278,M_ID_b61c7d1be0,-3,-0.595260,2017-09-07,3.0,11,37
395,True,C_ID_5037ff576e,138,False,1,B,307,M_ID_fe69229f24,-4,1.189469,2017-08-14,1.0,15,19
396,True,C_ID_5037ff576e,138,False,1,B,705,M_ID_efc106141c,-9,-0.640069,2017-03-05,1.0,15,33
397,True,C_ID_5037ff576e,226,False,1,B,307,M_ID_708022307c,-4,-0.652256,2017-08-15,1.0,16,19
398,True,C_ID_5037ff576e,330,False,1,B,705,M_ID_393b4b8cec,-9,-0.674210,2017-03-26,3.0,17,33


There are a set of duplicated columns (all the couples of the form column_name_x, column_name_y). We can observe that these couples of columns can be merged to obtain a single column with the maximum amount of information. We try to do so for each couple of columns.

In [None]:
#city_id
#transactions_df['city_id'] = transactions_df['city_id_x']
#transactions_df.drop(labels = ['city_id_x', 'city_id_y'], axis = 1, inplace = True)
#transactions_df.head()

In [None]:
#category_1

In [None]:
#merchant_category_id
#transactions_df['merchant_category_id_x'] == transactions_df['merchant_category_id_y']

In [None]:
#category_2

In [None]:
#state_id

In [None]:
#subsector_id

In [23]:
cat_features = ['authorized_flag','city_id', 'category_1','category_3', 'merchant_category_id', 'merchant_id',
                'category_2', 'state_id', 'subsector_id', 'merchant_group_id', 'most_recent_sales_range', 'most_recent_purchases_range', 'category_4']
num_features = ['numerical_1', 'numerical_2','installments', 'month_lag', 'purchase_amount', 'avg_sales_lag3', 'avg_purchases_lag3', 'active_months_lag3',
       'avg_sales_lag6', 'avg_purchases_lag6', 'active_months_lag6', 'avg_sales_lag12', 'avg_purchases_lag12', 'active_months_lag12']

In [None]:
# OLD
# replace mean of num_features and mode of cat_features
def replace_mean_mode(df, num_features, cat_features):
    new_df = pd.DataFrame(columns = df.columns)
    card_ids_unique = np.unique(df['card_id'])
    
    for card_id in card_ids_unique:
        new_line_num = df.loc[df['card_id'] == card_id][num_features].mean()
        new_line_cat = df.loc[df['card_id'] == card_id][cat_features].mode()
        new_df = new_df.append(pd.concat([new_line_num, new_line_cat], axis=1, join='inner'))
        
    new_df['card_id'] = card_ids_unique
    return new_df

In [None]:
transactions_df.columns

In [None]:
#new_transactions_df = replace_mean_mode(transactions_df, num_features, cat_features)

In [2]:
new_transactions_df = pd.DataFrame(columns = transactions_df.columns)
cards_ids_unique = np.unique(gdf['card_id'])

for name, group in df:
    new_line_num = group[num_features].mean()
    new_line_cat = group[cat_features].mode()
    new_transactions_df = new_transactions_df.append(pd.concat([new_line_num, new_line_cat], axis=1, join='inner'))
    
new_transactions_df['card_id'] = card_ids_unique

NameError: name 'pd' is not defined

In [1]:
new_transactions_df.head()

NameError: name 'new_transactions_df' is not defined

#### Train

In [None]:
new_train_df = pd.merge(train_df, new_transactions_df, on="card_id", how="left")


In [None]:
new_train_df.head()

In [None]:
new_train_df.shape

In [None]:
new_train_df.isnull().sum()

In [None]:
new_train_df.drop_duplicates(inplace = True)

In [None]:
new_train_df.isnull().sum()

In [None]:
new_train_df.to_csv('(2)small_train.csv', index = False)

#### Test

In [None]:
new_test_df = pd.join(test, new_transactions_df, on="card_id", how="left")

In [None]:
new_test_df.head()

In [None]:
new_test_df.to_csv('(2)small_test.csv', index = False)