In [1]:
import os
import pandas as pd
import numpy as np
from collections import Counter

# Plot libraries
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

# Random seed for reproducibility
seed = 202
np.random.seed(seed)

# Ignore warnings
import warnings
warnings.simplefilter('ignore')

# Garbage collector
import gc
gc.enable()

### Import the dataframes

#### Train and test

In [2]:
# Train and test
train_df = pd.read_csv("(1)train.csv")
test_df = pd.read_csv("(1)test.csv")
train_df.shape, test_df.shape

((201917, 7), (123623, 6))

In [3]:
train_df.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,elapsed_time
0,2017-06-01,C_ID_92a2005557,5,2,1,-0.820283,245
1,2017-01-01,C_ID_3d0044924f,4,1,0,0.392913,396
2,2016-08-01,C_ID_d639edf6cd,2,2,0,0.688056,549
3,2017-09-01,C_ID_186d6a6901,4,3,0,0.142495,153
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,-0.159749,92


#### Transactions

In [4]:
transactions_df = pd.read_csv("(1)transactions.csv")
transactions_df.shape

(30910695, 28)

In [5]:
transactions_df.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,...,category_2_6.0,category_3_A,category_3_B,category_3_C,category_3_D,day_of_week,day_of_month,week_of_year,month,weekend
0,True,C_ID_4e6213e9bc,88,False,0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25,...,0,1,0,0,0,6,25,25,6,1
1,True,C_ID_4e6213e9bc,88,False,0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15,...,0,1,0,0,0,5,15,28,7,1
2,True,C_ID_4e6213e9bc,88,False,0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09,...,0,1,0,0,0,2,9,32,8,0
3,True,C_ID_4e6213e9bc,88,False,0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02,...,0,1,0,0,0,5,2,35,9,1
4,True,C_ID_4e6213e9bc,88,False,0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10,...,0,1,0,0,0,4,10,10,3,0


#### Merchants

In [13]:
merchants_df = pd.read_csv("(1)merchants.csv")
merchants_df.shape

(334620, 22)

In [14]:
merchants_df.head()

Unnamed: 0,merchant_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
0,M_ID_838061e48c,8353,792,9,-0.057471,-0.057471,False,E,E,-0.4,...,-2.25,18.666667,6,-2.32,13.916667,12,False,242,9,1.0
1,M_ID_9339d880ad,3184,840,20,-0.057471,-0.057471,False,E,E,-0.72,...,-0.74,1.291667,6,-0.57,1.6875,12,False,22,16,1.0
2,M_ID_e726bbae1e,447,690,1,-0.057471,-0.057471,False,E,E,-82.13,...,-82.13,260.0,2,-82.13,260.0,2,False,-1,5,5.0
3,M_ID_c929bb59af,9514,2,20,-0.057471,-0.057471,False,E,E,69667.0,...,69667.0,1.0,3,69667.0,1.0,3,False,-1,20,5.0
4,M_ID_dd3ae3de10,30534,278,37,-0.057471,-0.057471,False,E,E,5180.0,...,9830.0,4.166667,6,15826.0,5.6,10,True,11,23,2.0


### Transactions and Merchants dataframes merging

In [15]:
print("Number of common merchant_id in merchants_df and transactions_df: %d" % (len(set(transactions_df.merchant_id.unique()).intersection(set(merchants_df.merchant_id.unique())))))
print("merchants_df samples: %d, transactions_df samples: %d" % (merchants_df.shape[0], transactions_df.shape[0]))

Number of common merchant_id in merchants_df and transactions_df: 334620
merchants_df samples: 334620, transactions_df samples: 30910695


In [16]:
for c in merchants_df.columns:
    if c not in transactions_df.columns:
        print(c)

merchant_group_id
numerical_1
numerical_2
most_recent_sales_range
most_recent_purchases_range
avg_sales_lag3
avg_purchases_lag3
active_months_lag3
avg_sales_lag6
avg_purchases_lag6
active_months_lag6
avg_sales_lag12
avg_purchases_lag12
active_months_lag12
category_4
category_2


Actually, the only meaningfull features to be merged are merchant_group_id, most_recent_sales_range and most_recent_purchases_range

In [11]:
# here we can include more features

In [17]:
merchants_df = merchants_df[[
    "merchant_id", "merchant_group_id", "most_recent_sales_range", "most_recent_purchases_range", "avg_purchases_lag3",
    "avg_sales_lag3", "avg_purchases_lag3", "active_months_lag3", "avg_sales_lag6", "avg_purchases_lag6", 
    "active_months_lag6", "avg_sales_lag12", "avg_purchases_lag12", "active_months_lag12"
]]
merchants_df.columns = ['id', "merchant_group_id", "most_recent_sales_range", "most_recent_purchases_range", "avg_purchases_lag3",
    "avg_sales_lag3", "avg_purchases_lag3", "active_months_lag3", "avg_sales_lag6", "avg_purchases_lag6", 
    "active_months_lag6", "avg_sales_lag12", "avg_purchases_lag12", "active_months_lag12"]
merchants_df.head()

Unnamed: 0,id,merchant_group_id,most_recent_sales_range,most_recent_purchases_range,avg_purchases_lag3,avg_sales_lag3,avg_purchases_lag3.1,active_months_lag3,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12
0,M_ID_838061e48c,8353,E,E,9.666667,-0.4,9.666667,3,-2.25,18.666667,6,-2.32,13.916667,12
1,M_ID_9339d880ad,3184,E,E,1.75,-0.72,1.75,3,-0.74,1.291667,6,-0.57,1.6875,12
2,M_ID_e726bbae1e,447,E,E,260.0,-82.13,260.0,2,-82.13,260.0,2,-82.13,260.0,2
3,M_ID_c929bb59af,9514,E,E,1.0,69667.0,1.0,3,69667.0,1.0,3,69667.0,1.0,3
4,M_ID_dd3ae3de10,30534,E,E,4.0,5180.0,4.0,3,9830.0,4.166667,6,15826.0,5.6,10


In [18]:
trans_merch_df = transactions_df.merge(merchants_df, how='left', left_on='merchant_id', right_on='id')
trans_merch_df = trans_merch_df.drop(['id'], axis=1)
trans_merch_df.shape

(30910695, 41)

In [20]:
trans_merch_df.head(10)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,...,avg_purchases_lag3,avg_sales_lag3,avg_purchases_lag3.1,active_months_lag3,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12
0,True,C_ID_4e6213e9bc,88,False,0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25,...,1.082451,1.08,1.082451,3.0,1.14,1.114135,6.0,1.19,1.156844,12.0
1,True,C_ID_4e6213e9bc,88,False,0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15,...,1.052071,1.06,1.052071,3.0,1.06,1.058605,6.0,1.05,1.062087,12.0
2,True,C_ID_4e6213e9bc,88,False,0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09,...,0.974653,0.98,0.974653,3.0,0.98,0.967058,6.0,0.97,0.956668,12.0
3,True,C_ID_4e6213e9bc,88,False,0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02,...,1.053443,1.0,1.053443,3.0,0.88,0.897406,6.0,0.86,0.864394,12.0
4,True,C_ID_4e6213e9bc,88,False,0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10,...,1.082451,1.08,1.082451,3.0,1.14,1.114135,6.0,1.19,1.156844,12.0
5,True,C_ID_4e6213e9bc,333,False,0,80,M_ID_50af771f8d,0,-0.734887,2018-02-24,...,1.019285,0.98,1.019285,3.0,0.94,0.961773,6.0,0.97,0.982693,12.0
6,True,C_ID_4e6213e9bc,88,False,0,278,M_ID_5e8220e564,-11,-0.716855,2017-03-21,...,1.108094,1.11,1.108094,3.0,1.16,1.156869,6.0,1.22,1.208972,12.0
7,True,C_ID_4e6213e9bc,3,False,0,80,M_ID_9d41786a50,-3,-0.657049,2017-11-18,...,1.075861,1.06,1.075861,3.0,1.08,1.098235,6.0,1.1,1.10459,12.0
8,True,C_ID_4e6213e9bc,88,False,0,80,M_ID_979ed661fc,-8,-0.737967,2017-06-01,...,0.974653,0.98,0.974653,3.0,0.98,0.967058,6.0,0.97,0.956668,12.0
9,True,C_ID_4e6213e9bc,88,False,0,80,M_ID_74ba14b5fc,-11,-0.715352,2017-03-16,...,0.975325,0.96,0.975325,3.0,1.02,0.947403,6.0,1.09,0.977381,12.0


In [24]:
trans_merch_df.isnull().sum()

authorized_flag                   0
card_id                           0
city_id                           0
category_1                        0
installments                      0
merchant_category_id              0
merchant_id                       0
month_lag                         0
purchase_amount                   0
purchase_date                     0
state_id                          0
subsector_id                      0
month_diff                        0
category_2_1.0                    0
category_2_2.0                    0
category_2_3.0                    0
category_2_4.0                    0
category_2_5.0                    0
category_2_6.0                    0
category_3_A                      0
category_3_B                      0
category_3_C                      0
category_3_D                      0
day_of_week                       0
day_of_month                      0
week_of_year                      0
month                             0
weekend                     

In [25]:
trans_merch_df.dropna(inplace=True)
trans_merch_df.shape

(30901294, 41)

In [17]:
%%time
# This function can be used to compute the mode with groupby()
# prova = trans_merch_df.groupby('card_id')['city_id'].apply(lambda x: x.mode())

Wall time: 0 ns


## Merge transactions in train and test

In [18]:
print("Number of common card_id in train_df and trans_merch_df: %d" % (len(set(trans_merch_df.card_id.unique()).intersection(set(train_df.card_id.unique())))))
print("train_df samples: %d, trans_merch_df samples: %d" % (train_df.shape[0], trans_merch_df.shape[0]))

print("Number of common card_id in test_df and trans_merch_df: %d" % (len(set(trans_merch_df.card_id.unique()).intersection(set(test_df.card_id.unique())))))
print("test_df samples: %d, trans_merch_df samples: %d" % (test_df.shape[0], trans_merch_df.shape[0]))

Number of common card_id in train_df and trans_merch_df: 201917
train_df samples: 201917, trans_merch_df samples: 30901294
Number of common card_id in test_df and trans_merch_df: 123623
test_df samples: 123623, trans_merch_df samples: 30901294


In [40]:
def binary_aggregation(df, new_df, columns):
    for col in columns:
        new_df[col + '_sum'] = df.groupby("card_id")[col].sum()
        new_df[col + '_mean'] = df.groupby("card_id")[col].mean()
    return new_df

def categorical_aggregation(df, new_df, columns):
    for col in columns:
        new_df[col + '_nunique'] = df.groupby('card_id')[col].nunique()
    return new_df

def numerical_aggregation(df, new_df, columns):
    for col in columns:#The problem is here <--
        new_df[col + '_sum'] = df.groupby('card_id')[col].sum()
        new_df[col + '_mean'] = df.groupby('card_id')[col].mean()
        new_df[col + '_max'] = df.groupby('card_id')[col].max()
        new_df[col + '_min'] = df.groupby('card_id')[col].min()
    return new_df

In [41]:
def aggregate_transaction_merchant_per_card_id(df):
    
    new_df = pd.DataFrame()
    
    # card_id
    new_df['transactions_count'] = df.groupby("card_id")['card_id'].count()
 
    binary_features = [
        "authorized_flag", "category_1", "category_2_1.0", "category_2_2.0", "category_2_3.0", "category_2_4.0", 
        "category_2_5.0", "category_2_6.0", "category_3_A", "category_3_B", "category_3_C", "category_3_D", 
        "weekend"]

    categorical_features = [
        "merchant_category_id", "merchant_id", "purchase_date", "state_id", "subsector_id", 
        "day_of_week", "day_of_month", "week_of_year", "month", "merchant_group_id", "most_recent_sales_range", 
        "most_recent_purchases_range"]

    numerical_features = [
        "installments", "month_lag", "purchase_amount", "month_diff", "avg_purchases_lag3", 
        "avg_sales_lag3", "avg_purchases_lag3", "active_months_lag3", "avg_sales_lag6", "avg_purchases_lag6", 
        "active_months_lag6", "avg_sales_lag12", "avg_purchases_lag12", "active_months_lag12"]
    
    new_df = binary_aggregation(df, new_df, binary_features)
    new_df = categorical_aggregation(df, new_df, categorical_features)
    new_df = numerical_aggregation(df, new_df, numerical_features)

    new_df['trans_card_id'] = new_df.index
    new_df = new_df.reset_index(drop=True)
    return new_df

In [42]:
grouped_df = aggregate_transaction_merchant_per_card_id(trans_merch_df)
grouped_df.head()

ValueError: all keys need to be the same shape

In [19]:
def transaction_merchant_card_id_group(df):
    
    new_df = pd.DataFrame()
    
    # card_id
    new_df['transactions_count'] = df.groupby("card_id")['card_id'].count()
    
    # authorized_flag
    new_df['authorized_transactions_sum'] = df.groupby("card_id")['authorized_flag'].sum()
    new_df['authorized_transactions_mean'] = df.groupby("card_id")['authorized_flag'].mean()
    # city_id
    new_df['city_id_nunique'] = df.groupby('card_id')['city_id'].nunique()
    # category_1
    new_df['category_1_sum'] = df.groupby("card_id")['category_1'].sum()
    new_df['category_1_mean'] = df.groupby("card_id")['category_1'].mean()
    # installments
    new_df['installments_nunique'] = df.groupby('card_id')['installments'].nunique()
    new_df['installments_sum'] = df.groupby('card_id')['installments'].sum()
    new_df['installments_mean'] = df.groupby('card_id')['installments'].mean()
    new_df['installments_max'] = df.groupby('card_id')['installments'].max()
    new_df['installments_min'] = df.groupby('card_id')['installments'].min()
    # category_3
    new_df['category_3_nunique'] = df.groupby('card_id')['category_3'].nunique()
    # merchant_category_id
    new_df['merchant_category_id_nunique'] = df.groupby('card_id')['merchant_category_id'].nunique()    
    # merchant_id
    new_df['merchant_id_nunique'] = df.groupby('card_id')['merchant_id'].nunique()    
    # month_lag
    new_df['month_lag_nunique'] = df.groupby('card_id')['month_lag'].nunique()
    new_df['month_lag_sum'] = df.groupby('card_id')['month_lag'].sum()
    new_df['month_lag_mean'] = df.groupby('card_id')['month_lag'].mean()
    new_df['month_lag_max'] = df.groupby('card_id')['month_lag'].max()
    new_df['month_lag_min'] = df.groupby('card_id')['month_lag'].min()
    # purchase_amount
    new_df['month_lag_nunique'] = df.groupby('card_id')['month_lag'].nunique()
    new_df['month_lag_sum'] = df.groupby('card_id')['month_lag'].sum()
    new_df['month_lag_mean'] = df.groupby('card_id')['month_lag'].mean()
    new_df['month_lag_max'] = df.groupby('card_id')['month_lag'].max()
    new_df['month_lag_min'] = df.groupby('card_id')['month_lag'].min()
    # purchase_date
    new_df['purchase_date_nunique'] = df.groupby('card_id')['purchase_date'].nunique()
    # category_2
    new_df['category_2_nunique'] = df.groupby('card_id')['category_2'].nunique()
    # state_id
    new_df['state_id_nunique'] = df.groupby('card_id')['state_id'].nunique()
    # subsector_id
    new_df['subsector_id_nunique'] = df.groupby('card_id')['subsector_id'].nunique()
    # merchant_group_id
    new_df['merchant_group_id_nunique'] = df.groupby('card_id')['merchant_group_id'].nunique()
    # month_diff
    new_df['month_diff_nunique'] = df.groupby('card_id')['month_diff'].nunique()
    new_df['month_diff_sum'] = df.groupby('card_id')['month_diff'].sum()
    new_df['month_diff_mean'] = df.groupby('card_id')['month_diff'].mean()
    new_df['month_diff_max'] = df.groupby('card_id')['month_diff'].max()
    new_df['month_diff_min'] = df.groupby('card_id')['month_diff'].min()
    # category_2_1.0
    new_df['category_2_1.0_sum'] = df.groupby("card_id")['category_2_1.0'].sum()
    new_df['category_2_1.0_mean'] = df.groupby("card_id")['category_2_1.0'].mean()
    # category_2_2.0
    new_df['category_2_2.0_sum'] = df.groupby("card_id")['category_2_2.0'].sum()
    new_df['category_2_2.0_mean'] = df.groupby("card_id")['category_2_2.0'].mean()
    # category_2_3.0
    new_df['authorized_transactions_sum'] = df.groupby("card_id")['authorized_flag'].sum()
    new_df['authorized_transactions_mean'] = df.groupby("card_id")['authorized_flag'].mean()
    # category_2_4.0
    new_df['authorized_transactions_sum'] = df.groupby("card_id")['authorized_flag'].sum()
    new_df['authorized_transactions_mean'] = df.groupby("card_id")['authorized_flag'].mean()
    # category_2_5.0
    new_df['authorized_transactions_sum'] = df.groupby("card_id")['authorized_flag'].sum()
    new_df['authorized_transactions_mean'] = df.groupby("card_id")['authorized_flag'].mean()
    # category_2_6.0
    new_df['authorized_transactions_sum'] = df.groupby("card_id")['authorized_flag'].sum()
    new_df['authorized_transactions_mean'] = df.groupby("card_id")['authorized_flag'].mean()
    # category_3_A
    new_df['authorized_transactions_sum'] = df.groupby("card_id")['authorized_flag'].sum()
    new_df['authorized_transactions_mean'] = df.groupby("card_id")['authorized_flag'].mean()
    # category_3_B
    new_df['authorized_transactions_sum'] = df.groupby("card_id")['authorized_flag'].sum()
    new_df['authorized_transactions_mean'] = df.groupby("card_id")['authorized_flag'].mean()
    # category_3_C
    new_df['authorized_transactions_sum'] = df.groupby("card_id")['authorized_flag'].sum()
    new_df['authorized_transactions_mean'] = df.groupby("card_id")['authorized_flag'].mean()
    # category_3_D
    new_df['authorized_transactions_sum'] = df.groupby("card_id")['authorized_flag'].sum()
    new_df['authorized_transactions_mean'] = df.groupby("card_id")['authorized_flag'].mean()
    # day_of_week
    
    # day_of_month
    
    # week_of_year
    
    # month
    
    # weekend
    
    # merchant_group_id
    
    # most_recent_sales_range
    new_df['most_recent_sales_range_nunique'] = df.groupby('card_id')['most_recent_sales_range'].nunique()
    # most_recent_purchases_range
    new_df['most_recent_purchases_range_nunique'] = df.groupby('card_id')['most_recent_purchases_range'].nunique()
    
    # avg_purchases_lag3
    
    # avg_sales_lag3
    
    # avg_purchases_lag3
    
    # active_months_lag3
    
    # avg_sales_lag6
    
    # avg_purchases_lag6
    
    # active_months_lag6
    
    # avg_sales_lag12
    
    # avg_purchases_lag12
    
    # active_months_lag12
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    new_df['trans_card_id'] = new_df.index
    new_df = new_df.reset_index(drop=True)
    return new_df

In [20]:
grouped_df = transaction_merchant_card_id_group(trans_merch_df)
grouped_df.head()

Unnamed: 0,transactions_count,authorized_transactions_sum,authorized_transactions_mean,city_id_nunique,category_1_sum,category_1_mean,installments_nunique,installments_sum,installments_mean,installments_max,...,month_lag_max,month_lag_min,purchase_date_nunique,category_2_nunique,state_id_nunique,subsector_id_nunique,merchant_group_id_nunique,most_recent_sales_range_nunique,most_recent_purchases_range_nunique,trans_card_id
0,151,116.0,0.768212,5,28.0,0.18543,5,194,1.284768,6,...,2,-12,83,4,4,13,15,5,5,C_ID_00007093c1
1,148,145.0,0.97973,19,4.0,0.027027,9,239,1.614865,10,...,2,-5,87,3,6,19,66,5,5,C_ID_0001238066
2,67,63.0,0.940299,3,0.0,0.0,2,1,0.014925,1,...,1,-13,45,2,2,12,19,5,5,C_ID_0001506ef0
3,247,220.0,0.890688,11,2.0,0.008097,2,5,0.020243,1,...,2,-9,98,5,5,25,104,5,5,C_ID_0001793786
4,155,148.0,0.954839,10,4.0,0.025806,8,280,1.806452,10,...,2,-6,87,5,7,21,64,5,5,C_ID_000183fdda


#### Train

In [None]:
train_small_df = train_df.merge(grouped_df, how='left', left_on='card_id', right_on='trans_card_id')
train_small_df = train_small_df.drop(['trans_card_id'], axis=1)
train_small_df.shape

In [None]:
train_small_df.head()

In [None]:
train_small_df.isnull().sum()

#### Test

In [None]:
test_small_df = test_df.merge(grouped_df, how='left', left_on='card_id', right_on='trans_card_id')
test_small_df = test_small_df.drop(['trans_card_id'], axis=1)
test_small_df.shape

In [None]:
test_small_df.head()

In [None]:
test_small_df.isnull().sum()

### Export small dataframes

In [27]:
train_small_df.to_csv('(2)small_train.csv', index = False)
test_small_df.to_csv('(2)small_test.csv', index = False)