In [3]:
import os
import pandas as pd
import numpy as np
from collections import Counter

# Plot libraries
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

# Random seed for reproducibility
seed = 202
np.random.seed(seed)

# Ignore warnings
import warnings
warnings.simplefilter('ignore')

# Garbage collector
import gc
gc.enable()

### Import the dataframes

#### Train and test

In [4]:
# Train and test
train_df = pd.read_csv("(1)train.csv")
test_df = pd.read_csv("(1)test.csv")
train_df.shape, test_df.shape

((201917, 6), (123623, 5))

In [5]:
train_df.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06-01,C_ID_92a2005557,5,2,1,-0.820283
1,2017-01-01,C_ID_3d0044924f,4,1,0,0.392913
2,2016-08-01,C_ID_d639edf6cd,2,2,0,0.688056
3,2017-09-01,C_ID_186d6a6901,4,3,0,0.142495
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,-0.159749


#### Transactions

In [6]:
transactions_df = pd.read_csv("(1)transactions.csv")
transactions_df.shape

(30910695, 14)

In [7]:
transactions_df.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25,1.0,16,37
1,True,C_ID_4e6213e9bc,88,False,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15,1.0,16,16
2,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09,1.0,16,37
3,True,C_ID_4e6213e9bc,88,False,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02,1.0,16,34
4,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10,1.0,16,37


#### Merchants

In [8]:
merchants_df = pd.read_csv("(1)merchants.csv")
merchants_df.shape

(334620, 22)

In [9]:
merchants_df.head()

Unnamed: 0,merchant_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
0,M_ID_838061e48c,8353,792,9,-0.057471,-0.057471,False,E,E,-0.4,...,-2.25,18.666667,6,-2.32,13.916667,12,False,242,9,1.0
1,M_ID_9339d880ad,3184,840,20,-0.057471,-0.057471,False,E,E,-0.72,...,-0.74,1.291667,6,-0.57,1.6875,12,False,22,16,1.0
2,M_ID_e726bbae1e,447,690,1,-0.057471,-0.057471,False,E,E,-82.13,...,-82.13,260.0,2,-82.13,260.0,2,False,-1,5,5.0
3,M_ID_c929bb59af,9514,2,20,-0.057471,-0.057471,False,E,E,69667.0,...,69667.0,1.0,3,69667.0,1.0,3,False,-1,20,5.0
4,M_ID_dd3ae3de10,30534,278,37,-0.057471,-0.057471,False,E,E,5180.0,...,9830.0,4.166667,6,15826.0,5.6,10,True,11,23,2.0


In [10]:
#substitute inf values with 2*max_value_in_column
for i in ['3', '6', '12']:
    max_val = max(merchants_df[merchants_df != np.inf]['avg_purchases_lag' + i])
    merchants_df['avg_purchases_lag' + i].replace([np.inf, -np.inf], 2*max_val)

### Transactions and Merchants dataframes merging

In [11]:
print("Number of common merchant_id in merchants_df and transactions_df: %d" % (len(set(transactions_df.merchant_id.unique()).intersection(set(merchants_df.merchant_id.unique())))))
print("merchants_df samples: %d, transactions_df samples: %d" % (merchants_df.shape[0], transactions_df.shape[0]))

Number of common merchant_id in merchants_df and transactions_df: 334620
merchants_df samples: 334620, transactions_df samples: 30910695


In [12]:
for c in merchants_df.columns:
    if c not in transactions_df.columns:
        print(c)

merchant_group_id
numerical_1
numerical_2
most_recent_sales_range
most_recent_purchases_range
avg_sales_lag3
avg_purchases_lag3
active_months_lag3
avg_sales_lag6
avg_purchases_lag6
active_months_lag6
avg_sales_lag12
avg_purchases_lag12
active_months_lag12
category_4


Actually, the only meaningfull features to be merged are merchant_group_id, most_recent_sales_range and most_recent_purchases_range

merchants_df = merchants_df[['merchant_id', "merchant_group_id", "most_recent_sales_range", "most_recent_purchases_range"]]
merchants_df.columns = ['id', "merchant_group_id", "most_recent_sales_range", "most_recent_purchases_range"]
merchants_df.head()

In [13]:
transactions_df = transactions_df.merge(merchants_df, how='left', left_on=['merchant_id', 'city_id', 'category_1', 'category_2', 'merchant_category_id', 'state_id', 'subsector_id'], right_on=['merchant_id', 'city_id', 'category_1', 'category_2', 'merchant_category_id', 'state_id', 'subsector_id'])

In [14]:
len(transactions_df)

30910695

In [15]:
transactions_df.columns

Index(['authorized_flag', 'card_id', 'city_id', 'category_1', 'installments',
       'category_3', 'merchant_category_id', 'merchant_id', 'month_lag',
       'purchase_amount', 'purchase_date', 'category_2', 'state_id',
       'subsector_id', 'merchant_group_id', 'numerical_1', 'numerical_2',
       'most_recent_sales_range', 'most_recent_purchases_range',
       'avg_sales_lag3', 'avg_purchases_lag3', 'active_months_lag3',
       'avg_sales_lag6', 'avg_purchases_lag6', 'active_months_lag6',
       'avg_sales_lag12', 'avg_purchases_lag12', 'active_months_lag12',
       'category_4'],
      dtype='object')

## Merge transactions in train and test

In [22]:
gdf.head()
#gdf = gdf["purchase_amount"].size().reset_index()
#gdf.columns = ["card_id", "num_hist_transactions"]

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,...,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4
0,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-8,-0.703331,...,1.08,1.082451,3.0,1.14,1.114135,6.0,1.19,1.156844,12.0,True
1,True,C_ID_4e6213e9bc,88,False,0,A,367,M_ID_86ec983688,-7,-0.733128,...,1.06,1.052071,3.0,1.06,1.058605,6.0,1.05,1.062087,12.0,True
2,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_979ed661fc,-6,-0.720386,...,0.98,0.974653,3.0,0.98,0.967058,6.0,0.97,0.956668,12.0,True
3,True,C_ID_4e6213e9bc,88,False,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,...,1.00,1.053443,3.0,0.88,0.897406,6.0,0.86,0.864394,12.0,True
4,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-11,-0.722865,...,1.08,1.082451,3.0,1.14,1.114135,6.0,1.19,1.156844,12.0,True
394,False,C_ID_5037ff576e,322,False,1,B,278,M_ID_b61c7d1be0,-3,-0.595260,...,,,,,,,,,,
395,True,C_ID_5037ff576e,138,False,1,B,307,M_ID_fe69229f24,-4,1.189469,...,,,,,,,,,,
396,True,C_ID_5037ff576e,138,False,1,B,705,M_ID_efc106141c,-9,-0.640069,...,,,,,,,,,,
397,True,C_ID_5037ff576e,226,False,1,B,307,M_ID_708022307c,-4,-0.652256,...,,,,,,,,,,
398,True,C_ID_5037ff576e,330,False,1,B,705,M_ID_393b4b8cec,-9,-0.674210,...,,,,,,,,,,


There are a set of duplicated columns (all the couples of the form column_name_x, column_name_y). We can observe that these couples of columns can be merged to obtain a single column with the maximum amount of information. We try to do so for each couple of columns.

In [None]:
#city_id
#transactions_df['city_id'] = transactions_df['city_id_x']
#transactions_df.drop(labels = ['city_id_x', 'city_id_y'], axis = 1, inplace = True)
#transactions_df.head()

In [None]:
#category_1

In [None]:
#merchant_category_id
#transactions_df['merchant_category_id_x'] == transactions_df['merchant_category_id_y']

In [None]:
#category_2

In [None]:
#state_id

In [None]:
#subsector_id

In [17]:
cat_features = ['authorized_flag','city_id', 'category_1','category_3', 'merchant_category_id', 'merchant_id',
                'category_2', 'state_id', 'subsector_id', 'merchant_group_id', 'most_recent_sales_range', 'most_recent_purchases_range', 'category_4']
num_features = ['numerical_1', 'numerical_2','installments', 'month_lag', 'purchase_amount', 'avg_sales_lag3', 'avg_purchases_lag3', 'active_months_lag3',
       'avg_sales_lag6', 'avg_purchases_lag6', 'active_months_lag6', 'avg_sales_lag12', 'avg_purchases_lag12', 'active_months_lag12']

In [None]:
# OLD
# replace mean of num_features and mode of cat_features
def replace_mean_mode(df, num_features, cat_features):
    new_df = pd.DataFrame(columns = df.columns)
    card_ids_unique = np.unique(df['card_id'])
    
    for card_id in card_ids_unique:
        new_line_num = df.loc[df['card_id'] == card_id][num_features].mean()
        new_line_cat = df.loc[df['card_id'] == card_id][cat_features].mode()
        new_df = new_df.append(pd.concat([new_line_num, new_line_cat], axis=1, join='inner'))
        
    new_df['card_id'] = card_ids_unique
    return new_df

In [None]:
transactions_df.columns

In [None]:
#new_transactions_df = replace_mean_mode(transactions_df, num_features, cat_features)

In [25]:
gdf = transactions_df.groupby("card_id")
gdf[num_features].apply(np.mean, result_type = 'reduce')
gdf[cat_features].apply(lambda x: x.mode(), result_type = 'reduce')

TypeError: mean() got an unexpected keyword argument 'result_type'

In [21]:
#new_transactions_df = pd.DataFrame(columns = transactions_df.columns)
#cards_ids_unique = np.unique(gdf['card_id'])

#for name, group in df:
#    gdf[num_features].apply(mean)
#    gdf[cat_features].apply(mode)
#    new_transactions_df = new_transactions_df.append(pd.concat([new_line_num, new_line_cat], axis=1, join='inner'))

#df.groupby('A')['C'].apply(sum)
    
#new_transactions_df['card_id'] = card_ids_unique

In [22]:
gdf.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,...,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4
0,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-8,-0.703331,...,1.08,1.082451,3.0,1.14,1.114135,6.0,1.19,1.156844,12.0,True
1,True,C_ID_4e6213e9bc,88,False,0,A,367,M_ID_86ec983688,-7,-0.733128,...,1.06,1.052071,3.0,1.06,1.058605,6.0,1.05,1.062087,12.0,True
2,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_979ed661fc,-6,-0.720386,...,0.98,0.974653,3.0,0.98,0.967058,6.0,0.97,0.956668,12.0,True
3,True,C_ID_4e6213e9bc,88,False,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,...,1.00,1.053443,3.0,0.88,0.897406,6.0,0.86,0.864394,12.0,True
4,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-11,-0.722865,...,1.08,1.082451,3.0,1.14,1.114135,6.0,1.19,1.156844,12.0,True
394,False,C_ID_5037ff576e,322,False,1,B,278,M_ID_b61c7d1be0,-3,-0.595260,...,,,,,,,,,,
395,True,C_ID_5037ff576e,138,False,1,B,307,M_ID_fe69229f24,-4,1.189469,...,,,,,,,,,,
396,True,C_ID_5037ff576e,138,False,1,B,705,M_ID_efc106141c,-9,-0.640069,...,,,,,,,,,,
397,True,C_ID_5037ff576e,226,False,1,B,307,M_ID_708022307c,-4,-0.652256,...,,,,,,,,,,
398,True,C_ID_5037ff576e,330,False,1,B,705,M_ID_393b4b8cec,-9,-0.674210,...,,,,,,,,,,


In [24]:
gdf.isnull().sum()

AttributeError: Cannot access callable attribute 'isnull' of 'DataFrameGroupBy' objects, try using the 'apply' method

#### Train

In [None]:
new_train_df = pd.merge(train_df, new_transactions_df, on="card_id", how="left")


In [None]:
new_train_df.head()

In [None]:
new_train_df.shape

In [None]:
new_train_df.isnull().sum()

In [None]:
new_train_df.drop_duplicates(inplace = True)

In [None]:
new_train_df.isnull().sum()

In [None]:
new_train_df.to_csv('(2)small_train.csv', index = False)

#### Test

In [None]:
new_test_df = pd.join(test, new_transactions_df, on="card_id", how="left")

In [None]:
new_test_df.head()

In [None]:
new_test_df.to_csv('(2)small_test.csv', index = False)