In [1]:
import os
import pandas as pd
import numpy as np
from collections import Counter

# Plot libraries
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()  # seaborn's current colour palette; not referenced by the other cells shown here

# Random seed for reproducibility
seed = 202
np.random.seed(seed)  # seeds NumPy's global random state

# Ignore warnings
# NOTE(review): the 'ignore' action is installed without a category, so warnings of
# every kind (including pandas ones) are hidden from this point on — confirm this is intended.
import warnings
warnings.simplefilter('ignore')

# Garbage collector
# Turns on automatic garbage collection for the session
import gc
gc.enable()

### Import the dataframes

#### Train and test

In [2]:
# Read the train and test tables in one pass (train first, then test)
train_df, test_df = map(pd.read_csv, ("(1)train.csv", "(1)test.csv"))
# Cell output: the (rows, columns) shape of each frame
(train_df.shape, test_df.shape)

((201917, 6), (123623, 5))

In [3]:
# Preview the first rows of the train table (card_id, three features and the target)
train_df.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06-01,C_ID_92a2005557,5,2,1,-0.820283
1,2017-01-01,C_ID_3d0044924f,4,1,0,0.392913
2,2016-08-01,C_ID_d639edf6cd,2,2,0,0.688056
3,2017-09-01,C_ID_186d6a6901,4,3,0,0.142495
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,-0.159749


#### Transactions

In [4]:
# Load the transaction-level table; each row carries a card_id and a merchant_id
transactions_df = pd.read_csv("(1)transactions.csv")
# Cell output: (rows, columns)
transactions_df.shape

(30910695, 14)

In [5]:
# Preview the first rows of the transactions table
transactions_df.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25,1.0,16,37
1,True,C_ID_4e6213e9bc,88,False,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15,1.0,16,16
2,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09,1.0,16,37
3,True,C_ID_4e6213e9bc,88,False,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02,1.0,16,34
4,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10,1.0,16,37


#### Merchants

In [6]:
# Load the merchant-level table, keyed by merchant_id
merchants_df = pd.read_csv("(1)merchants.csv")
# Cell output: (rows, columns)
merchants_df.shape

(334620, 22)

In [7]:
# Preview the first rows of the merchants table
merchants_df.head()

Unnamed: 0,merchant_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
0,M_ID_838061e48c,8353,792,9,-0.057471,-0.057471,False,E,E,-0.4,...,-2.25,18.666667,6,-2.32,13.916667,12,False,242,9,1.0
1,M_ID_9339d880ad,3184,840,20,-0.057471,-0.057471,False,E,E,-0.72,...,-0.74,1.291667,6,-0.57,1.6875,12,False,22,16,1.0
2,M_ID_e726bbae1e,447,690,1,-0.057471,-0.057471,False,E,E,-82.13,...,-82.13,260.0,2,-82.13,260.0,2,False,-1,5,5.0
3,M_ID_c929bb59af,9514,2,20,-0.057471,-0.057471,False,E,E,69667.0,...,69667.0,1.0,3,69667.0,1.0,3,False,-1,20,5.0
4,M_ID_dd3ae3de10,30534,278,37,-0.057471,-0.057471,False,E,E,5180.0,...,9830.0,4.166667,6,15826.0,5.6,10,True,11,23,2.0


### Transactions and Merchants dataframes merging

In [8]:
# Distinct merchant ids seen on each side, compared as sets
transaction_merchant_ids = set(transactions_df.merchant_id.unique())
catalogued_merchant_ids = set(merchants_df.merchant_id.unique())
n_shared_merchants = len(transaction_merchant_ids & catalogued_merchant_ids)
print("Number of common merchant_id in merchants_df and transactions_df: %d" % n_shared_merchants)

# Row counts of the two tables, for comparison with the overlap above
n_merchant_rows, n_transaction_rows = len(merchants_df), len(transactions_df)
print("merchants_df samples: %d, transactions_df samples: %d" % (n_merchant_rows, n_transaction_rows))

Number of common merchant_id in merchants_df and transactions_df: 334620
merchants_df samples: 334620, transactions_df samples: 30910695


In [9]:
# Columns that exist in merchants_df but not in transactions_df, in merchants_df order
merchant_only_columns = [name for name in merchants_df.columns if name not in transactions_df.columns]
for name in merchant_only_columns:
    print(name)

merchant_group_id
numerical_1
numerical_2
most_recent_sales_range
most_recent_purchases_range
avg_sales_lag3
avg_purchases_lag3
active_months_lag3
avg_sales_lag6
avg_purchases_lag6
active_months_lag6
avg_sales_lag12
avg_purchases_lag12
active_months_lag12
category_4


Actually, the only meaningful features to be merged are merchant_group_id, most_recent_sales_range and most_recent_purchases_range

In [10]:
# Keep the merchant key plus the three attributes to carry over, and expose the key
# as 'id' so it does not clash with transactions_df.merchant_id in the later merge.
# merchants_df is overwritten, so this cell needs a freshly loaded frame to run.
merchants_df = (
    merchants_df[['merchant_id', "merchant_group_id", "most_recent_sales_range", "most_recent_purchases_range"]]
    .rename(columns={'merchant_id': 'id'})
)
merchants_df.head()

Unnamed: 0,id,merchant_group_id,most_recent_sales_range,most_recent_purchases_range
0,M_ID_838061e48c,8353,E,E
1,M_ID_9339d880ad,3184,E,E
2,M_ID_e726bbae1e,447,E,E
3,M_ID_c929bb59af,9514,E,E
4,M_ID_dd3ae3de10,30534,E,E


### Dataframes merging

In [11]:
# Attach the merchant attributes to every transaction.
# - how='left' keeps transactions whose merchant_id has no match in merchants_df;
#   their merchant columns are filled with NaN.
# - validate='many_to_one' makes pandas raise a MergeError if merchants_df contains a
#   duplicated id, instead of silently duplicating the matching transaction rows.
trans_merch_df = transactions_df.merge(
    merchants_df,
    how='left',
    left_on='merchant_id',
    right_on='id',
    validate='many_to_one',
)
# The right-hand key column is redundant next to merchant_id, so it is removed
trans_merch_df = trans_merch_df.drop(['id'], axis=1)
# Cell output: row count should equal transactions_df's, with three extra columns
trans_merch_df.shape

(30910695, 17)

In [12]:
# Preview the merged frame: transaction columns followed by the three merchant columns
trans_merch_df.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,merchant_group_id,most_recent_sales_range,most_recent_purchases_range
0,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25,1.0,16,37,35.0,A,A
1,True,C_ID_4e6213e9bc,88,False,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15,1.0,16,16,2084.0,A,A
2,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09,1.0,16,37,27369.0,C,C
3,True,C_ID_4e6213e9bc,88,False,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02,1.0,16,34,24104.0,D,C
4,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10,1.0,16,37,35.0,A,A


In [13]:
# Null count per column. After the left merge, transactions whose merchant_id found
# no match in merchants_df carry NaN in the three merchant columns.
trans_merch_df.isnull().sum()

authorized_flag                   0
card_id                           0
city_id                           0
category_1                        0
installments                      0
category_3                        0
merchant_category_id              0
merchant_id                       0
month_lag                         0
purchase_amount                   0
purchase_date                     0
category_2                        0
state_id                          0
subsector_id                      0
merchant_group_id              9401
most_recent_sales_range        9401
most_recent_purchases_range    9401
dtype: int64

We can drop the rows with these missing values: they are only 9,401 of 30,910,695 transactions (about 0.03%).

In [14]:
# Remove only the transactions whose merchant had no match in merchants_df.
# Restricting dropna to the merged merchant columns keeps the intent explicit: a bare
# dropna() would also discard rows for nulls in any unrelated transaction column.
# Reassigning (instead of inplace=True) leaves no half-mutated frame behind on error.
trans_merch_df = trans_merch_df.dropna(
    subset=["merchant_group_id", "most_recent_sales_range", "most_recent_purchases_range"]
)
# Cell output: (rows, columns) after the drop
trans_merch_df.shape

(30901294, 17)

### Merging train and test in transactions 

We want only the target (every other feature will necessarily lead to an overfit on itself)

#### Train

In [15]:
# Two-column view of train: the card key (exposed as 'id') and its target
train_reduced_df = train_df[['card_id', "target"]].rename(columns={'card_id': 'id'})
train_reduced_df.head()

Unnamed: 0,id,target
0,C_ID_92a2005557,-0.820283
1,C_ID_3d0044924f,0.392913
2,C_ID_d639edf6cd,0.688056
3,C_ID_186d6a6901,0.142495
4,C_ID_cdbd2c0db2,-0.159749


In [16]:
# Left-join the target onto every transaction by card. 'id' and 'target' come from the
# same right-hand rows, so they are missing together and 'id' can be discarded right away
# (assumes train_df itself has no missing target — TODO confirm).
train_huge_df = pd.merge(
    trans_merch_df,
    train_reduced_df,
    how='left',
    left_on='card_id',
    right_on='id',
).drop(columns=['id'])
train_huge_df.shape

(30901294, 18)

In [17]:
# Preview: 'target' is NaN on transactions whose card_id found no match in train_reduced_df
train_huge_df.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,merchant_group_id,most_recent_sales_range,most_recent_purchases_range,target
0,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25,1.0,16,37,35.0,A,A,
1,True,C_ID_4e6213e9bc,88,False,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15,1.0,16,16,2084.0,A,A,
2,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09,1.0,16,37,27369.0,C,C,
3,True,C_ID_4e6213e9bc,88,False,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02,1.0,16,34,24104.0,D,C,
4,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10,1.0,16,37,35.0,A,A,


In [18]:
# Null count per column; the 'target' count is the number of transactions whose card_id
# was not matched in train_reduced_df (presumably cards that belong to the test set — verify)
train_huge_df.isnull().sum()

authorized_flag                       0
card_id                               0
city_id                               0
category_1                            0
installments                          0
category_3                            0
merchant_category_id                  0
merchant_id                           0
month_lag                             0
purchase_amount                       0
purchase_date                         0
category_2                            0
state_id                              0
subsector_id                          0
merchant_group_id                     0
most_recent_sales_range               0
most_recent_purchases_range           0
target                         11759740
dtype: int64

#### Test

In [19]:
# Single-column view of test: the card key, exposed as 'id' for the merge
test_reduced_df = test_df[['card_id']].rename(columns={'card_id': 'id'})
test_reduced_df.head()

Unnamed: 0,id
0,C_ID_0ab67a22ab
1,C_ID_130fd0cbdd
2,C_ID_b709037bc5
3,C_ID_d27d835a9f
4,C_ID_2b5e3df5c2


In [20]:
# Left-join the test card ids onto every transaction; 'id' is kept for now because its
# NaN rows are what mark the transactions that do not belong to a test card
test_huge_df = pd.merge(
    trans_merch_df,
    test_reduced_df,
    how='left',
    left_on='card_id',
    right_on='id',
)
test_huge_df.shape

(30901294, 18)

In [21]:
# Preview: 'id' repeats card_id on transactions whose card was matched in test_reduced_df
test_huge_df.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,merchant_group_id,most_recent_sales_range,most_recent_purchases_range,id
0,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25,1.0,16,37,35.0,A,A,C_ID_4e6213e9bc
1,True,C_ID_4e6213e9bc,88,False,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15,1.0,16,16,2084.0,A,A,C_ID_4e6213e9bc
2,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09,1.0,16,37,27369.0,C,C,C_ID_4e6213e9bc
3,True,C_ID_4e6213e9bc,88,False,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02,1.0,16,34,24104.0,D,C,C_ID_4e6213e9bc
4,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10,1.0,16,37,35.0,A,A,C_ID_4e6213e9bc


In [22]:
# Null count per column; the 'id' count is the number of transactions whose card_id
# was not matched in test_reduced_df
test_huge_df.isnull().sum()

authorized_flag                       0
card_id                               0
city_id                               0
category_1                            0
installments                          0
category_3                            0
merchant_category_id                  0
merchant_id                           0
month_lag                             0
purchase_amount                       0
purchase_date                         0
category_2                            0
state_id                              0
subsector_id                          0
merchant_group_id                     0
most_recent_sales_range               0
most_recent_purchases_range           0
id                             19141554
dtype: int64

In [23]:
# Check the missing values: if every transaction belongs to exactly one of train / test,
# the unmatched rows of the two merges must add up to the size of the source frame
unmatched_in_train = train_huge_df.target.isnull().sum()
unmatched_in_test = test_huge_df.id.isnull().sum()
print("In total there are %d missing values" % (unmatched_in_train + unmatched_in_test))
print("The number of samples from which train and test huge dataframe have been created is %d" % trans_merch_df.shape[0])

In total there are 30901294 missing values
The number of samples from which train and test huge dataframe have been created is 30901294


In [24]:
# Keep only the transactions that matched a train card (non-null target)
train_huge_df = train_huge_df.dropna(subset=['target'])
# Keep only the transactions that matched a test card, then discard the helper 'id' column
test_huge_df = test_huge_df.dropna(subset=['id']).drop(columns=['id'])
(train_huge_df.shape, test_huge_df.shape)

((19141554, 18), (11759740, 17))

In [25]:
# Compare the card_id populations of the two sets: distinct cards vs. rows in each,
# and how many cards appear in both. The printed labels say "visitors", but what is
# counted are distinct card_id values.
train_card_ids = train_huge_df.card_id
test_card_ids = test_huge_df.card_id
shared_card_ids = set(train_card_ids.unique()) & set(test_card_ids.unique())
print("Number of unique visitors in train set : ", train_card_ids.nunique(), " out of rows : ", len(train_huge_df))
print("Number of unique visitors in test set : ", test_card_ids.nunique(), " out of rows : ", len(test_huge_df))
print("Number of common visitors in train and test set : ", len(shared_card_ids))

Number of unique visitors in train set :  201917  out of rows :  19141554
Number of unique visitors in test set :  123623  out of rows :  11759740
Number of common visitors in train and test set :  0


### Export huge dataframes

In [26]:
# Write both transaction-level frames to CSV without the row index
for frame, path in ((train_huge_df, '(2)huge_train.csv'), (test_huge_df, '(2)huge_test.csv')):
    frame.to_csv(path, index = False)

In [None]:
# NOTE(review): `historical_transactions_df` is not defined in any cell shown in this
# notebook (only `transactions_df` is loaded) — confirm where it should come from
# before running this cell.
# Count the rows per card_id and attach that count to train_df and test_df.
gdf = historical_transactions_df.groupby("card_id")
gdf = gdf["purchase_amount"].size().reset_index()
gdf.columns = ["card_id", "num_hist_transactions"]
# Left joins: cards without a row in gdf get NaN in num_hist_transactions
train_df = pd.merge(train_df, gdf, on="card_id", how="left")
test_df = pd.merge(test_df, gdf, on="card_id", how="left")

In [None]:
# NOTE(review): `df` and its 'Mt' / 'count' columns are not defined in any cell shown
# here — this looks like a leftover scratch snippet; confirm or remove.
# Maximum of 'count' within each 'Mt' group; sort=False skips sorting the group keys.
df.groupby(['Mt'], sort=False)['count'].max()

In [None]:
# NOTE(review): this cell repeats, statement for statement, an earlier unexecuted cell.
# `historical_transactions_df` is not defined in any cell shown in this notebook.
# If train_df / test_df already hold num_hist_transactions, merging it again would
# presumably yield suffixed duplicates (pandas' default _x / _y) — confirm before running.
# Count the rows per card_id and attach that count to train_df and test_df.
gdf = historical_transactions_df.groupby("card_id")
gdf = gdf["purchase_amount"].size().reset_index()
gdf.columns = ["card_id", "num_hist_transactions"]
# Left joins: cards without a row in gdf get NaN in num_hist_transactions
train_df = pd.merge(train_df, gdf, on="card_id", how="left")
test_df = pd.merge(test_df, gdf, on="card_id", how="left")

### to do
Before merging the historical and new transactions with train and test, check that every card_id, for both train and test, is present in both historical and new. Otherwise it is better to have a single transactions dataframe.