In [1]:
import os
import pandas as pd
import numpy as np
from collections import Counter

# Plot libraries
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

# Random seed for reproducibility
seed = 202
np.random.seed(seed)

# Ignore warnings
import warnings
warnings.simplefilter('ignore')

# Garbage collector
import gc
gc.enable()

### Import the dataframes

#### Train and test

In [2]:
# Train and test
train_df = pd.read_csv("(1)train.csv")
test_df = pd.read_csv("(1)test.csv")
train_df.shape, test_df.shape

((201917, 6), (123623, 5))

In [3]:
train_df.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06-01,C_ID_92a2005557,5,2,1,-0.820283
1,2017-01-01,C_ID_3d0044924f,4,1,0,0.392913
2,2016-08-01,C_ID_d639edf6cd,2,2,0,0.688056
3,2017-09-01,C_ID_186d6a6901,4,3,0,0.142495
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,-0.159749


#### Transactions

In [4]:
transactions_df = pd.read_csv("(1)transactions.csv")
transactions_df.shape

(30910695, 14)

In [5]:
transactions_df.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25,1.0,16,37
1,True,C_ID_4e6213e9bc,88,False,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15,1.0,16,16
2,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09,1.0,16,37
3,True,C_ID_4e6213e9bc,88,False,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02,1.0,16,34
4,True,C_ID_4e6213e9bc,88,False,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10,1.0,16,37


#### Merchants

In [6]:
merchants_df = pd.read_csv("(1)merchants.csv")
merchants_df.shape

(334620, 22)

In [7]:
merchants_df.head()

Unnamed: 0,merchant_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
0,M_ID_838061e48c,8353,792,9,-0.057471,-0.057471,False,E,E,-0.4,...,-2.25,18.666667,6,-2.32,13.916667,12,False,242,9,1.0
1,M_ID_9339d880ad,3184,840,20,-0.057471,-0.057471,False,E,E,-0.72,...,-0.74,1.291667,6,-0.57,1.6875,12,False,22,16,1.0
2,M_ID_e726bbae1e,447,690,1,-0.057471,-0.057471,False,E,E,-82.13,...,-82.13,260.0,2,-82.13,260.0,2,False,-1,5,5.0
3,M_ID_c929bb59af,9514,2,20,-0.057471,-0.057471,False,E,E,69667.0,...,69667.0,1.0,3,69667.0,1.0,3,False,-1,20,5.0
4,M_ID_dd3ae3de10,30534,278,37,-0.057471,-0.057471,False,E,E,5180.0,...,9830.0,4.166667,6,15826.0,5.6,10,True,11,23,2.0


In [8]:
#substitute inf values with 2*max_value_in_column
for i in ['3', '6', '12']:
    max_val = max(merchants_df[merchants_df != np.inf]['avg_purchases_lag' + i])
    merchants_df['avg_purchases_lag' + i].replace([np.inf, -np.inf], 2*max_val)

### Transactions and Merchants dataframes merging

In [9]:
print("Number of common merchant_id in merchants_df and transactions_df: %d" % (len(set(transactions_df.merchant_id.unique()).intersection(set(merchants_df.merchant_id.unique())))))
print("merchants_df samples: %d, transactions_df samples: %d" % (merchants_df.shape[0], transactions_df.shape[0]))

Number of common merchant_id in merchants_df and transactions_df: 334620
merchants_df samples: 334620, transactions_df samples: 30910695


In [10]:
for c in merchants_df.columns:
    if c not in transactions_df.columns:
        print(c)

merchant_group_id
numerical_1
numerical_2
most_recent_sales_range
most_recent_purchases_range
avg_sales_lag3
avg_purchases_lag3
active_months_lag3
avg_sales_lag6
avg_purchases_lag6
active_months_lag6
avg_sales_lag12
avg_purchases_lag12
active_months_lag12
category_4


Actually, the only meaningfull features to be merged are merchant_group_id, most_recent_sales_range and most_recent_purchases_range

merchants_df = merchants_df[['merchant_id', "merchant_group_id", "most_recent_sales_range", "most_recent_purchases_range"]]
merchants_df.columns = ['id', "merchant_group_id", "most_recent_sales_range", "most_recent_purchases_range"]
merchants_df.head()

In [11]:
transactions_df = transactions_df.merge(merchants_df, how='left', left_on=['merchant_id', 'city_id', 'category_1', 'category_2', 'merchant_category_id', 'state_id', 'subsector_id'], right_on=['merchant_id', 'city_id', 'category_1', 'category_2', 'merchant_category_id', 'state_id', 'subsector_id'])

In [12]:
len(transactions_df)

30910695

In [13]:
transactions_df.columns

Index(['authorized_flag', 'card_id', 'city_id', 'category_1', 'installments',
       'category_3', 'merchant_category_id', 'merchant_id', 'month_lag',
       'purchase_amount', 'purchase_date', 'category_2', 'state_id',
       'subsector_id', 'merchant_group_id', 'numerical_1', 'numerical_2',
       'most_recent_sales_range', 'most_recent_purchases_range',
       'avg_sales_lag3', 'avg_purchases_lag3', 'active_months_lag3',
       'avg_sales_lag6', 'avg_purchases_lag6', 'active_months_lag6',
       'avg_sales_lag12', 'avg_purchases_lag12', 'active_months_lag12',
       'category_4'],
      dtype='object')

## Merge transactions in train and test

In [14]:
#gdf.head()
#gdf = gdf["purchase_amount"].size().reset_index()
#gdf.columns = ["card_id", "num_hist_transactions"]

There are a set of duplicated columns (all the couples of the form column_name_x, column_name_y). We can observe that these couples of columns can be merged to obtain a single column with the maximum amount of information. We try to do so for each couple of columns.

In [15]:
#city_id
#transactions_df['city_id'] = transactions_df['city_id_x']
#transactions_df.drop(labels = ['city_id_x', 'city_id_y'], axis = 1, inplace = True)
#transactions_df.head()

In [16]:
#category_1

In [17]:
#merchant_category_id
#transactions_df['merchant_category_id_x'] == transactions_df['merchant_category_id_y']

In [18]:
#category_2

In [19]:
#state_id

In [20]:
#subsector_id

In [21]:
cat_features = ['purchase_date', 'authorized_flag','city_id', 'category_1','category_3', 'merchant_category_id', 'merchant_id',
                'category_2', 'state_id', 'subsector_id', 'most_recent_sales_range', 'most_recent_purchases_range', 'category_4']
num_features = ['numerical_1', 'numerical_2','installments', 'month_lag', 'purchase_amount', 'avg_sales_lag3', 'avg_purchases_lag3', 'active_months_lag3',
       'avg_sales_lag6', 'avg_purchases_lag6', 'active_months_lag6', 'avg_sales_lag12', 'avg_purchases_lag12', 'active_months_lag12']

In [22]:
# OLD
# replace mean of num_features and mode of cat_features
def replace_mean_mode(df, num_features, cat_features):
    new_df = pd.DataFrame(columns = df.columns)
    card_ids_unique = np.unique(df['card_id'])
    
    for card_id in card_ids_unique:
        new_line_num = df.loc[df['card_id'] == card_id][num_features].mean()
        new_line_cat = df.loc[df['card_id'] == card_id][cat_features].mode()
        new_df = new_df.append(pd.concat([new_line_num, new_line_cat], axis=1, join='inner'))
        
    new_df['card_id'] = card_ids_unique
    return new_df

In [23]:
transactions_df.columns

Index(['authorized_flag', 'card_id', 'city_id', 'category_1', 'installments',
       'category_3', 'merchant_category_id', 'merchant_id', 'month_lag',
       'purchase_amount', 'purchase_date', 'category_2', 'state_id',
       'subsector_id', 'merchant_group_id', 'numerical_1', 'numerical_2',
       'most_recent_sales_range', 'most_recent_purchases_range',
       'avg_sales_lag3', 'avg_purchases_lag3', 'active_months_lag3',
       'avg_sales_lag6', 'avg_purchases_lag6', 'active_months_lag6',
       'avg_sales_lag12', 'avg_purchases_lag12', 'active_months_lag12',
       'category_4'],
      dtype='object')

In [24]:
#new_transactions_df = replace_mean_mode(transactions_df, num_features, cat_features)

In [25]:
gdf = transactions_df.groupby("card_id")
gdf_num = gdf[num_features].apply(lambda x : np.mean(x))

In [26]:
gdf_cat = gdf[cat_features].apply(lambda x: x.mode())

In [27]:
#new_transactions_df = pd.DataFrame(columns = transactions_df.columns)
#cards_ids_unique = np.unique(gdf['card_id'])

#for name, group in df:
#    gdf[num_features].apply(mean)
#    gdf[cat_features].apply(mode)
#    new_transactions_df = new_transactions_df.append(pd.concat([new_line_num, new_line_cat], axis=1, join='inner'))

#df.groupby('A')['C'].apply(sum)
    
#new_transactions_df['card_id'] = card_ids_unique

In [28]:
gdf_num.head()

Unnamed: 0_level_0,numerical_1,numerical_2,installments,month_lag,purchase_amount,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
C_ID_00007093c1,0.190402,0.157132,1.284768,-5.748344,-0.517706,0.994667,1.013273,3.0,0.992444,1.00357,6.0,0.970222,0.994065,12.0
C_ID_0001238066,9.879721,9.790741,1.614865,-1.277027,-0.585114,1.028462,1.05033,3.0,1.047094,1.073136,6.0,1.079658,1.1008,11.982906
C_ID_0001506ef0,1.191807,1.028121,0.014925,-4.746269,-0.527371,1.052909,1.072006,3.0,1.071636,1.094685,6.0,1.080909,1.115597,12.0
C_ID_0001793786,2.83839,2.747739,0.020243,-2.744939,-0.149861,1.307,1.204146,3.0,1.510571,1.353418,6.0,1.600286,1.432265,11.957143
C_ID_000183fdda,0.189379,0.13469,1.806452,-2.187097,-0.486637,1.047778,1.046282,3.0,1.153333,1.0725,6.0,1.103889,1.053809,12.0


In [29]:
gdf_cat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,purchase_date,authorized_flag,city_id,category_1,category_3,merchant_category_id,merchant_id,category_2,state_id,subsector_id,most_recent_sales_range,most_recent_purchases_range,category_4
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
C_ID_00007093c1,0,2017-04-17,True,244.0,False,B,307.0,M_ID_9400cf2342,3.0,2.0,19.0,A,A,True
C_ID_00007093c1,1,2017-04-19,,,,,,,,,,,,
C_ID_00007093c1,2,2018-01-08,,,,,,,,,,,,
C_ID_0001238066,0,2017-12-24,True,314.0,False,B,307.0,M_ID_d17aabd756,1.0,9.0,19.0,C,C,False
C_ID_0001238066,1,2018-01-23,,,,,,,,,,,,


In [30]:
gdf_num.isnull().sum()

numerical_1            13548
numerical_2            13548
installments               0
month_lag                  0
purchase_amount            0
avg_sales_lag3         13548
avg_purchases_lag3     13548
active_months_lag3     13548
avg_sales_lag6         13548
avg_purchases_lag6     13548
active_months_lag6     13548
avg_sales_lag12        13548
avg_purchases_lag12    13548
active_months_lag12    13548
dtype: int64

In [31]:
gdf_cat.isnull().sum()

purchase_date                  121765
authorized_flag                429805
city_id                        426313
category_1                     429428
category_3                     428226
merchant_category_id           392430
merchant_id                    328101
category_2                     428794
state_id                       428434
subsector_id                   402039
most_recent_sales_range        415410
most_recent_purchases_range    414691
category_4                     440146
dtype: int64

In [32]:
gdf_cat.dropna(axis=0, how='any', inplace = True)

In [33]:
gdf_cat

Unnamed: 0_level_0,Unnamed: 1_level_0,purchase_date,authorized_flag,city_id,category_1,category_3,merchant_category_id,merchant_id,category_2,state_id,subsector_id,most_recent_sales_range,most_recent_purchases_range,category_4
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
C_ID_00007093c1,0,2017-04-17,True,244.0,False,B,307.0,M_ID_9400cf2342,3.0,2.0,19.0,A,A,True
C_ID_0001238066,0,2017-12-24,True,314.0,False,B,307.0,M_ID_d17aabd756,1.0,9.0,19.0,C,C,False
C_ID_0001506ef0,0,2017-12-09,True,137.0,False,A,705.0,M_ID_b1fc88154d,3.0,19.0,33.0,A,A,True
C_ID_0001793786,0,2017-09-14,True,179.0,False,A,278.0,M_ID_923d57de8d,6.0,-1.0,37.0,B,D,False
C_ID_000183fdda,0,2018-02-01,True,161.0,False,B,367.0,M_ID_f9cfe0a43b,3.0,3.0,16.0,B,C,True
C_ID_00024e244b,0,2017-08-04,True,156.0,False,A,705.0,M_ID_8f71be2af9,3.0,7.0,33.0,C,D,False
C_ID_0002709b5a,0,2017-06-14,True,103.0,False,B,705.0,M_ID_543dd11ce2,2.0,18.0,33.0,A,A,True
C_ID_00027503e2,0,2017-06-07,True,146.0,False,A,705.0,M_ID_8de747a1c2,3.0,19.0,33.0,C,C,False
C_ID_000298032a,0,2017-07-18,True,233.0,False,A,560.0,M_ID_8978193c15,1.0,9.0,34.0,B,D,False
C_ID_0002ba3c2e,0,2017-09-01,True,344.0,False,A,705.0,M_ID_ec1175604c,2.0,18.0,33.0,C,D,True


In [34]:
gdf_num.dropna(axis=0, how='any', inplace = True)

In [35]:
gdf_num

Unnamed: 0_level_0,numerical_1,numerical_2,installments,month_lag,purchase_amount,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
C_ID_00007093c1,0.190402,0.157132,1.284768,-5.748344,-0.517706,0.994667,1.013273,3.0,0.992444,1.003570,6.000000,0.970222,0.994065,12.000000
C_ID_0001238066,9.879721,9.790741,1.614865,-1.277027,-0.585114,1.028462,1.050330,3.0,1.047094,1.073136,6.000000,1.079658,1.100800,11.982906
C_ID_0001506ef0,1.191807,1.028121,0.014925,-4.746269,-0.527371,1.052909,1.072006,3.0,1.071636,1.094685,6.000000,1.080909,1.115597,12.000000
C_ID_0001793786,2.838390,2.747739,0.020243,-2.744939,-0.149861,1.307000,1.204146,3.0,1.510571,1.353418,6.000000,1.600286,1.432265,11.957143
C_ID_000183fdda,0.189379,0.134690,1.806452,-2.187097,-0.486637,1.047778,1.046282,3.0,1.153333,1.072500,6.000000,1.103889,1.053809,12.000000
C_ID_00024e244b,0.394619,0.334390,0.071429,-6.342857,-0.593611,1.125672,1.087163,3.0,1.158806,1.119176,6.000000,1.211940,1.157157,12.000000
C_ID_0002709b5a,0.510012,0.313463,1.710526,-4.105263,-0.653663,1.051176,1.073744,3.0,1.025882,1.065267,6.000000,1.048824,1.091388,11.529412
C_ID_00027503e2,-0.004765,-0.006852,0.000000,-4.142857,-0.741025,0.999211,0.992998,3.0,1.015789,1.008729,6.000000,1.027632,1.015824,12.000000
C_ID_000298032a,0.237993,0.114652,0.000000,-3.903226,-0.536590,1.083600,1.081285,3.0,1.065600,1.070506,6.000000,1.074800,1.078222,12.000000
C_ID_0002ba3c2e,-0.057471,-0.057471,0.000000,-3.123288,-0.635809,0.957143,0.974126,3.0,0.970000,0.988430,6.000000,0.968571,0.985746,12.000000


In [36]:
final_transactions_df = pd.merge(gdf_num, gdf_cat, on='card_id')

In [37]:
final_transactions_df

Unnamed: 0_level_0,numerical_1,numerical_2,installments,month_lag,purchase_amount,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,avg_purchases_lag6,...,category_1,category_3,merchant_category_id,merchant_id,category_2,state_id,subsector_id,most_recent_sales_range,most_recent_purchases_range,category_4
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C_ID_00007093c1,0.190402,0.157132,1.284768,-5.748344,-0.517706,0.994667,1.013273,3.0,0.992444,1.003570,...,False,B,307.0,M_ID_9400cf2342,3.0,2.0,19.0,A,A,True
C_ID_0001238066,9.879721,9.790741,1.614865,-1.277027,-0.585114,1.028462,1.050330,3.0,1.047094,1.073136,...,False,B,307.0,M_ID_d17aabd756,1.0,9.0,19.0,C,C,False
C_ID_0001506ef0,1.191807,1.028121,0.014925,-4.746269,-0.527371,1.052909,1.072006,3.0,1.071636,1.094685,...,False,A,705.0,M_ID_b1fc88154d,3.0,19.0,33.0,A,A,True
C_ID_0001793786,2.838390,2.747739,0.020243,-2.744939,-0.149861,1.307000,1.204146,3.0,1.510571,1.353418,...,False,A,278.0,M_ID_923d57de8d,6.0,-1.0,37.0,B,D,False
C_ID_000183fdda,0.189379,0.134690,1.806452,-2.187097,-0.486637,1.047778,1.046282,3.0,1.153333,1.072500,...,False,B,367.0,M_ID_f9cfe0a43b,3.0,3.0,16.0,B,C,True
C_ID_00024e244b,0.394619,0.334390,0.071429,-6.342857,-0.593611,1.125672,1.087163,3.0,1.158806,1.119176,...,False,A,705.0,M_ID_8f71be2af9,3.0,7.0,33.0,C,D,False
C_ID_0002709b5a,0.510012,0.313463,1.710526,-4.105263,-0.653663,1.051176,1.073744,3.0,1.025882,1.065267,...,False,B,705.0,M_ID_543dd11ce2,2.0,18.0,33.0,A,A,True
C_ID_00027503e2,-0.004765,-0.006852,0.000000,-4.142857,-0.741025,0.999211,0.992998,3.0,1.015789,1.008729,...,False,A,705.0,M_ID_8de747a1c2,3.0,19.0,33.0,C,C,False
C_ID_000298032a,0.237993,0.114652,0.000000,-3.903226,-0.536590,1.083600,1.081285,3.0,1.065600,1.070506,...,False,A,560.0,M_ID_8978193c15,1.0,9.0,34.0,B,D,False
C_ID_0002ba3c2e,-0.057471,-0.057471,0.000000,-3.123288,-0.635809,0.957143,0.974126,3.0,0.970000,0.988430,...,False,A,705.0,M_ID_ec1175604c,2.0,18.0,33.0,C,D,True


#### Train

In [38]:
new_train_df = pd.merge(train_df, final_transactions_df, on="card_id", how="left")


In [39]:
new_train_df.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,numerical_1,numerical_2,installments,month_lag,...,category_1,category_3,merchant_category_id,merchant_id,category_2,state_id,subsector_id,most_recent_sales_range,most_recent_purchases_range,category_4
0,2017-06-01,C_ID_92a2005557,5,2,1,-0.820283,18.290382,17.688792,0.01444,-3.530686,...,False,A,560.0,M_ID_1a81c358a3,1.0,9.0,34.0,C,A,True
1,2017-01-01,C_ID_3d0044924f,4,1,0,0.392913,30.047451,29.385934,1.542135,-4.921348,...,False,B,307.0,M_ID_940fb4498f,1.0,9.0,34.0,A,A,True
2,2016-08-01,C_ID_d639edf6cd,2,2,0,0.688056,0.814297,0.637316,0.0,-8.363636,...,False,A,705.0,M_ID_5634fd83e0,5.0,5.0,33.0,A,A,True
3,2017-09-01,C_ID_186d6a6901,4,3,0,0.142495,4.787462,4.687412,1.059524,-2.452381,...,False,B,278.0,M_ID_00a6ca8a8a,4.0,22.0,37.0,A,A,True
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,-0.159749,3.765944,3.705154,1.285714,-0.678571,...,False,B,278.0,M_ID_cecefd9589,4.0,22.0,37.0,D,D,True


In [40]:
new_train_df.shape

(201917, 33)

In [41]:
new_train_df.isnull().sum()

first_active_month                0
card_id                           0
feature_1                         0
feature_2                         0
feature_3                         0
target                            0
numerical_1                    8317
numerical_2                    8317
installments                   8317
month_lag                      8317
purchase_amount                8317
avg_sales_lag3                 8317
avg_purchases_lag3             8317
active_months_lag3             8317
avg_sales_lag6                 8317
avg_purchases_lag6             8317
active_months_lag6             8317
avg_sales_lag12                8317
avg_purchases_lag12            8317
active_months_lag12            8317
purchase_date                  8317
authorized_flag                8317
city_id                        8317
category_1                     8317
category_3                     8317
merchant_category_id           8317
merchant_id                    8317
category_2                  

In [42]:
#save card_id of rows with at least one null value in a variable before dropping those rows
null_train_ids = new_train_df[new_train_df.isnull().any(axis=1)]['card_id']

In [43]:
#new_train_df.drop_duplicates(inplace = True)

In [44]:
#this is actually not necessary, since we will use the original (1)train.csv to train the model for this case in the test set
#new_null_train_df = train_df[train_df.card_id in null_train_ids]

TypeError: 'Series' objects are mutable, thus they cannot be hashed

In [45]:
new_train_df.dropna(axis=0, how='any', inplace = True)

In [46]:
new_train_df.to_csv('(2)small_train.csv', index = False)

#### Test

In [47]:
new_test_df = pd.merge(test_df, final_transactions_df, on="card_id", how="left")

In [48]:
new_test_df.isnull().sum()

first_active_month                0
card_id                           0
feature_1                         0
feature_2                         0
feature_3                         0
numerical_1                    5231
numerical_2                    5231
installments                   5231
month_lag                      5231
purchase_amount                5231
avg_sales_lag3                 5231
avg_purchases_lag3             5231
active_months_lag3             5231
avg_sales_lag6                 5231
avg_purchases_lag6             5231
active_months_lag6             5231
avg_sales_lag12                5231
avg_purchases_lag12            5231
active_months_lag12            5231
purchase_date                  5231
authorized_flag                5231
city_id                        5231
category_1                     5231
category_3                     5231
merchant_category_id           5231
merchant_id                    5231
category_2                     5231
state_id                    

In [49]:
#sum(test_df['card_id'].isin(gdf['card_id']) == False)

C'è un problema: 5231 card_id del test non corrispondono a alcuna entry in final_trasactions_df. Forse ce ne siamo persi alcuni per strada, o forse semplicemente non c'erano dall'inizio. Per risolvere il problema, i card_id dei 5231 sono salvati in una variabile e le righe del test set originale corrispondenti salvate in un dataframe. Così facendo, possiamo usare le righe complete per trainare un modello in grado di fare predizioni su tutte le righe del test set tranne le 5231 in questione, che vengono predette da un algoritmo trainato sul train set originale (senza informazioni estratte da merchants e/o transactions)

In [50]:
null_test_ids = new_test_df[new_test_df.isnull().any(axis=1)]['card_id']

In [51]:
new_test_df.dropna(axis=0, how='any', inplace = True)

In [52]:
new_null_test_df = test_df.loc[test_df.card_id.isin(null_test_ids)]

In [53]:
new_null_test_df

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3
71,2017-02-01,C_ID_ac88d17815,2,2,0
138,2017-08-01,C_ID_e1e8acdab6,3,2,1
153,2015-02-01,C_ID_d99a6ea9e0,3,3,1
167,2017-02-01,C_ID_f9bd6c67b0,3,3,1
171,2016-04-01,C_ID_4bf0426f57,4,2,0
196,2017-01-01,C_ID_fb28b161c5,4,2,0
231,2017-10-01,C_ID_5f1eda7477,2,1,0
271,2017-12-01,C_ID_004885878b,3,2,1
354,2016-12-01,C_ID_2f1f3ea8c6,2,2,0
357,2017-06-01,C_ID_c02e054feb,3,3,1


In [54]:
new_test_df.to_csv('(2)small_test.csv', index = False)

In [55]:
new_null_test_df.to_csv('(2)small_null_test.csv', index = False)