In [1]:
import os
import pandas as pd
import numpy as np
from collections import Counter

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
# Current default seaborn colour palette, stored for use when plotting
color = sns.color_palette()

# Random seed for reproducibility.
# Note: this seeds NumPy's global RNG only; other RNGs (e.g. Python's `random`) are not seeded here.
seed = 202
np.random.seed(seed)

# Ignore warnings.
# NOTE(review): 'ignore' silences *every* warning category for the rest of the session,
# including pandas deprecation/dtype warnings - consider narrowing the filter.
import warnings
warnings.simplefilter('ignore')

# Garbage collector.
# gc.enable() switches automatic collection on; it only has an effect if collection
# had been disabled earlier in the session.
import gc
gc.enable()

### Import the dataframes

#### Train and test

In [2]:
# Read the train and test card tables (files are expected in the working directory)
# and display both shapes as a quick sanity check.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("(1)train.csv", "(1)test.csv")
)
(train_df.shape, test_df.shape)

((201917, 7), (123623, 6))

In [3]:
train_df.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,elapsed_time
0,2017-06-01,C_ID_92a2005557,5,2,1,-0.820283,245
1,2017-01-01,C_ID_3d0044924f,4,1,0,0.392913,396
2,2016-08-01,C_ID_d639edf6cd,2,2,0,0.688056,549
3,2017-09-01,C_ID_186d6a6901,4,3,0,0.142495,153
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,-0.159749,92


#### Transactions

In [4]:
# Load the transactions table from the working directory and show its shape.
# NOTE(review): the recorded output reports ~30.9M rows x 28 columns, so this read is
# memory-heavy; presumably each row is a single card purchase - confirm against the data source.
transactions_df = pd.read_csv("(1)transactions.csv")
transactions_df.shape

(30910695, 28)

In [5]:
transactions_df.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,...,category_2_6.0,category_3_A,category_3_B,category_3_C,category_3_D,day_of_week,day_of_month,week_of_year,month,weekend
0,True,C_ID_4e6213e9bc,88,False,0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25,...,0,1,0,0,0,6,25,25,6,1
1,True,C_ID_4e6213e9bc,88,False,0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15,...,0,1,0,0,0,5,15,28,7,1
2,True,C_ID_4e6213e9bc,88,False,0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09,...,0,1,0,0,0,2,9,32,8,0
3,True,C_ID_4e6213e9bc,88,False,0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02,...,0,1,0,0,0,5,2,35,9,1
4,True,C_ID_4e6213e9bc,88,False,0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10,...,0,1,0,0,0,4,10,10,3,0


#### Merchants

In [34]:
# Load the merchants table from the working directory and show its shape.
# It is joined onto the transactions through `merchant_id` further down.
merchants_df = pd.read_csv("(1)merchants.csv")
merchants_df.shape

(334620, 22)

In [35]:
merchants_df.head()

Unnamed: 0,merchant_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
0,M_ID_838061e48c,8353,792,9,-0.057471,-0.057471,False,E,E,-0.4,...,-2.25,18.666667,6,-2.32,13.916667,12,False,242,9,1.0
1,M_ID_9339d880ad,3184,840,20,-0.057471,-0.057471,False,E,E,-0.72,...,-0.74,1.291667,6,-0.57,1.6875,12,False,22,16,1.0
2,M_ID_e726bbae1e,447,690,1,-0.057471,-0.057471,False,E,E,-82.13,...,-82.13,260.0,2,-82.13,260.0,2,False,-1,5,5.0
3,M_ID_c929bb59af,9514,2,20,-0.057471,-0.057471,False,E,E,69667.0,...,69667.0,1.0,3,69667.0,1.0,3,False,-1,20,5.0
4,M_ID_dd3ae3de10,30534,278,37,-0.057471,-0.057471,False,E,E,5180.0,...,9830.0,4.166667,6,15826.0,5.6,10,True,11,23,2.0


### Transactions and Merchants dataframes merging

In [36]:
# Pre-merge sanity check: count the merchant ids present in both tables and
# report the row count of each table next to it.
transaction_merchant_ids = set(transactions_df.merchant_id.unique())
known_merchant_ids = set(merchants_df.merchant_id.unique())
shared_merchant_ids = transaction_merchant_ids.intersection(known_merchant_ids)

print("Number of common merchant_id in merchants_df and transactions_df: %d" % len(shared_merchant_ids))
print("merchants_df samples: %d, transactions_df samples: %d" % (len(merchants_df), len(transactions_df)))

Number of common merchant_id in merchants_df and transactions_df: 334620
merchants_df samples: 334620, transactions_df samples: 30910695


In [37]:
# List the merchant columns that the transactions table does not already carry,
# i.e. the candidates that a merge could add. One name is printed per line.
merchant_only_columns = [
    name for name in merchants_df.columns
    if name not in transactions_df.columns
]
for name in merchant_only_columns:
    print(name)

merchant_group_id
numerical_1
numerical_2
most_recent_sales_range
most_recent_purchases_range
avg_sales_lag3
avg_purchases_lag3
active_months_lag3
avg_sales_lag6
avg_purchases_lag6
active_months_lag6
avg_sales_lag12
avg_purchases_lag12
active_months_lag12
category_4
category_2


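Actually, the only clearly meaningful features to be merged are merchant_group_id, most_recent_sales_range and most_recent_purchases_range; the selection cell below additionally keeps the avg_sales, avg_purchases and active_months lag columns (3, 6 and 12 months) as extra candidates.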

In [10]:
# here we can include more features

In [41]:
# Merchant attributes carried over to the transactions table (single source of truth;
# the key column is handled separately below).
merchant_feature_columns = [
    "merchant_group_id", "most_recent_sales_range", "most_recent_purchases_range",
    "avg_sales_lag3", "avg_purchases_lag3", "active_months_lag3",
    "avg_sales_lag6", "avg_purchases_lag6", "active_months_lag6",
    "avg_sales_lag12", "avg_purchases_lag12", "active_months_lag12"
]

# The key is renamed to 'id' so the merge cell can join on left_on='merchant_id' /
# right_on='id' and then drop the duplicate key column.
# rename() ignores labels that are absent, so re-running this cell after the first
# execution is a no-op instead of a KeyError (the previous version selected
# "merchant_id" and then overwrote `.columns` positionally, which broke on re-run and
# relied on two hand-maintained lists staying in the same order).
merchants_df = merchants_df.rename(columns={"merchant_id": "id"})[["id"] + merchant_feature_columns]
merchants_df.head()

Unnamed: 0,id,merchant_group_id,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12
0,M_ID_838061e48c,8353,E,E,-0.4,9.666667,3,-2.25,18.666667,6,-2.32,13.916667,12
1,M_ID_9339d880ad,3184,E,E,-0.72,1.75,3,-0.74,1.291667,6,-0.57,1.6875,12
2,M_ID_e726bbae1e,447,E,E,-82.13,260.0,2,-82.13,260.0,2,-82.13,260.0,2
3,M_ID_c929bb59af,9514,E,E,69667.0,1.0,3,69667.0,1.0,3,69667.0,1.0,3
4,M_ID_dd3ae3de10,30534,E,E,5180.0,4.0,3,9830.0,4.166667,6,15826.0,5.6,10


In [42]:
# Attach the selected merchant attributes to every transaction.
# how='left' keeps all transactions; those without a matching merchant get NaN in the
# merchant columns.
# validate='many_to_one' makes pandas raise a MergeError if a merchant id occurs more
# than once in merchants_df - without it, duplicated keys would silently multiply
# transaction rows and inflate every per-card aggregate computed later.
trans_merch_df = transactions_df.merge(
    merchants_df,
    how='left',
    left_on='merchant_id',
    right_on='id',
    validate='many_to_one',
)
# 'id' duplicates 'merchant_id' after the join, so it is removed again.
trans_merch_df = trans_merch_df.drop(['id'], axis=1)
trans_merch_df.shape

(30910695, 40)

In [44]:
trans_merch_df.head(10)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,...,most_recent_purchases_range,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12
0,True,C_ID_4e6213e9bc,88,False,0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25,...,A,1.08,1.082451,3.0,1.14,1.114135,6.0,1.19,1.156844,12.0
1,True,C_ID_4e6213e9bc,88,False,0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15,...,A,1.06,1.052071,3.0,1.06,1.058605,6.0,1.05,1.062087,12.0
2,True,C_ID_4e6213e9bc,88,False,0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09,...,C,0.98,0.974653,3.0,0.98,0.967058,6.0,0.97,0.956668,12.0
3,True,C_ID_4e6213e9bc,88,False,0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02,...,C,1.0,1.053443,3.0,0.88,0.897406,6.0,0.86,0.864394,12.0
4,True,C_ID_4e6213e9bc,88,False,0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10,...,A,1.08,1.082451,3.0,1.14,1.114135,6.0,1.19,1.156844,12.0
5,True,C_ID_4e6213e9bc,333,False,0,80,M_ID_50af771f8d,0,-0.734887,2018-02-24,...,A,0.98,1.019285,3.0,0.94,0.961773,6.0,0.97,0.982693,12.0
6,True,C_ID_4e6213e9bc,88,False,0,278,M_ID_5e8220e564,-11,-0.716855,2017-03-21,...,A,1.11,1.108094,3.0,1.16,1.156869,6.0,1.22,1.208972,12.0
7,True,C_ID_4e6213e9bc,3,False,0,80,M_ID_9d41786a50,-3,-0.657049,2017-11-18,...,C,1.06,1.075861,3.0,1.08,1.098235,6.0,1.1,1.10459,12.0
8,True,C_ID_4e6213e9bc,88,False,0,80,M_ID_979ed661fc,-8,-0.737967,2017-06-01,...,C,0.98,0.974653,3.0,0.98,0.967058,6.0,0.97,0.956668,12.0
9,True,C_ID_4e6213e9bc,88,False,0,80,M_ID_74ba14b5fc,-11,-0.715352,2017-03-16,...,C,0.96,0.975325,3.0,1.02,0.947403,6.0,1.09,0.977381,12.0


In [45]:
trans_merch_df.isnull().sum()

authorized_flag                   0
card_id                           0
city_id                           0
category_1                        0
installments                      0
merchant_category_id              0
merchant_id                       0
month_lag                         0
purchase_amount                   0
purchase_date                     0
state_id                          0
subsector_id                      0
month_diff                        0
category_2_1.0                    0
category_2_2.0                    0
category_2_3.0                    0
category_2_4.0                    0
category_2_5.0                    0
category_2_6.0                    0
category_3_A                      0
category_3_B                      0
category_3_C                      0
category_3_D                      0
day_of_week                       0
day_of_month                      0
week_of_year                      0
month                             0
weekend                     

In [46]:
# Discard every transaction that has a missing value in any column.
# NOTE(review): presumably these are the rows whose merchant attributes are NaN after
# the left merge - confirm with the isnull() summary above. The original row index is kept.
trans_merch_df = trans_merch_df.dropna(axis=0, how='any')
trans_merch_df.shape

(30901294, 40)

In [17]:
%%time
# This function can be used to compute the mode with groupby()
# prova = trans_merch_df.groupby('card_id')['city_id'].apply(lambda x: x.mode())

Wall time: 0 ns


## Merge transactions in train and test

In [16]:
# Check that the cards in train/test also occur in the merged transactions table.
transaction_card_ids = set(trans_merch_df.card_id.unique())
n_transaction_rows = trans_merch_df.shape[0]

train_card_ids = set(train_df.card_id.unique())
print("Number of common card_id in train_df and trans_merch_df: %d" % len(transaction_card_ids.intersection(train_card_ids)))
print("train_df samples: %d, trans_merch_df samples: %d" % (train_df.shape[0], n_transaction_rows))

test_card_ids = set(test_df.card_id.unique())
print("Number of common card_id in test_df and trans_merch_df: %d" % len(transaction_card_ids.intersection(test_card_ids)))
print("test_df samples: %d, trans_merch_df samples: %d" % (test_df.shape[0], n_transaction_rows))

Number of common card_id in train_df and trans_merch_df: 201917
train_df samples: 201917, trans_merch_df samples: 30901294
Number of common card_id in test_df and trans_merch_df: 123623
test_df samples: 123623, trans_merch_df samples: 30901294


In [17]:
def binary_aggregation(df, new_df, columns):
    """Add the per-card sum and mean of flag columns to ``new_df``.

    For every name in ``columns`` two columns are written: ``<name>_sum`` and
    ``<name>_mean``, each computed over the rows of ``df`` sharing a ``card_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level table; must contain ``card_id`` and every name in ``columns``.
    new_df : pandas.DataFrame
        Accumulator that receives the new columns. It is modified in place;
        values are aligned on its index (an empty frame takes over the
        ``card_id`` index of the first result assigned to it).
    columns : iterable of str
        Columns of ``df`` to aggregate.

    Returns
    -------
    pandas.DataFrame
        The same ``new_df`` object, for convenient chaining.
    """
    # Build the grouping once; it does not depend on the column being aggregated.
    by_card = df.groupby("card_id")
    for col in columns:
        values = by_card[col]
        new_df[col + '_sum'] = values.sum()
        new_df[col + '_mean'] = values.mean()
    return new_df

def categorical_aggregation(df, new_df, columns):
    """Add the per-card number of distinct values of each column to ``new_df``.

    For every name in ``columns`` one column ``<name>_nunique`` is written,
    holding the count of distinct values among the rows of ``df`` that share
    a ``card_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level table; must contain ``card_id`` and every name in ``columns``.
    new_df : pandas.DataFrame
        Accumulator that receives the new columns (modified in place, aligned
        on its index).
    columns : iterable of str
        Columns of ``df`` to aggregate.

    Returns
    -------
    pandas.DataFrame
        The same ``new_df`` object.
    """
    # Build the grouping once instead of once per column.
    by_card = df.groupby('card_id')
    for col in columns:
        new_df[col + '_nunique'] = by_card[col].nunique()
    return new_df

def numerical_aggregation(df, new_df, columns):
    """Add per-card sum, mean, max and min of numeric columns to ``new_df``.

    For every name in ``columns`` four columns are written, in this order:
    ``<name>_sum``, ``<name>_mean``, ``<name>_max``, ``<name>_min``. Each is
    computed over the rows of ``df`` that share a ``card_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level table; must contain ``card_id`` and every name in ``columns``.
    new_df : pandas.DataFrame
        Accumulator that receives the new columns (modified in place, aligned
        on its index).
    columns : iterable of str
        Columns of ``df`` to aggregate.

    Returns
    -------
    pandas.DataFrame
        The same ``new_df`` object.
    """
    # One shared grouping replaces the four-per-column groupby calls.
    by_card = df.groupby('card_id')
    for col in columns:
        values = by_card[col]
        new_df[col + '_sum'] = values.sum()
        new_df[col + '_mean'] = values.mean()
        new_df[col + '_max'] = values.max()
        new_df[col + '_min'] = values.min()
    return new_df

In [49]:
# Trial run ("prova") of numerical_aggregation on the 6- and 12-month merchant lag
# columns only, to inspect the resulting layout before the full aggregation.
feature = [
    "avg_sales_lag6", "avg_purchases_lag6", "active_months_lag6",
    "avg_sales_lag12", "avg_purchases_lag12", "active_months_lag12",
]
prova = numerical_aggregation(trans_merch_df, pd.DataFrame(), feature)
# Move the card_id index into a regular column and fall back to a 0..n-1 index.
prova = prova.assign(trans_card_id=prova.index).reset_index(drop=True)
prova

Unnamed: 0,avg_sales_lag6_sum,avg_sales_lag6_mean,avg_sales_lag6_max,avg_sales_lag6_min,avg_purchases_lag6_sum,avg_purchases_lag6_mean,avg_purchases_lag6_max,avg_purchases_lag6_min,active_months_lag6_sum,active_months_lag6_mean,...,avg_sales_lag12_min,avg_purchases_lag12_sum,avg_purchases_lag12_mean,avg_purchases_lag12_max,avg_purchases_lag12_min,active_months_lag12_sum,active_months_lag12_mean,active_months_lag12_max,active_months_lag12_min,trans_card_id
0,152.73,1.011457,1.34,0.54,158.388137,1.048928,1.911765,0.717677,906.0,6.000000,...,0.54,167.780254,1.111128,3.441176,0.549837,1800.0,11.920530,12.0,9.0,C_ID_00007093c1
1,328.39,2.218851,147.69,0.67,681.640662,4.605680,504.322881,0.738174,888.0,6.000000,...,0.53,740.656505,5.004436,554.397813,0.540334,1773.0,11.979730,12.0,10.0,C_ID_0001238066
2,70.31,1.049403,1.49,0.82,72.594945,1.083507,1.577181,0.871214,402.0,6.000000,...,0.76,74.227565,1.107874,1.516779,0.752641,804.0,12.000000,12.0,12.0,C_ID_0001506ef0
3,532.23,2.154777,147.69,0.57,1050.794180,4.254227,504.322881,0.630769,1482.0,6.000000,...,0.50,1095.170371,4.433888,554.397813,0.438104,2955.0,11.963563,12.0,9.0,C_ID_0001793786
4,493.54,3.184129,147.69,0.64,1182.141824,7.626721,504.322881,0.688131,930.0,6.000000,...,0.63,1277.622445,8.242725,554.397813,0.564815,1857.0,11.980645,12.0,9.0,C_ID_000183fdda
5,80.22,1.146000,2.03,0.78,77.598401,1.108549,2.002450,0.833333,420.0,6.000000,...,0.76,79.787424,1.139820,2.578018,0.752641,840.0,12.000000,12.0,12.0,C_ID_00024e244b
6,102.85,1.353289,4.96,0.61,126.918484,1.669980,8.742424,0.684211,456.0,6.000000,...,0.65,172.344363,2.267689,16.393939,0.584000,904.0,11.894737,12.0,8.0,C_ID_0002709b5a
7,42.13,1.003095,1.26,0.86,41.868131,0.996860,1.197016,0.871214,252.0,6.000000,...,0.76,41.789544,0.994989,1.191317,0.752641,504.0,12.000000,12.0,12.0,C_ID_00027503e2
8,32.48,1.047742,1.67,0.83,32.674580,1.054019,2.476190,0.869576,186.0,6.000000,...,0.76,32.654289,1.053364,2.798186,0.752641,372.0,12.000000,12.0,12.0,C_ID_000298032a
9,70.84,0.970411,1.78,0.47,72.392598,0.991679,2.257824,0.670068,438.0,6.000000,...,0.47,73.173270,1.002374,2.335497,0.615646,876.0,12.000000,12.0,12.0,C_ID_0002ba3c2e


In [48]:
prova.shape

(325540, 29)

In [50]:
def aggregate_transaction_merchant_per_card_id(df):
    """Collapse the transaction/merchant table to one feature row per ``card_id``.

    The result contains, in this order:

    * ``transactions_count`` - number of rows per card;
    * ``<col>_sum`` / ``<col>_mean`` for every flag column (``binary_aggregation``);
    * ``<col>_nunique`` for every categorical column (``categorical_aggregation``);
    * ``<col>_sum`` / ``_mean`` / ``_max`` / ``_min`` for every numerical column
      (``numerical_aggregation``);
    * ``purchase_date_min``, ``purchase_date_max`` and ``month_lag_std``;
    * ``trans_card_id`` - the card id, stored as a regular column.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level table; must contain ``card_id`` and every column named in
        the three feature lists below.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``card_id`` with a fresh 0..n-1 index.
    """
    # Shared grouping for the aggregations done directly in this function.
    by_card = df.groupby("card_id")

    new_df = pd.DataFrame()

    # card_id: number of transactions per card
    new_df['transactions_count'] = by_card['card_id'].count()

    binary_features = [
        "authorized_flag", "category_1",
        "category_2_1.0", "category_2_2.0", "category_2_3.0", "category_2_4.0", "category_2_5.0", "category_2_6.0",
        "category_3_A", "category_3_B", "category_3_C", "category_3_D",
        "weekend"
    ]

    categorical_features = [
        "merchant_category_id", "merchant_id", "merchant_group_id",
        "purchase_date", "state_id", "subsector_id",
        "day_of_week", "day_of_month", "week_of_year", "month",
        "most_recent_sales_range", "most_recent_purchases_range"
    ]

    numerical_features = [
        "installments", "month_lag", "purchase_amount", "month_diff",
        "avg_sales_lag3", "avg_purchases_lag3", "active_months_lag3",
        "avg_sales_lag6", "avg_purchases_lag6", "active_months_lag6",
        "avg_sales_lag12", "avg_purchases_lag12", "active_months_lag12"
    ]

    new_df = binary_aggregation(df, new_df, binary_features)
    new_df = categorical_aggregation(df, new_df, categorical_features)
    new_df = numerical_aggregation(df, new_df, numerical_features)

    # Special features
    # NOTE(review): purchase_date is compared as loaded; if it is still a string column,
    # min/max are lexicographic, which matches chronological order only for
    # ISO-style 'YYYY-MM-DD...' values - confirm the dtype/format.
    new_df['purchase_date_min'] = by_card['purchase_date'].min()
    new_df['purchase_date_max'] = by_card['purchase_date'].max()
    # Sample standard deviation (pandas default ddof=1): NaN for cards with a single row.
    new_df['month_lag_std'] = by_card['month_lag'].std()

    # Expose the card id as a column (used as the merge key later) and reset the index.
    new_df['trans_card_id'] = new_df.index
    new_df = new_df.reset_index(drop=True)
    return new_df

In [51]:
# Build the per-card feature table from the merged transactions (one row per card_id,
# with the card id stored in the `trans_card_id` column) and preview the first rows.
grouped_df = aggregate_transaction_merchant_per_card_id(trans_merch_df)
grouped_df.head()

Unnamed: 0,transactions_count,authorized_flag_sum,authorized_flag_mean,category_1_sum,category_1_mean,category_2_1.0_sum,category_2_1.0_mean,category_2_2.0_sum,category_2_2.0_mean,category_2_3.0_sum,...,avg_purchases_lag12_max,avg_purchases_lag12_min,active_months_lag12_sum,active_months_lag12_mean,active_months_lag12_max,active_months_lag12_min,purchase_date_min,purchase_date_max,month_lag_std,trans_card_id
0,151,116.0,0.768212,28.0,0.18543,1,0.006623,0,0.0,121,...,3.441176,0.549837,1800.0,11.92053,12.0,9.0,2017-02-14,2018-04-09,3.546301,C_ID_00007093c1
1,148,145.0,0.97973,4.0,0.027027,115,0.777027,0,0.0,0,...,554.397813,0.540334,1773.0,11.97973,12.0,10.0,2017-09-28,2018-04-30,1.685557,C_ID_0001238066
2,67,63.0,0.940299,0.0,0.0,2,0.029851,0,0.0,65,...,1.516779,0.752641,804.0,12.0,12.0,12.0,2017-01-14,2018-03-22,4.265234,C_ID_0001506ef0
3,247,220.0,0.890688,2.0,0.008097,26,0.105263,84,0.340081,20,...,554.397813,0.438104,2955.0,11.963563,12.0,9.0,2017-01-21,2017-12-31,2.657197,C_ID_0001793786
4,155,148.0,0.954839,4.0,0.025806,7,0.045161,1,0.006452,142,...,554.397813,0.564815,1857.0,11.980645,12.0,9.0,2017-08-07,2018-04-30,2.066389,C_ID_000183fdda


#### Train

In [52]:
# Attach the per-card aggregates to the train cards. The left join keeps every train
# row; the helper key `trans_card_id` is dropped because it duplicates `card_id`.
train_small_df = (
    train_df
    .merge(grouped_df, how='left', left_on='card_id', right_on='trans_card_id')
    .drop(['trans_card_id'], axis=1)
)
train_small_df.shape

(201917, 101)

In [53]:
train_small_df.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,elapsed_time,transactions_count,authorized_flag_sum,authorized_flag_mean,...,avg_purchases_lag12_mean,avg_purchases_lag12_max,avg_purchases_lag12_min,active_months_lag12_sum,active_months_lag12_mean,active_months_lag12_max,active_months_lag12_min,purchase_date_min,purchase_date_max,month_lag_std
0,2017-06-01,C_ID_92a2005557,5,2,1,-0.820283,245,277,265.0,0.956679,...,21.523449,554.397813,0.252963,3313.0,11.960289,12.0,5.0,2017-06-27,2018-04-29,2.727337
1,2017-01-01,C_ID_3d0044924f,4,1,0,0.392913,396,356,345.0,0.969101,...,4.282934,554.397813,0.209035,4266.0,11.983146,12.0,10.0,2017-01-06,2018-03-30,3.865994
2,2016-08-01,C_ID_d639edf6cd,2,2,0,0.688056,549,44,42.0,0.954545,...,1.059354,1.541667,0.752641,526.0,11.954545,12.0,10.0,2017-01-11,2018-04-28,4.120798
3,2017-09-01,C_ID_186d6a6901,4,3,0,0.142495,153,84,84.0,1.0,...,1.027061,1.835911,0.436002,1004.0,11.952381,12.0,10.0,2017-09-26,2018-04-18,2.141977
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,-0.159749,92,168,163.0,0.970238,...,4.669668,554.397813,0.286437,2003.0,11.922619,12.0,5.0,2017-11-12,2018-04-28,1.501781


In [54]:
train_small_df.isnull().sum()

first_active_month          0
card_id                     0
feature_1                   0
feature_2                   0
feature_3                   0
target                      0
elapsed_time                0
transactions_count          0
authorized_flag_sum         0
authorized_flag_mean        0
category_1_sum              0
category_1_mean             0
category_2_1.0_sum          0
category_2_1.0_mean         0
category_2_2.0_sum          0
category_2_2.0_mean         0
category_2_3.0_sum          0
category_2_3.0_mean         0
category_2_4.0_sum          0
category_2_4.0_mean         0
category_2_5.0_sum          0
category_2_5.0_mean         0
category_2_6.0_sum          0
category_2_6.0_mean         0
category_3_A_sum            0
category_3_A_mean           0
category_3_B_sum            0
category_3_B_mean           0
category_3_C_sum            0
category_3_C_mean           0
                           ..
active_months_lag3_mean     0
active_months_lag3_max      0
active_mon

#### Test

In [55]:
# Attach the per-card aggregates to the test cards. The left join keeps every test
# row; the helper key `trans_card_id` is dropped because it duplicates `card_id`.
test_small_df = (
    test_df
    .merge(grouped_df, how='left', left_on='card_id', right_on='trans_card_id')
    .drop(['trans_card_id'], axis=1)
)
test_small_df.shape

(123623, 100)

In [56]:
test_small_df.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,elapsed_time,transactions_count,authorized_flag_sum,authorized_flag_mean,category_1_sum,...,avg_purchases_lag12_mean,avg_purchases_lag12_max,avg_purchases_lag12_min,active_months_lag12_sum,active_months_lag12_mean,active_months_lag12_max,active_months_lag12_min,purchase_date_min,purchase_date_max,month_lag_std
0,2017-04-01,C_ID_0ab67a22ab,3,3,1,306.0,71,47.0,0.661972,23.0,...,0.995958,6.346154,0.584,852.0,12.0,12.0,12.0,2017-04-04,2018-02-28,2.659102
1,2017-01-01,C_ID_130fd0cbdd,2,3,0,396.0,87,86.0,0.988506,4.0,...,1.069059,4.247402,0.710589,1040.0,11.954023,12.0,8.0,2017-01-13,2018-04-20,4.172247
2,2017-08-01,C_ID_b709037bc5,5,1,1,184.0,15,11.0,0.733333,2.0,...,1.104988,1.452443,0.584,180.0,12.0,12.0,12.0,2017-08-25,2018-03-13,1.9518
3,2017-12-01,C_ID_d27d835a9f,2,1,0,62.0,36,36.0,1.0,1.0,...,1.069105,1.3273,0.584,432.0,12.0,12.0,12.0,2017-12-04,2018-04-17,1.424001
4,2015-12-01,C_ID_2b5e3df5c2,5,1,1,793.0,116,93.0,0.801724,0.0,...,2.473774,95.666667,0.540334,1365.0,11.767241,12.0,7.0,2017-01-03,2018-04-12,4.708186


In [57]:
test_small_df.isnull().sum()

first_active_month          0
card_id                     0
feature_1                   0
feature_2                   0
feature_3                   0
elapsed_time                0
transactions_count          0
authorized_flag_sum         0
authorized_flag_mean        0
category_1_sum              0
category_1_mean             0
category_2_1.0_sum          0
category_2_1.0_mean         0
category_2_2.0_sum          0
category_2_2.0_mean         0
category_2_3.0_sum          0
category_2_3.0_mean         0
category_2_4.0_sum          0
category_2_4.0_mean         0
category_2_5.0_sum          0
category_2_5.0_mean         0
category_2_6.0_sum          0
category_2_6.0_mean         0
category_3_A_sum            0
category_3_A_mean           0
category_3_B_sum            0
category_3_B_mean           0
category_3_C_sum            0
category_3_C_mean           0
category_3_D_sum            0
                           ..
active_months_lag3_mean     0
active_months_lag3_max      0
active_mon

### Export small dataframes

In [58]:
# Write the enriched train/test tables to the working directory.
# index=False leaves the row index out of the files, so no extra unnamed column
# appears when they are read back.
train_small_df.to_csv('(2)small_train.csv', index = False)
test_small_df.to_csv('(2)small_test.csv', index = False)