In [6]:
import pandas as pd
import numpy as np
# Plot libraries
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from math import sqrt

# Random seed for reproducibility
# (np.random.seed seeds NumPy's legacy global RNG only; other libraries keep their own state.)
seed = 202
np.random.seed(seed)

# Ignore warnings
# NOTE(review): simplefilter('ignore') silences every warning category for the rest of
# the session, including deprecation notices from pandas — consider narrowing the filter.
import warnings
warnings.simplefilter('ignore')

# Garbage collector
# gc.enable() switches automatic collection on; it is already on by default in CPython,
# so this only matters if collection was disabled earlier in the session.
import gc
gc.enable()

In [2]:
# Load the preprocessed train and test frames (train first, then test)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("(2)huge_train.csv", "(2)huge_test.csv")
)
train_df.shape, test_df.shape

((19141554, 18), (11759740, 17))

In [3]:
train_df.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,merchant_group_id,most_recent_sales_range,most_recent_purchases_range,target
0,False,C_ID_5037ff576e,322,False,1,B,278,M_ID_b61c7d1be0,-3,-0.59526,2017-09-07,3.0,11,37,91632.0,D,D,-2.352713
1,True,C_ID_5037ff576e,138,False,1,B,307,M_ID_fe69229f24,-4,1.189469,2017-08-14,1.0,15,19,56893.0,B,C,-2.352713
2,True,C_ID_5037ff576e,138,False,1,B,705,M_ID_efc106141c,-9,-0.640069,2017-03-05,1.0,15,33,21026.0,D,D,-2.352713
3,True,C_ID_5037ff576e,226,False,1,B,307,M_ID_708022307c,-4,-0.652256,2017-08-15,1.0,16,19,16621.0,A,B,-2.352713
4,True,C_ID_5037ff576e,330,False,1,B,705,M_ID_393b4b8cec,-9,-0.67421,2017-03-26,3.0,17,33,35.0,B,B,-2.352713


In [21]:
# Columns handled as categorical by the feature-engineering helpers
cat_features = [
    'authorized_flag',
    'city_id',
    'category_1',
    'category_3',
    'merchant_category_id',
    'merchant_id',
    'category_2',
    'state_id',
    'subsector_id',
    'merchant_group_id',
    'most_recent_sales_range',
    'most_recent_purchases_range',
]

# Columns handled as continuous (numeric) by the feature-engineering helpers
cont_features = [
    'installments',
    'month_lag',
    'purchase_amount',
]

In [23]:
def categorical_features_eng(df, features_list, time_list):
    """Add per-group non-null counts of the given columns to ``df`` in place.

    For every (feature, time) pair a column named ``<feature>_<time>`` is
    written, holding, for each row, the number of non-null values of the
    feature inside the row's ``time`` group.

    Input:
       - df: input dataframe (mutated in place)
       - features_list: list of features on which to apply the transformation
       - time_list: columns used as grouping keys for each transformation

    Returns the same ``df`` object.
    """
    combos = [(feature, period) for feature in features_list for period in time_list]
    for feature, period in combos:
        per_period = df.groupby(period)
        df['_'.join((feature, period))] = per_period[feature].transform('count')
    return df


def numerical_features_eng(df, features_list, time_list,
                           aggregations=('mean', 'max', 'min', 'sum', 'std')):
    """Add per-group summary statistics of numeric columns to ``df`` in place.

    For every (feature, time) pair and every aggregation name ``agg``, a column
    named ``<feature>_<time>_<agg>`` is written. Each row receives the
    statistic of the feature computed over the row's ``time`` group.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe; new columns are added to it directly.
    features_list : iterable of str
        Numeric columns to summarise.
    time_list : iterable of str
        Columns used as grouping keys.
    aggregations : iterable of str, optional
        Names of the group-wise statistics passed to ``transform``. The
        default reproduces the original fixed set and column order:
        mean, max, min, sum, std.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    ``'std'`` is the sample standard deviation, so groups with a single row
    produce NaN in the ``_std`` column.
    """
    for feat_name in features_list:
        for time_name in time_list:
            # Build the grouped selection once and reuse it for every statistic
            # instead of re-grouping the whole frame for each one.
            grouped = df.groupby(time_name)[feat_name]
            prefix = feat_name + '_' + time_name + '_'
            for agg_name in aggregations:
                df[prefix + agg_name] = grouped.transform(agg_name)
    return df

In [24]:
def feature_engineering(df):
    """Derive calendar columns and per-date aggregates on ``df`` in place.

    Steps:
      1. Parse ``purchase_date`` to datetime.
      2. Add ``day_of_week``, ``day_of_month``, ``week_of_year`` and ``month``.
      3. Add per-``purchase_date`` counts for the module-level ``cat_features``
         and per-``purchase_date`` statistics for the module-level
         ``cont_features``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``purchase_date`` column plus every column named in
        ``cat_features`` and ``cont_features``. It is mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    purchase_dt = df['purchase_date'].dt

    df['day_of_week'] = purchase_dt.dayofweek
    df['day_of_month'] = purchase_dt.day
    # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0.
    # isocalendar().week is the replacement (same ISO week number, returned as
    # a nullable UInt32 column); fall back to the old accessor on older pandas.
    if hasattr(purchase_dt, 'isocalendar'):
        df['week_of_year'] = purchase_dt.isocalendar().week
    else:
        df['week_of_year'] = purchase_dt.weekofyear
    df['month'] = purchase_dt.month

    # NOTE(review): only purchase_date is used as a grouping key; the derived
    # calendar columns above are not aggregated over. Confirm this is intended
    # before adding them, since it multiplies the number of output columns.
    categorical_features_eng(df, cat_features, ['purchase_date'])
    numerical_features_eng(df, cont_features, ['purchase_date'])

    return df

In [25]:
# Enrich both splits (train first, then test) and report the resulting shapes
train_df, test_df = (feature_engineering(split) for split in (train_df, test_df))
train_df.shape, test_df.shape

((19141554, 49), (11759740, 48))

In [26]:
train_df.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,...,month_lag_purchase_date_mean,month_lag_purchase_date_max,month_lag_purchase_date_min,month_lag_purchase_date_sum,month_lag_purchase_date_std,purchase_amount_purchase_date_mean,purchase_amount_purchase_date_max,purchase_amount_purchase_date_min,purchase_amount_purchase_date_sum,purchase_amount_purchase_date_std
0,False,C_ID_5037ff576e,322,False,1,B,278,M_ID_b61c7d1be0,-3,-0.59526,...,-4.23918,2,-5,-186295,1.499793,0.344795,28402.554646,-0.746893,15152.365035,142.898114
1,True,C_ID_5037ff576e,138,False,1,B,307,M_ID_fe69229f24,-4,1.189469,...,-5.0596,2,-6,-209938,1.778255,-0.468474,3324.656366,-0.746893,-19438.380727,16.342966
2,True,C_ID_5037ff576e,138,False,1,B,705,M_ID_efc106141c,-9,-0.640069,...,-8.787795,1,-11,-194553,3.402349,0.082782,11119.097534,-0.746893,1832.715152,76.753427
3,True,C_ID_5037ff576e,226,False,1,B,307,M_ID_708022307c,-4,-0.652256,...,-5.071141,2,-6,-211639,1.761236,-0.211578,13374.780985,-0.746893,-8829.988543,65.689998
4,True,C_ID_5037ff576e,330,False,1,B,705,M_ID_393b4b8cec,-9,-0.67421,...,-8.874847,1,-11,-203376,3.283712,0.1021,13587.446112,-0.746893,2339.723108,92.276168


In [27]:
train_df.isnull().sum()

authorized_flag                              0
card_id                                      0
city_id                                      0
category_1                                   0
installments                                 0
category_3                                   0
merchant_category_id                         0
merchant_id                                  0
month_lag                                    0
purchase_amount                              0
purchase_date                                0
category_2                                   0
state_id                                     0
subsector_id                                 0
merchant_group_id                            0
most_recent_sales_range                      0
most_recent_purchases_range                  0
target                                       0
day_of_week                                  0
day_of_month                                 0
week_of_year                                 0
month        

### Export dataframes

In [28]:
# Persist the engineered frames; index=False keeps the row index out of the CSV files
train_df.to_csv('(3)eng_train.csv', index = False)
test_df.to_csv('(3)eng_test.csv', index = False)