In [1]:
# Standard library
import gc
import warnings
from math import sqrt

# Third-party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

# Random seed for reproducibility (seeds NumPy's global random state)
seed = 202
np.random.seed(seed)

# Ignore warnings
# NOTE(review): this filter hides every warning category for the rest of the
# session, including deprecation notices from libraries — consider narrowing it.
warnings.simplefilter('ignore')

# Garbage collector: make sure automatic collection is switched on
gc.enable()

In [19]:
# Import the preprocessed dataframes
# Both CSV files are read from the current working directory.
train_df = pd.read_csv("(2)huge_train.csv")
test_df = pd.read_csv("(2)huge_test.csv")
# Show the (rows, columns) shape of each frame as the cell output.
train_df.shape, test_df.shape

((19141554, 18), (11759740, 17))

In [20]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,merchant_group_id,most_recent_sales_range,most_recent_purchases_range,target
0,False,C_ID_5037ff576e,322,False,1,B,278,M_ID_b61c7d1be0,-3,-0.59526,2017-09-07,3.0,11,37,91632.0,D,D,-2.352713
1,True,C_ID_5037ff576e,138,False,1,B,307,M_ID_fe69229f24,-4,1.189469,2017-08-14,1.0,15,19,56893.0,B,C,-2.352713
2,True,C_ID_5037ff576e,138,False,1,B,705,M_ID_efc106141c,-9,-0.640069,2017-03-05,1.0,15,33,21026.0,D,D,-2.352713
3,True,C_ID_5037ff576e,226,False,1,B,307,M_ID_708022307c,-4,-0.652256,2017-08-15,1.0,16,19,16621.0,A,B,-2.352713
4,True,C_ID_5037ff576e,330,False,1,B,705,M_ID_393b4b8cec,-9,-0.67421,2017-03-26,3.0,17,33,35.0,B,B,-2.352713


In [21]:
def time_features_eng(df, features_list, time_list):
    """Add per-time-bucket count columns to ``df`` in place.

    For every (feature, time column) pair a column named
    ``<feature>_<time column>`` is written. Each row receives the number of
    non-null ``feature`` values found among the rows that share that row's
    value in the time column.

    NOTE(review): the result depends only on the time bucket and on how many
    non-null values the feature has there, so features without missing
    values all produce the same numbers for a given time column.

    Args:
        df: DataFrame holding every column named in both lists; modified
            in place.
        features_list: names of the columns to count.
        time_list: names of the columns to group by.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    for feature in features_list:
        for period in time_list:
            # Build the column name first, then broadcast each group's
            # count back onto the rows of that group.
            column = "_".join((feature, period))
            per_period = df.groupby(period)[feature]
            df[column] = per_period.transform('count')
    return df

In [27]:
# Columns treated as categorical by the feature-engineering helpers.
cat_features = [
    'authorized_flag',
    'city_id',
    'category_1',
    'category_3',
    'merchant_category_id',
    'merchant_id',
    'category_2',
    'state_id',
    'subsector_id',
    'merchant_group_id',
    'most_recent_sales_range',
    'most_recent_purchases_range',
]

# Columns treated as continuous (numeric) by the feature-engineering helpers.
cont_features = [
    'installments',
    'month_lag',
    'purchase_amount',
]

In [28]:
def categorical_features_eng(df, features_list, time_list):
    """Attach a grouped non-null count column for every feature/time pair.

    Input:
        - df: input dataframe (new columns are written onto it directly)
        - features_list: list of features on which to apply the transformation
        - time_list: time frames considered for each transformation

    For each pair, the column ``<feature>_<time frame>`` holds, per row, how
    many non-null ``feature`` values exist among rows with the same value in
    the time-frame column.

    NOTE(review): because only the time frame is used as the grouping key,
    two features with no missing values yield identical columns for the same
    time frame.

    Returns the same ``df`` object that was passed in.
    """
    # Features vary slowest, so columns are created feature by feature.
    combos = [(feat, period) for feat in features_list for period in time_list]
    for feat, period in combos:
        counts = df.groupby(period)[feat].transform('count')
        df[feat + '_' + period] = counts
    return df


def numerical_features_eng(df, features_list, time_list):
    """Add grouped summary statistics of numeric features to ``df`` in place.

    For every (feature, time column) pair, five columns are written, named
    ``<feature>_<time column>_<stat>`` with ``stat`` in ``mean``, ``max``,
    ``min``, ``sum``, ``std`` (created in that order). Each row receives the
    statistic computed over all rows sharing its value in the time column.

    Args:
        df: DataFrame holding every column named in both lists; modified
            in place.
        features_list: names of the numeric columns to summarise.
        time_list: names of the columns to group by.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    stats = ('mean', 'max', 'min', 'sum', 'std')
    for feat_name in features_list:
        for time_name in time_list:
            # Build the grouped selection once per pair and reuse it for all
            # five statistics, so the grouping keys are not re-derived for
            # every statistic.
            grouped = df.groupby(time_name)[feat_name]
            prefix = feat_name + '_' + time_name + '_'
            for stat in stats:
                df[prefix + stat] = grouped.transform(stat)
    return df

In [29]:
def feature_engineering(df):
    """Derive calendar columns from ``purchase_date`` and add grouped features.

    Steps, all applied to ``df`` in place:
      1. Convert ``purchase_date`` to datetime.
      2. Add ``day_of_week``, ``day_of_month``, ``week_of_year`` and ``month``.
      3. Call ``categorical_features_eng`` with the module-level
         ``cat_features`` and ``numerical_features_eng`` with the
         module-level ``cont_features``, grouping by each of the four
         calendar columns.

    Args:
        df: DataFrame with a ``purchase_date`` column plus every column named
            in ``cat_features`` and ``cont_features``; modified in place.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    purchase_dt = df['purchase_date'].dt

    df['day_of_week'] = purchase_dt.dayofweek
    df['day_of_month'] = purchase_dt.day
    if hasattr(purchase_dt, 'isocalendar'):
        # `.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
        # `isocalendar().week` is its replacement. It returns a nullable
        # UInt32 column, so cast back to what the old accessor produced:
        # int64, or float64 when missing dates are present.
        iso_week = purchase_dt.isocalendar().week
        week_dtype = 'float64' if iso_week.isna().any() else 'int64'
        df['week_of_year'] = iso_week.astype(week_dtype)
    else:
        # Older pandas without `isocalendar`: keep the original accessor.
        df['week_of_year'] = purchase_dt.weekofyear
    df['month'] = purchase_dt.month

    time_list = ['day_of_week', 'day_of_month', 'week_of_year', 'month']
    # Both helpers write their columns onto `df` directly, so their return
    # values are not needed here.
    categorical_features_eng(df, cat_features, time_list)
    numerical_features_eng(df, cont_features, time_list)

    return df

In [None]:
# Add the engineered columns to both frames. feature_engineering writes the
# new columns onto the frame it receives and returns that same frame.
train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)
# Show the resulting (rows, columns) shapes as the cell output.
train_df.shape, test_df.shape

In [23]:
# NOTE(review): train_df is also passed through feature_engineering in the
# preceding cell — confirm whether both calls are meant to stay.
train_df = feature_engineering(train_df)

In [24]:
# Preview the first rows of train_df after feature engineering.
train_df.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,...,most_recent_purchases_range,target,day_of_week,day_of_month,week_of_year,month,card_id_day_of_week,card_id_day_of_month,card_id_week_of_year,card_id_month
0,False,C_ID_5037ff576e,322,False,1,B,278,M_ID_b61c7d1be0,-3,-0.59526,...,D,-2.352713,3,7,36,9,2839830,651926,344577,1508609
1,True,C_ID_5037ff576e,138,False,1,B,307,M_ID_fe69229f24,-4,1.189469,...,C,-2.352713,0,14,33,8,2459310,640461,310473,1384770
2,True,C_ID_5037ff576e,138,False,1,B,705,M_ID_efc106141c,-9,-0.640069,...,D,-2.352713,6,5,9,3,2102168,615851,407907,1393471
3,True,C_ID_5037ff576e,226,False,1,B,307,M_ID_708022307c,-4,-0.652256,...,B,-2.352713,1,15,33,8,2614002,642274,310473,1384770
4,True,C_ID_5037ff576e,330,False,1,B,705,M_ID_393b4b8cec,-9,-0.67421,...,B,-2.352713,6,26,12,3,2102168,620866,312542,1393471


In [25]:
# Count the missing values in each column of train_df.
train_df.isnull().sum()

authorized_flag                0
card_id                        0
city_id                        0
category_1                     0
installments                   0
category_3                     0
merchant_category_id           0
merchant_id                    0
month_lag                      0
purchase_amount                0
purchase_date                  0
category_2                     0
state_id                       0
subsector_id                   0
merchant_group_id              0
most_recent_sales_range        0
most_recent_purchases_range    0
target                         0
day_of_week                    0
day_of_month                   0
week_of_year                   0
month                          0
card_id_day_of_week            0
card_id_day_of_month           0
card_id_week_of_year           0
card_id_month                  0
dtype: int64