In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from math import sqrt

import gc
import warnings

# Fix the seed of NumPy's global RNG so draws made through np.random repeat
# from run to run.
seed = 202
np.random.seed(seed)

# Suppress every warning for the rest of the session.
warnings.simplefilter('ignore')

# Make sure automatic garbage collection is switched on.
gc.enable()

In [27]:
# Read the preprocessed train and test tables, then display their
# (rows, columns) shapes.
train_df, test_df = (
    pd.read_csv("(2)small_train.csv"),
    pd.read_csv("(2)small_test.csv"),
)
train_df.shape, test_df.shape

((193600, 33), (118392, 32))

In [28]:
# List the columns of the training frame as loaded, before any engineered
# columns are added.
train_df.columns

Index(['first_active_month', 'card_id', 'feature_1', 'feature_2', 'feature_3',
       'target', 'numerical_1', 'numerical_2', 'installments', 'month_lag',
       'purchase_amount', 'avg_sales_lag3', 'avg_purchases_lag3',
       'active_months_lag3', 'avg_sales_lag6', 'avg_purchases_lag6',
       'active_months_lag6', 'avg_sales_lag12', 'avg_purchases_lag12',
       'active_months_lag12', 'purchase_date', 'authorized_flag', 'city_id',
       'category_1', 'category_3', 'merchant_category_id', 'merchant_id',
       'category_2', 'state_id', 'subsector_id', 'most_recent_sales_range',
       'most_recent_purchases_range', 'category_4'],
      dtype='object')

In [29]:
def time_features_eng(df, features_list, time_list):
    """Add group-wise count columns for every (feature, time column) pair.

    For each name in ``features_list`` and each name in ``time_list``, a
    column called ``<feature>_<time>`` is written into ``df``. On every row
    it holds the number of non-null ``feature`` values among the rows that
    share that row's value of the ``time`` column.

    ``df`` is modified in place and the same object is returned.
    """
    name_pairs = [(feat, key) for feat in features_list for key in time_list]
    for feat, key in name_pairs:
        count_col = "_".join((feat, key))
        df[count_col] = df.groupby(key)[feat].transform('count')
    return df

In [30]:
# Columns handled as categorical by the feature-engineering step.
cat_features = [
    'first_active_month',
    'authorized_flag',
    'city_id',
    'category_1',
    'category_3',
    'merchant_category_id',
    'merchant_id',
    'category_2',
    'state_id',
    'subsector_id',
    'most_recent_sales_range',
    'most_recent_purchases_range',
    'category_4',
    'feature_1',
    'feature_2',
    'feature_3',
]

# Columns handled as continuous by the feature-engineering step.
cont_features = [
    'numerical_1',
    'numerical_2',
    'installments',
    'month_lag',
    'purchase_amount',
    'avg_sales_lag3',
    'avg_purchases_lag3',
    'active_months_lag3',
    'avg_sales_lag6',
    'avg_purchases_lag6',
    'active_months_lag6',
    'avg_sales_lag12',
    'avg_purchases_lag12',
    'active_months_lag12',
]

In [31]:
def categorical_features_eng(df, features_list, time_list):
    """Attach group-wise count columns for the given features.

    Args:
        df: input dataframe; the new columns are added to it in place.
        features_list: names of the features the transformation is applied to.
        time_list: names of the time-frame columns used as grouping keys for
            each transformation.

    Returns:
        The same ``df`` object. For every (feature, time) pair it gains a
        ``<feature>_<time>`` column holding, per row, the count of non-null
        ``feature`` values within that row's ``time`` group.
    """
    for feature in features_list:
        for time_key in time_list:
            target_col = '_'.join([feature, time_key])
            per_group = df.groupby(time_key)[feature]
            df[target_col] = per_group.transform('count')
    return df


def numerical_features_eng(df, features_list, time_list, stats=('mean', 'max', 'min', 'std')):
    """Add group-wise summary statistics of numerical features to ``df``.

    For every feature in ``features_list`` and every grouping column in
    ``time_list``, one column per statistic is written into ``df`` under the
    name ``<feature>_<time>_<stat>``. Each row receives the statistic of
    ``feature`` computed over the rows sharing that row's ``time`` value.

    Args:
        df: input dataframe; the new columns are added to it in place.
        features_list: names of the numerical columns to summarise.
        time_list: names of the columns used as grouping keys.
        stats: names of the aggregations to compute, as accepted by
            ``GroupBy.transform`` (e.g. add ``'sum'``). Defaults to the
            original fixed set, in the original column order.

    Returns:
        The same ``df`` object, with the new columns appended.

    Note:
        ``'std'`` is pandas' sample standard deviation, so a group that
        contains a single row yields NaN in its ``_std`` column.
    """
    for feat_name in features_list:
        for time_name in time_list:
            # Build the grouped view once per (feature, key) pair and reuse it
            # for every statistic instead of re-grouping for each one.
            grouped = df.groupby(time_name)[feat_name]
            prefix = feat_name + '_' + time_name + '_'
            for stat in stats:
                df[prefix + stat] = grouped.transform(stat)
    return df

In [34]:
def feature_engineering(df):
    """Add the count and summary-statistic columns grouped by purchase date.

    ``purchase_date`` is converted to datetime, then used as the only grouping
    key: ``categorical_features_eng`` is run over the module-level
    ``cat_features`` and ``numerical_features_eng`` over the module-level
    ``cont_features``.

    No calendar parts (day of week, day of month, week of year, month) are
    derived here; rows are grouped on the full ``purchase_date`` value.

    ``df`` is modified in place and returned.
    """
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])

    group_keys = ['purchase_date']
    # Both helpers mutate and return the frame they are given.
    df = categorical_features_eng(df, cat_features, group_keys)
    df = numerical_features_eng(df, cont_features, group_keys)
    return df

In [35]:
# Run the feature engineering on both splits (train first, then test) and
# display the resulting (rows, columns) shapes.
train_df, test_df = feature_engineering(train_df), feature_engineering(test_df)
train_df.shape, test_df.shape

((193600, 105), (118392, 104))

In [36]:
# NOTE: train_df was already passed through feature_engineering in the
# previous cell. A second pass only recomputes the same columns from the same
# inputs (and was never applied to test_df), so the redundant call is dropped.
# This cell now just confirms the engineered shape.
train_df.shape

In [37]:
# Preview the first rows of the training frame after feature engineering
train_df.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,numerical_1,numerical_2,installments,month_lag,...,avg_sales_lag12_purchase_date_min,avg_sales_lag12_purchase_date_std,avg_purchases_lag12_purchase_date_mean,avg_purchases_lag12_purchase_date_max,avg_purchases_lag12_purchase_date_min,avg_purchases_lag12_purchase_date_std,active_months_lag12_purchase_date_mean,active_months_lag12_purchase_date_max,active_months_lag12_purchase_date_min,active_months_lag12_purchase_date_std
0,2017-06-01,C_ID_92a2005557,5,2,1,-0.820283,18.290382,17.688792,0.01444,-3.530686,...,0.47,536.232012,2.954176,1288.646787,0.414029,39.419159,11.893529,12.0,8.0,0.32338
1,2017-01-01,C_ID_3d0044924f,4,1,0,0.392913,30.047451,29.385934,1.542135,-4.921348,...,0.56,5705.408491,32.784191,6277.899296,0.584,415.464598,11.932188,12.0,10.0,0.202547
2,2016-08-01,C_ID_d639edf6cd,2,2,0,0.688056,0.814297,0.637316,0.0,-8.363636,...,0.592857,6112.868281,54.154606,4566.294099,0.517669,445.121397,11.953395,12.0,10.852459,0.108952
3,2017-09-01,C_ID_186d6a6901,4,3,0,0.142495,4.787462,4.687412,1.059524,-2.452381,...,0.56,1346.594948,6.506829,2283.493215,0.583477,98.417338,11.911873,12.0,9.0,0.261902
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,-0.159749,3.765944,3.705154,1.285714,-0.678571,...,0.57625,212.399559,2.210498,265.705909,0.605442,15.553186,11.909885,12.0,9.714286,0.208435


In [38]:
# Count the missing values in each column of the training frame
train_df.isna().sum()

first_active_month                        0
card_id                                   0
feature_1                                 0
feature_2                                 0
feature_3                                 0
target                                    0
numerical_1                               0
numerical_2                               0
installments                              0
month_lag                                 0
purchase_amount                           0
avg_sales_lag3                            0
avg_purchases_lag3                        0
active_months_lag3                        0
avg_sales_lag6                            0
avg_purchases_lag6                        0
active_months_lag6                        0
avg_sales_lag12                           0
avg_purchases_lag12                       0
active_months_lag12                       0
purchase_date                             0
authorized_flag                           0
city_id                         

In [39]:
# Write the engineered training frame to CSV, leaving out the row index
train_df.to_csv("(3)small_train_eng.csv", index=False)

In [40]:
# Write the engineered test frame to CSV, leaving out the row index
test_df.to_csv("(3)small_test_eng.csv", index=False)