# Elo Merchant Category Recommender
Kaggle competition page: https://www.kaggle.com/c/elo-merchant-category-recommendation

## Import ML Libraries

In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [49]:
# Render matplotlib figures inside the notebook output cells.
%matplotlib inline

## Import ML Algorithms

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing   import StandardScaler
from sklearn.metrics         import mean_squared_error
from scipy                   import stats
from sklearn.linear_model    import LinearRegression 

## Import Other Libraries

In [51]:
import os
import math
import statistics

## Set Styles of Libraries

#### Seaborn

In [52]:
# Apply seaborn's 'darkgrid' style to plots drawn after this cell.
sns.set_style( 'darkgrid' )

#### Pandas

In [53]:
# Lift the displayed-column limit so wide frames (e.g. the merchants table) are shown in full.
pd.options.display.max_columns = None

## Load Data & Explore Data

In [54]:
# Directory holding the competition CSVs, three levels above the current
# working directory. The trailing '' keeps a trailing path separator.
data_files_path = os.path.join( os.getcwd(), '..', '..', '..', 'data', 'elo_merchant', '' )

# os.listdir() returns entries in arbitrary order. Later cells address the
# loaded frames by position (df[0] ... df[4]), so sort the listing to make
# those positions deterministic across machines and file systems.
data_files      = sorted( os.listdir( data_files_path ) )

# Filled by the loading cell below: one DataFrame per CSV, in listing order.
df              = []

In [55]:
# Load every CSV from the data directory into `df`, in listing order.

# Empty the list first so re-running this cell does not append a second copy
# of every (very large) frame and shift the positional indexes used below.
df.clear()

for csv_file in data_files:
    # Skip anything that is not a CSV (e.g. the '.ipynb_checkpoints' folder);
    # pd.read_csv would fail on directories and other stray entries.
    if csv_file.endswith( '.csv' ):
        # Build the path from data_files_path so the directory is defined in
        # exactly one place instead of being repeated as a literal here.
        df.append( pd.read_csv( os.path.join( data_files_path, csv_file ) ) )

In [56]:
# Show the raw directory listing. Positions here line up with positions in `df`
# only when the loading loop skipped no entry (it skips '.ipynb_checkpoints').
print( data_files )

['historical_transactions.csv', 'merchants.csv', 'new_merchant_transactions.csv', 'test.csv', 'train.csv']


###### Historical Transactions

In [57]:
# Preview the first rows of df[0]; per the printed file list this is historical_transactions.csv.
df[0].head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37


In [58]:
# (rows, columns) of the historical transactions frame.
df[0].shape

(29112361, 14)

###### Merchants

In [59]:
# Preview the first rows of df[1]; per the printed file list this is merchants.csv.
df[1].head()

Unnamed: 0,merchant_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
0,M_ID_838061e48c,8353,792,9,-0.057471,-0.057471,N,E,E,-0.4,9.666667,3,-2.25,18.666667,6,-2.32,13.916667,12,N,242,9,1.0
1,M_ID_9339d880ad,3184,840,20,-0.057471,-0.057471,N,E,E,-0.72,1.75,3,-0.74,1.291667,6,-0.57,1.6875,12,N,22,16,1.0
2,M_ID_e726bbae1e,447,690,1,-0.057471,-0.057471,N,E,E,-82.13,260.0,2,-82.13,260.0,2,-82.13,260.0,2,N,-1,5,5.0
3,M_ID_a70e9c5f81,5026,792,9,-0.057471,-0.057471,Y,E,E,,1.666667,3,,4.666667,6,,3.833333,12,Y,-1,-1,
4,M_ID_64456c37ce,2228,222,21,-0.057471,-0.057471,Y,E,E,,0.5,3,,0.361111,6,,0.347222,12,Y,-1,-1,


In [60]:
# (rows, columns) of the merchants frame.
df[1].shape

(334696, 22)

###### New Merchant Transactions

In [61]:
# Preview the first rows of df[2]; per the printed file list this is new_merchant_transactions.csv.
df[2].head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_415bb3a509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19
1,Y,C_ID_415bb3a509,140,N,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,1.0,9,19
2,Y,C_ID_415bb3a509,330,N,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,1.0,9,14
3,Y,C_ID_415bb3a509,-1,Y,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,,-1,8
4,Y,C_ID_ef55cf8d4b,-1,Y,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,,-1,29


In [62]:
# (rows, columns) of the new merchant transactions frame.
df[2].shape

(1963031, 14)

###### Test

In [63]:
# Preview the first rows of df[3]; per the printed file list this is test.csv.
df[3].head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3
0,2017-04,C_ID_0ab67a22ab,3,3,1
1,2017-01,C_ID_130fd0cbdd,2,3,0
2,2017-08,C_ID_b709037bc5,5,1,1
3,2017-12,C_ID_d27d835a9f,2,1,0
4,2015-12,C_ID_2b5e3df5c2,5,1,1


In [64]:
# (rows, columns) of the test frame.
df[3].shape

(123623, 5)

###### Train

In [65]:
# Preview the first rows of df[4]; per the printed file list this is train.csv.
df[4].head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06,C_ID_92a2005557,5,2,1,-0.820283
1,2017-01,C_ID_3d0044924f,4,1,0,0.392913
2,2016-08,C_ID_d639edf6cd,2,2,0,0.688056
3,2017-09,C_ID_186d6a6901,4,3,0,0.142495
4,2017-11,C_ID_cdbd2c0db2,1,3,0,-0.159749


In [66]:
# (rows, columns) of the train frame.
df[4].shape

(201917, 6)

In [67]:
# Share of all cards (train + test rows) that fall in the labelled train file.
len( df[4] ) / ( len( df[3] ) + len( df[4] ) )

0.620252503532592

## Linear Regression Prediction Trial

In [42]:
# Features: every train column except the activation month, the card id and the label.
X = df[4].drop( columns = ['first_active_month', 'card_id', 'target'] )

# Label: the value the regression is fit to predict.
y = df[4].loc[ :, 'target' ]

In [43]:
# Fit a default LinearRegression on X and y; fit() returns the estimator itself,
# so the fitted model can be bound in one step.
lm = LinearRegression().fit( X, y )

# Echo the fitted estimator so the cell still displays its parameters.
lm

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [45]:
# Predict for the rows of df[3] (test.csv in the printed listing), dropping the
# same two non-feature columns that were dropped from the training frame.
predictions = lm.predict(
    df[3].drop( columns = ['first_active_month', 'card_id'] )
)

In [47]:
# Peek at a few predicted values.
# NOTE(review): the slice starts at index 1, so predictions[0] is omitted and
# 19 values are shown; use predictions[:20] if the first 20 were intended.
predictions[1:20]

array([-0.39810698, -0.4559887 , -0.31061005, -0.4559887 , -0.4559887 ,
       -0.30057739, -0.34842645, -0.50566923, -0.4559887 , -0.49973716,
       -0.39217492, -0.35435852, -0.4181723 , -0.4559887 , -0.39810698,
       -0.46192076, -0.50566923, -0.43592338, -0.31061005])

## Get Root Mean Squared Error (RMSE)

In [None]:
# RMSE needs true targets. The test file previewed above has no 'target' column,
# so hold out part of the labelled data (X, y) and score on that instead.
# 30% hold-out; random_state is fixed so the split, and therefore the score,
# is reproducible on Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.3, random_state = 42 )

# Fit a separate model on the training split only. `lm` above was fit on all
# of X, so scoring it on these rows would measure training error.
holdout_lm = LinearRegression().fit( X_train, y_train )
y_predict  = holdout_lm.predict( X_test )

# Root mean squared error on the held-out rows.
math.sqrt( mean_squared_error( y_test, y_predict ) )