# Black Friday Sales Prediction

## Data Exploration

In [13]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import math

In [14]:
from sklearn import preprocessing
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import linear_model
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

In [15]:
#read in data
df = pd.read_csv('train.csv')
df.shape

(550068, 12)

In [16]:
df_new = df.copy()

In [7]:
df.Product_Category_1 = df.Product_Category_1.astype(float)
t1 = pd.get_dummies(df['Product_Category_1'], drop_first = True)
t2 = pd.get_dummies(df['Product_Category_2'])
t3 = pd.get_dummies(df['Product_Category_3'])
t = t1.add(t2,fill_value=0)   # same for two steps t= t1.add(t2, fill_value=0) and t.add(t3, fill_value=0)
t = t.add(t3,fill_value=0).astype(int)



In [51]:
df_new.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [4]:
df_test = pd.read_csv('test.csv')

In [5]:
pd.DataFrame({'number_nan':df_test.isna().sum(), # [df_test.isna().sum()][0], here [0] coerces list into series with 12 separate values
            'number_distinct':[df_test[col].nunique() for col in df_test.columns],
            'distinct vals': [df_test[col].unique() for col in df_test.columns]})

Unnamed: 0,number_nan,number_distinct,distinct vals
User_ID,0,5891,"[1000004, 1000009, 1000010, 1000011, 1000013, ..."
Product_ID,0,3491,"[P00128942, P00113442, P00288442, P00145342, P..."
Gender,0,2,"[M, F]"
Age,0,7,"[46-50, 26-35, 36-45, 18-25, 51-55, 55+, 0-17]"
Occupation,0,21,"[7, 17, 1, 15, 3, 0, 8, 16, 4, 12, 13, 18, 11,..."
City_Category,0,3,"[B, C, A]"
Stay_In_Current_City_Years,0,5,"[2, 0, 4+, 1, 3]"
Marital_Status,0,2,"[1, 0]"
Product_Category_1,0,18,"[1, 3, 5, 4, 2, 10, 15, 18, 8, 13, 6, 11, 12, ..."
Product_Category_2,72344,17,"[11.0, 5.0, 14.0, 9.0, 3.0, 4.0, 13.0, 2.0, na..."


In [6]:
df_test.User_ID.unique().all()==df.User_ID.unique().all()

True

The User ID are the same for train data and test data

In [6]:
#see all rows with missing values
df[df.isnull().any(axis=1)]

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969
5,1000003,P00193542,M,26-35,15,A,3,0,1,2.0,,15227
7,1000004,P00346142,M,46-50,7,B,2,1,1,15.0,,15854
8,1000004,P0097242,M,46-50,7,B,2,1,1,16.0,,15686
9,1000005,P00274942,M,26-35,20,A,1,1,8,,,7871
10,1000005,P00251242,M,26-35,20,A,1,1,5,11.0,,5254
11,1000005,P00014542,M,26-35,20,A,1,1,8,,,3957


In [7]:
# summary of each column (missing value and distinct value)
df.describe()
nunique = dict(zip(df.columns, [df[col].nunique() for col in df.columns]))
summary = pd.DataFrame({'number_nan':df.isna().sum(), # [df.isna().sum()][0], here [0] coerces list into series with 12 separate values
                        'number_distinct':[df[col].nunique() for col in df.columns],
                       'distinct vals': [df[col].unique() for col in df.columns]})
summary
#only Product_Category_2 and Product_Category_3 have missing values 

Unnamed: 0,number_nan,number_distinct,distinct vals
User_ID,0,5891,"[1000001, 1000002, 1000003, 1000004, 1000005, ..."
Product_ID,0,3631,"[P00069042, P00248942, P00087842, P00085442, P..."
Gender,0,2,"[F, M]"
Age,0,7,"[0-17, 55+, 26-35, 46-50, 51-55, 36-45, 18-25]"
Occupation,0,21,"[10, 16, 15, 7, 20, 9, 1, 12, 17, 0, 3, 4, 11,..."
City_Category,0,3,"[A, C, B]"
Stay_In_Current_City_Years,0,5,"[2, 4+, 3, 1, 0]"
Marital_Status,0,2,"[0, 1]"
Product_Category_1,0,20,"[3, 1, 12, 8, 5, 4, 2, 6, 14, 11, 13, 15, 7, 1..."
Product_Category_2,173638,17,"[nan, 6.0, 14.0, 2.0, 8.0, 15.0, 16.0, 11.0, 5..."


In [8]:
df.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,550068.0,550068.0,550068.0,550068.0,376430.0,166821.0,550068.0
mean,1003029.0,8.076707,0.409653,5.40427,9.842329,12.668243,9263.968713
std,1727.592,6.52266,0.49177,3.936211,5.08659,4.125338,5023.065394
min,1000001.0,0.0,0.0,1.0,2.0,3.0,12.0
25%,1001516.0,2.0,0.0,1.0,5.0,9.0,5823.0
50%,1003077.0,7.0,0.0,5.0,9.0,14.0,8047.0
75%,1004478.0,14.0,1.0,8.0,15.0,16.0,12054.0
max,1006040.0,20.0,1.0,20.0,18.0,18.0,23961.0


In [9]:
#check the distinct values in Product_Category_2, 3
print(sorted(df['Product_Category_1'].unique()))
print(sorted(df['Product_Category_2'].unique()))
print(sorted(df['Product_Category_3'].unique()))
# we can label nan in 2 and 3 with 0

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0]
[nan, 3.0, 4.0, 5.0, 6.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0]


- number of distinct values for user_id != total number of user_id
    - multiple transaction records for one individual

Predicting purchase amount and product category.

### Mutual information plot on all features

In [None]:
from sklearn.metrics import normalized_mutual_info_score
ml = []
for col in df_1.columns[2:11]:
    ml.append(normalized_mutual_info_score(df[col], df_1.Purchase))
df_ml = pd.DataFrame(ml, index =df_1.columns[2:11],columns = ['MI with Purchase'])
df_ml.sort_values(by= ['MI with Purchase']).plot.bar()

### Feature Engineering

In [17]:
def train_test(data, test_size):
    test_index = np.random.choice(data.index, size=int(data.shape[0]*test_size), replace=False)
    train_index =  list(set(data.index) - set(test_index))
    
    train = data.iloc[train_index]
    test = data.iloc[test_index]
    return(train,test)

In [18]:
train, test = train_test(df_new,0.2)

In [72]:
df.User_ID.apply(str).str.startswith('100').all()

True

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969
5,1000003,P00193542,M,26-35,15,A,3,0,1,2.0,,15227
6,1000004,P00184942,M,46-50,7,B,2,1,1,8.0,17.0,19215
7,1000004,P00346142,M,46-50,7,B,2,1,1,15.0,,15854
10,1000005,P00251242,M,26-35,20,A,1,1,5,11.0,,5254
12,1000005,P00031342,M,26-35,20,A,1,1,8,,,6073
14,1000006,P00231342,F,51-55,9,A,1,0,5,8.0,14.0,5378


In [23]:
# count of each product purchased
occurence_product_id = dict(train.Product_ID.value_counts())
def occurence_p_id(ID):
    if ID in occurence_product_id:
        return occurence_product_id[ID]
    else:
        return 0 # if not exist in d_count, fill with 0

# average value amount of that product
temp = train.groupby('Product_ID')['Purchase'].mean()
d_purchase = dict(zip(temp.index.tolist(), temp.values.tolist())) # dictionary of 'Product_ID': count
def amount_id(ID):
    if ID in d_purchase:
        return d_purchase[ID]
    else:
        return np.mean(list(d_purchase.values())) # if not exist in d_purchase, fill with mean value


def feature_engineering(dataframe):
    '''
    Some features don't come in handy, so we need make some adjustments on those features
    '''
    df = dataframe.copy()
    # Product_ID
    ## We need two other meaningful features to represent Product_ID
    ### Product_count and Product_mean of each Product_ID 
    df['Purchase_Count'] = df['Product_ID'].apply(occurence_p_id)
    df['Product_Mean'] = df['Product_ID'].apply(amount_id)
    
    
    # dummy variable the nominal variable
    genders = pd.get_dummies(df['Gender'], drop_first = True, prefix = 'Gender')
    df = pd.merge(df, genders, left_index=True, right_index = True)

    occupations = pd.get_dummies(df['Occupation'], drop_first = True, prefix = 'Occupation')
    df = pd.merge(df, occupations, left_index=True, right_index = True)

    cities = pd.get_dummies(df['City_Category'], drop_first = True, prefix = 'City_Category')
    df = pd.merge(df, cities, left_index=True, right_index = True)
    
    
    # replace age and years_in_current_city into numerical value
    age = sorted(df['Age'].unique())
    for i in range(len(age)):
        df['Age'] = df['Age'].replace(age[i],i)
    
    years_in_current_city = sorted(df['Stay_In_Current_City_Years'].unique())
    for i in range(len(years_in_current_city)):
        df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].replace(years_in_current_city[i],i)
    
    
    # change Product_Category_1, _2, _3 into 19 features with one product possibly fitting into 2 or 3 categories
    df.Product_Category_1 = df.Product_Category_1.astype(float)
    t1 = pd.get_dummies(df['Product_Category_1'], drop_first = True)
    t2 = pd.get_dummies(df['Product_Category_2'])
    t3 = pd.get_dummies(df['Product_Category_3'])
    t = t1.add(t2,fill_value=0)   # same for two steps t= t1.add(t2, fill_value=0) and t.add(t3, fill_value=0)
    t = t.add(t3,fill_value=0).astype(int)
    
    pc = ['Product_Category' + str(i) for i in range(2,21)]
    cols = dict(zip(np.array(range(2,21)).astype(float), pc))
    t = t.rename(columns=cols)
    df = pd.merge(df, t,left_index=True, right_index = True)
    
    
    # Drop extra features
    df = df.drop(['User_ID', 'Product_ID', 'Gender', 'Occupation', 'City_Category',
                'Product_Category_1', 'Product_Category_2', 'Product_Category_3'],axis=1)
    
    return df
    

In [24]:
dff= feature_engineering(train)
dff

Unnamed: 0,Age,Stay_In_Current_City_Years,Marital_Status,Purchase,Purchase_Count,Product_Mean,Gender_M,Occupation_1,Occupation_2,Occupation_3,...,Product_Category11,Product_Category12,Product_Category13,Product_Category14,Product_Category15,Product_Category16,Product_Category17,Product_Category18,Product_Category19,Product_Category20
0,0,2,0,8370,188,11686.345745,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,2,0,15200,461,16299.238612,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,2,0,1422,78,1203.500000,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,6,4,0,7969,158,7711.449367,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2,3,0,15227,483,14400.358178,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,4,2,1,19215,1159,16850.056946,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,4,2,1,15854,447,14237.543624,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
10,2,1,1,5254,948,6839.424051,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
12,2,1,1,6073,27,7774.629630,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,5,1,0,5378,71,6108.408451,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [25]:
X_train = dff.drop('Purchase',1)
y_train = dff['Purchase']

MI from Category 2 and Category 3 are no reliable because they have so many missing values

-the correlation is low, so we want to dummy variable the age

There's not much mutual information either, so we might want to keep the hard-coded values?

- plot pairwise labels and target
- create dummies for `occupation`, `marital_status`, etc.

- cluster the categories 1, 2, 3  
    - some products are complements, so we can build clustering to group products with similarities in order to reduce dimension.

In [25]:
df_1.pivot_table(index = 'Gender', columns = 'Age')['Purchase']

Age,0,1,2,3,4,5,6
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,8338.771985,8343.180201,8728.251754,8959.844056,8842.098947,9042.449666,9007.036199
1,9235.17367,9440.942971,9410.337578,9453.193643,9357.471509,9705.094802,9438.195603


Male costomers purchase more values of items at all age level. Can this be important in our model

In [26]:
df_test.Product_ID.unique().all()==df.Product_ID.unique().all()

False

In [27]:
#df_1['Product_Count'] = df_1.groupby('Product_ID')['Product_ID'].transform('count')

In [28]:
#df_1['total_purchase_product'] = df_1.groupby('Product_Count')['Purchase'].transform('sum')

In [29]:
#df_1['Product_Mean'] = df_1['total_purchase_product']/df_1['Product_Count']

In [31]:
#df_1 = df_1.drop('total_purchase_product',axis=1)


In [42]:
def train_model(model, X_train,  y_train,  param_grid={}, splits=4):

    '''
    Modeling and evaluation function

    args:
        - model: regressor object, e.g. DecisionTreeRegressor()
        - param_grid: a dictionary with possible parameters choices
        - splits: number of k-folds splits
    ''' 
    
    # create cross-validation method
    kfolds = KFold(n_splits = splits)

    ## perform cross-validation
    # setup grid search parameters
    gsearch = GridSearchCV(model, param_grid, cv=kfolds,
                           scoring='neg_mean_squared_error', return_train_score=True)

    # search the grid
    gsearch.fit(X_train,y_train)

    # extract best model from the grid
    best_model = gsearch.best_estimator_  
    best_score = gsearch.best_score_
    
    # print stats on model performance         
    print('----------------------')
    print(best_model)
    print('----------------------')
    print('rmse=',math.sqrt(-best_score))
    
    return best_model, best_score

In [48]:
from sklearn.neural_network import MLPRegressor

model = MLPRegressor()

param_grid = {'hidden_layer_sizes':[(100,200),(100,200,10),(100,200,50,10),(100,100,200,20,10)]
#param_grid = {'alpha': alph_range}

model, best_score = train_model(model, X_train, y_train, splits=4,param_grid=param_grid)

----------------------
MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10, 10, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
----------------------
rmse= 2646.287903331001


NameError: name 'opt_models' is not defined

## Model

In [46]:
def train_test(label, data, test_size):
    features = list(filter(lambda x: x != label, data.columns.tolist()))
    feature_space = data[features]
    
    X_train, X_test, y_train, y_test = train_test_split(feature_space, list(data[label]), test_size=test_size, random_state=22)
    return X_train, X_test, y_train, y_test

In [28]:
def rmse(y_test, y_pred):
    '''
    Calculate rmse function
    '''
    return math.sqrt(np.mean((y_pred - y_test) ** 2))

In [65]:
def train_model(model, X_train, y_train, param_grid={}, splits=4):

    '''
    Modeling and evaluation function

    args:
        - model: regressor object, e.g. DecisionTreeRegressor()
        - param_grid: a dictionary with possible parameters choices
        - splits: number of k-folds splits
    '''    
    # create cross-validation method
    kfolds = KFold(n_splits = splits)

    ## perform cross-validation
    # setup grid search parameters
    gsearch = GridSearchCV(model, param_grid, cv=kfolds,
                           scoring='neg_mean_squared_error', return_train_score=True, n_jobs= -1)
    print('set up')
    # search the grid
    gsearch.fit(X_train,y_train)
    print('Finish fitting')
    
    # extract best model from the grid
    model = gsearch.best_estimator_        
    best_index = gsearch.best_index_
    
    # get cv-scores for best model
    grid_results = pd.DataFrame(gsearch.cv_results_)       
    cv_mean = abs(grid_results.loc[best_index,'mean_test_score'])
    cv_std = grid_results.loc[best_index,'std_test_score']
    
    # combine mean and std cv-score in to a pandas series
    cv_score = pd.Series({'mean':cv_mean,'std':cv_std})

    # predict y_train using the fitted model
    #y_pred = model.predict(X_test)

    # print stats on model performance         
    print('----------------------')
    print(model)
    print('----------------------')
    print('score=',model.score(X_train,y_train))
    #print('rmse=',rmse(y_test, y_pred))
    print('cross_val: mean=',cv_mean,', std=',cv_std)

    return model, cv_score, grid_results

In [71]:
# places to store optimal models and scores
opt_models = dict()
score_models = pd.DataFrame(columns=['mean','std'])

# no. k-fold splits
splits=10

In [47]:
X_train, X_test, y_train, y_test = train_test('Purchase', df_new, test_size = 0.2)

### Neural Network

In [29]:
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import SGD

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [38]:
def create_model(neurons=1): #, activation ='relu', neurons = 1, momentum=0): #optimizer = 'adam', 
    # create model
    model = Sequential()
    model.add(Dense(neurons, kernel_initializer='normal',input_dim = X_train.shape[1], activation='relu'))
#     model.add(Dense(256, kernel_initializer='normal',activation=activation))
#     model.add(Dense(256, kernel_initializer='uniform',activation='relu'))
    model.add(Dense(1, kernel_initializer = 'uniform', activation='relu'))
#     model.add(Dense(4, kernel_initializer = 'uniform',activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='relu'))
    # Compile model
    #optimizer = SGD(lr=learn_rate)#, momentum=momentum)
    model.compile(loss='mean_squared_error', optimizer = 'adam', metrics=["mean_squared_error"])
    return model

# fix random seed for reproducibility
# seed = 7
# numpy.random.seed(seed)

# load dataset
# dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",")
# split into input (X) and output (Y) variables
# X = dataset[:,0:8]
# Y = dataset[:,8]

# checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
# checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
# callbacks_list = [checkpoint]

# create model
model = KerasRegressor(build_fn=create_model, batch_size = 32, epochs = 1, verbose=0)

# define the grid search parameters
#optimizer = ['SGD', 'Adam', 'Adamax', 'Nadam']
learn_rate = [0.001, 0.05]#, 0.01, 0.1, 0.2]
#activation = ['relu', 'tanh']#, 'sigmoid', 'linear']  #softmax
neurons = [1, 5]#, 10, 20,40,50]
momentum = [0.0, 0.2, 0.4, 0.6]
param_grid = dict(neurons = neurons)#learn_rate = learn_rate)#, activation = activation,neurons = neurons, momentum=momentum) # optimizer=optimizer,, activation = activation


kfolds = KFold(n_splits = 4)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1,cv=kfolds)
grid_result = grid.fit(X_train, y_train)

KeyboardInterrupt: 

In [35]:
#opt_models[model], cv_score, grid_results = train_model(opt_models[model], X_train, y_train, splits=splits,param_grid=param_grid)

#summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: -111185514.634803 using {'learn_rate': 0.001}
-111185514.634803 (1253678.108712) with: {'learn_rate': 0.001}
nan (nan) with: {'learn_rate': 0.05}


In [31]:
def create_model(learn_rate = 0.001, activation ='relu', neurons = 1, momentum=0): #optimizer = 'adam', 
    # create model
    model = Sequential()
    model.add(Dense(128, kernel_initializer='normal',input_dim = df_new.shape[1]-1, activation='relu'))
    model.add(Dense(256, kernel_initializer='normal',activation=activation))
    model.add(Dense(256, kernel_initializer='uniform',activation='relu'))
    model.add(Dense(neurons, kernel_initializer = 'uniform', activation='relu'))
    model.add(Dense(4, kernel_initializer = 'uniform',activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation=activation))
    # Compile model
    optimizer = SGD(lr=learn_rate, momentum=momentum)
    model.compile(loss='mean_squared_error', optimizer = optimizer, metrics=["accuracy"])
    return model

# fix random seed for reproducibility
# seed = 7
# numpy.random.seed(seed)

# load dataset
# dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",")
# split into input (X) and output (Y) variables
# X = dataset[:,0:8]
# Y = dataset[:,8]

# checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
# checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
# callbacks_list = [checkpoint]

# create model
model = KerasRegressor(build_fn=create_model, batch_size = 32, epochs = 10, verbose=0)

# define the grid search parameters
#optimizer = ['SGD', 'Adam', 'Adamax', 'Nadam']
learn_rate = [0.001, 0.05, 0.01, 0.1, 0.2]
activation = ['relu', 'tanh', 'sigmoid', 'linear']  #softmax
neurons = [1, 5, 10, 20,40,50]
momentum = [0.0, 0.2, 0.4, 0.6]
param_grid = dict(activation = activation, learn_rate = learn_rate, neurons = neurons, momentum=momentum) # optimizer=optimizer,, activation = activation


kfolds = KFold(n_splits = 4)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1,cv=kfolds)
grid_result = grid.fit(X_train, y_train)


#opt_models[model], cv_score, grid_results = train_model(opt_models[model], X_train, y_train, splits=splits,param_grid=param_grid)

# summarize results
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

ValueError: Error when checking input: expected dense_1_input to have shape (11,) but got array with shape (47,)

In [42]:
grid_result = grid.fit(X_train,y_train)

KeyboardInterrupt: 

### Linear Regression

In [67]:
feature_engineering(test)

Unnamed: 0,Age,Stay_In_Current_City_Years,Marital_Status,Purchase,Purchase_Count,Product_Mean,Gender_M,Occupation_1,Occupation_2,Occupation_3,...,Product_Category11,Product_Category12,Product_Category13,Product_Category14,Product_Category15,Product_Category16,Product_Category17,Product_Category18,Product_Category19,Product_Category20
384760,5,4,0,19046,1091,16365.318973,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
384786,1,1,0,16159,449,12114.443207,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
280407,5,4,0,8575,90,7895.900000,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
434029,2,2,1,7045,215,6403.762791,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
487649,2,4,0,12187,27,10484.037037,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
470332,5,0,0,11476,198,12683.555556,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7281,1,1,0,7880,591,14240.411168,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
118571,1,4,0,15179,856,15797.286215,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
311803,0,3,0,3092,210,4649.557143,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0
531574,3,4,0,4186,227,10984.022026,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [80]:
model = 'Linear Regression'

opt_models[model] = linear_model.LinearRegression()
opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid={}, splits=splits)

finish search for the best model with hyperparameters
----------------------
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
----------------------
score= 0.7235511622599653
rmse= 2644.4912784713147
cross_val: mean= 6981536.986509137 , std= 37713.65164215539


### Tree

In [42]:
from sklearn.tree import DecisionTreeRegressor
model = 'tree'

opt_models[model] = DecisionTreeRegressor()
param_grid = {'max_depth':range(3,20)}
#param_grid = {'alpha': alph_range}

opt_models[model], cv_score, grid_results = train_model(opt_models[model], splits=splits,param_grid=param_grid)

----------------------
DecisionTreeRegressor(criterion='mse', max_depth=8, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')
----------------------
score= 0.7245802957128267
rmse= 2649.1684442817827
cross_val: mean= 7001778.970348449 , std= 39400.78050440891


In [44]:
from sklearn.ensemble import RandomForestRegressor
model = 'random forest'
opt_models[model]= RandomForestRegressor()
param_grid = {'max_depth':range(3,20)}
#param_grid = {'alpha': alph_range}

opt_models[model], cv_score, grid_results = train_model(opt_models[model], splits=splits,param_grid=param_grid)

KeyboardInterrupt: 

In [11]:
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [7]:
dff = df_new.copy()

In [12]:
NN_model = Sequential()
NN_model.add(Dense(128, kernel_initializer='normal',input_dim = dff.shape[1]-1, activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))

In [13]:
NN_model.add(Dense(1, kernel_initializer='normal',activation='linear'))

In [14]:
NN_model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])
NN_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               6144      
_________________________________________________________________
dense_2 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_3 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_4 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 257       
Total params: 171,009
Trainable params: 171,009
Non-trainable params: 0
_________________________________________________________________


In [15]:
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
callbacks_list = [checkpoint]

In [22]:
NN_model.fit(X_tr, y_tr, epochs=10, batch_size=32, validation_split = 0.1, callbacks=callbacks_list)

Train on 445554 samples, validate on 49507 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 7001749.53785, saving model to Weights-001--7001749.53785.hdf5
Epoch 2/10

Epoch 00002: val_loss did not improve from 7001749.53785
Epoch 3/10

Epoch 00003: val_loss did not improve from 7001749.53785
Epoch 4/10

Epoch 00004: val_loss did not improve from 7001749.53785
Epoch 5/10

Epoch 00005: val_loss did not improve from 7001749.53785
Epoch 6/10

Epoch 00006: val_loss did not improve from 7001749.53785
Epoch 7/10

Epoch 00007: val_loss improved from 7001749.53785 to 7001734.58320, saving model to Weights-007--7001734.58320.hdf5
Epoch 8/10

Epoch 00008: val_loss improved from 7001734.58320 to 6989826.03208, saving model to Weights-008--6989826.03208.hdf5
Epoch 9/10

Epoch 00009: val_loss did not improve from 6989826.03208
Epoch 10/10

Epoch 00010: val_loss did not improve from 6989826.03208


<keras.callbacks.History at 0x1a15b98e48>

In [25]:
predictions = NN_model.predict(X_te)

In [47]:
yreal = np.array(y_te)

In [66]:
rmse(pre, yreal)

2675.3395597813046

In [65]:
pre 

array([ 8439.92  , 14912.123 ,  2832.109 , ..., 16106.003 ,  4618.2524,
        8043.238 ], dtype=float32)

In [64]:
pre = predictions.flatten()

In [18]:
a = dff[list(filter(lambda x: x != 'Purchase', dff.columns.tolist()))]
b = dff['Purchase']

In [19]:
X_tr, X_te, y_tr, y_te = train_test_split(a, b, test_size=0.1, random_state=22)

In [21]:
y_tr

39445      4503
390508     7006
55613     16083
424856     2138
247795      584
241292     5303
469351     9802
432689     6066
439896     9849
523647     5360
96921      7082
318560     8656
268124    23153
520371    20281
254548     5198
172817     3438
75401      3041
405506    18971
186529    13907
55991      9796
245667     9840
491222     8002
419592     5423
288049     8218
298975     8823
108219     4674
69283     16351
263661     5372
53034      8651
519236     9626
          ...  
50735     12270
338279    15670
517748    19073
50228      2322
262541      201
279179     5833
92947      8905
144275     7135
237177     7045
294182     5956
244817     9822
305874     7829
80413      5903
160418     1373
15727      6905
441000     8699
23560     11741
365858     8284
123815     3541
529934    11394
79506     19586
107485    11589
125832    12422
81492      6672
20971      6992
219638     4102
120166     9760
549220       24
162752     2778
252036    15674
Name: Purchase, Length: 

### xgboost

In [None]:
model = 'XGB'
opt_models[model] = XGBRegressor()

# Fix learning rate and number of estimators for tuning tree-based parameters
param_fixed = {'learning_rate' =0.1,
               'n_estimators'=1000,
               'max_depth'=5,
               'min_child_weight'=1,
               'gamma'=0,
               'subsample'=0.8,
               'colsample_bytree'=0.8,
               'objective'= 'binary:logistic',
               'nthread'=4,
               'scale_pos_weight'=1,
               'seed'=27}

# Tune max_depth and min_child_weight
param_test1 = {'max_depth':range(3,10,2),
               'min_child_weight':range(1,6,2)}

# Tune gamma
param_test2 = {'gamma':[i/10.0 for i in range(0,5)]}

# Tune subsample and colsample_bytree
param_test3 = {'subsample':[i/10.0 for i in range(6,10)],
               'colsample_bytree':[i/10.0 for i in range(6,10)]}

# Tune regularization
param_test4 = {'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]}

# Reducing Learning Rate
# learning_rate =0.01

opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid={}, splits=splits)

cv_score.name = model
score_models = score_models.append(cv_score)