In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, mean_squared_error, r2_score

sns.set_style("darkgrid")

# increase plot resolution
mpl.rcParams["figure.dpi"] = 150

In [2]:
rank = pd.read_csv('Gymboree_PM_Location_by_Sales.csv')
df = pd.read_csv('15_Minute_DriveTime_Area_Shopping_Parking_Education.csv')
split = pd.read_csv('Gymboree_Store_Split.csv')
split.dropna(axis=0,how='all',inplace=True)
dis15 = pd.read_csv('Gymboree_Store_Locations_15Mile_IndependVariables_TableToExcel.csv')
dis15 = dis15[['Postal', 
               'child_care_index_mean', 
               'Children_Population_sum', 
               'Bought_ChildrenToy_Game_index_mean', 
               'Bought_ChildrenBook_index_mean']]

In [3]:
rank = rank[['Zip Code', '2018 Rank', '2019 Rank']]

In [4]:
split = split[['Open', 'Postal']]
split.loc[:,'Postal'] = split.loc[:,'Postal'].astype('int')
split.loc[:,'Postal'] = split.loc[:,'Postal'].astype('str')
for i in range(len(split.loc[:,'Postal'])):
    if len(split.loc[i,'Postal']) < 5:
        split.loc[i,'Postal'] = '0' * (5-len(split.loc[i,'Postal'])) + split.loc[i,'Postal']
    else:
        pass

In [5]:
df.loc[:,'Postal'] = df.loc[:,'Postal'].astype('int')
df.loc[:,'Postal'] = df.loc[:,'Postal'].astype('str')
for i in range(len(df.loc[:,'Postal'])):
    if len(df.loc[i,'Postal']) < 5:
        df.loc[i,'Postal'] = '0' * (5-len(df.loc[i,'Postal'])) + df.loc[i,'Postal']
    else:
        pass

In [6]:
dis15.loc[:,'Postal'] = dis15.loc[:,'Postal'].astype('int')
dis15.loc[:,'Postal'] = dis15.loc[:,'Postal'].astype('str')
for i in range(len(dis15.loc[:,'Postal'])):
    if len(dis15.loc[i,'Postal']) < 5:
        dis15.loc[i,'Postal'] = '0' * (5-len(dis15.loc[i,'Postal'])) + dis15.loc[i,'Postal']
    else:
        pass

In [7]:
y = rank.merge(split, how = 'right', left_on='Zip Code', right_on='Postal')
y.drop(columns=['Zip Code'], inplace=True)

In [8]:
df = df.merge(y, how = 'left', on='Postal')
df = df.merge(dis15, how = 'left', on='Postal')

In [9]:
df_model = df[['2018 Rank',
       '2019 Rank', 'Open', 'sum_populationtotals_totpop_cy',
       'mean_populationtotals_totpop_cy', 'sum_householdincome_medhinc_cy',
       'mean_householdincome_medhinc_cy', 'sum_homevalue_avgval_cy',
       'mean_homevalue_avgval_cy', 'sum_householdtotals_avghhsz_cy',
       'mean_householdtotals_avghhsz_cy', 'mean_housinghousehold䀃x_i',
       'mean_babyproductstoysgames_mp34001a_b_i',
       'mean_leisureactivitieslifestyle_mp05010a_b_i',
       'sum_agedependency_child_cy', 'sum_agedependency_child_fy',
       'mean_employmentunemployment_unemprt_cy_i', 'mean_health聕x_i',
       'mean_households_acsapovmcf_p', 'mean_populationtotals_popgrwcyfy_i',
       'mean_householdtotals_hhgrwcyfy_i', 'mean_householdincome_mhigrwcyfy_i',
       'School_Count', 'Parking_Count', 'Shopping_Count', 'child_care_index_mean', 'Children_Population_sum',
       'Bought_ChildrenToy_Game_index_mean', 'Bought_ChildrenBook_index_mean']]

In [10]:
for i in range(len(df_model)):
    if pd.isna(df_model.loc[i,'2018 Rank']):
        df_model.loc[i,'2018 Rank'] = df_model.loc[i,'Open']
    else:
        pass
for i in range(len(df_model)):
    if pd.isna(df_model.loc[i,'2019 Rank']):
        df_model.loc[i,'2019 Rank'] = df_model.loc[i,'Open']
    else:
        pass

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [11]:
df_model['Success'] = np.nan
for i in range(len(df_model)):
    if df_model.loc[i, '2019 Rank'] == 'Top (1st) 20%' or df_model.loc[i, '2019 Rank'] == '2nd 20%' or df_model.loc[i, '2019 Rank'] == '3rd 20%':
        df_model.loc[i, 'Success'] = 1
    else:
        df_model.loc[i, 'Success'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_model['Success'] = np.nan


In [12]:
# standardize the numeric variables in intuit75k
# list all numeric variables in the data
is_num = ['sum_populationtotals_totpop_cy',
          'mean_populationtotals_totpop_cy', 
          'sum_householdincome_medhinc_cy',
          'mean_householdincome_medhinc_cy', 
          'sum_homevalue_avgval_cy',
          'mean_homevalue_avgval_cy', 
          'sum_householdtotals_avghhsz_cy',
          'mean_householdtotals_avghhsz_cy', 
          'mean_housinghousehold䀃x_i',
          'mean_babyproductstoysgames_mp34001a_b_i',
          'mean_leisureactivitieslifestyle_mp05010a_b_i',
          'sum_agedependency_child_cy', 
          'sum_agedependency_child_fy',
          'mean_employmentunemployment_unemprt_cy_i', 
          'mean_health聕x_i',
          'mean_households_acsapovmcf_p', 
          'mean_populationtotals_popgrwcyfy_i',
          'mean_householdtotals_hhgrwcyfy_i', 
          'mean_householdincome_mhigrwcyfy_i',
          'School_Count', 
          'Parking_Count', 
          'Shopping_Count', 
          'child_care_index_mean', 
          'Children_Population_sum',
          'Bought_ChildrenToy_Game_index_mean', 
          'Bought_ChildrenBook_index_mean']

# scale by (x - mean(x)) / sd(x)
scaler = StandardScaler()
sf = scaler.fit(df_model[is_num])

# standardize buyers
df_model_std = df_model.copy()
df_model_std[is_num] = sf.transform(df_model_std[is_num])

In [13]:
evar = ['School_Count', 
        'Parking_Count', 
        'Shopping_Count', 
        'mean_populationtotals_totpop_cy',
        'mean_householdincome_medhinc_cy', 
        'mean_employmentunemployment_unemprt_cy_i', 
        'mean_homevalue_avgval_cy',
        'mean_babyproductstoysgames_mp34001a_b_i',
        'child_care_index_mean', 
        'Bought_ChildrenBook_index_mean']

In [14]:
X = df_model_std[evar]
y = df_model_std.Success
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [15]:
lr1 = LogisticRegression(random_state=42).fit(X_train, y_train)

In [16]:
train_AUC = roc_auc_score(y_train, lr1.predict(X_train))
print(f"""The AUC for train set is {round(train_AUC, 4)}.""")

test_AUC = roc_auc_score(y_test, lr1.predict(X_test))
print(f"""The AUC for test set is {round(test_AUC, 4)}.""")

diff = (test_AUC - train_AUC)/train_AUC
print(f"""The difference is {round(diff*100, 2)}%.""")

The AUC for train set is 0.6547.
The AUC for test set is 0.7778.
The difference is 18.8%.


In [17]:
def nn(X, y, hls=(1,), max_iter=10000):
    return MLPRegressor(
        hidden_layer_sizes=hls, 
        activation='relu', 
        solver='adam', 
        max_iter=max_iter,
        alpha=0.01,       
        shuffle=True,
        verbose=False,
        random_state = 1234).fit(X, y)

In [18]:
hls = [(1,), (3,3), (5,5), (7,7), (9,9)]
# activation = ['relu', 'tanh']
# solver = ['lbfgs', 'adam']
alpha = [0.5, 0.1, 0.05, 0.01]

param_grid = {"hidden_layer_sizes": hls,
              # "activation": activation,
              # "solver": solver,
              "alpha": alpha}

scoring = {"r2": "r2"}

reg_nn = nn(X_train, y_train)
reg_nn_cv = GridSearchCV(
    reg_nn, param_grid, scoring=scoring, cv=10, n_jobs=4, refit="r2", verbose=5
)
reg_nn_cv.fit(X_train, y_train)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    3.4s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    7.9s finished


GridSearchCV(cv=10,
             estimator=MLPRegressor(alpha=0.01, hidden_layer_sizes=(1,),
                                    max_iter=10000, random_state=1234),
             n_jobs=4,
             param_grid={'alpha': [0.5, 0.1, 0.05, 0.01],
                         'hidden_layer_sizes': [(1,), (3, 3), (5, 5), (7, 7),
                                                (9, 9)]},
             refit='r2', scoring={'r2': 'r2'}, verbose=5)

In [19]:
reg_nn_cv.best_params_

{'alpha': 0.5, 'hidden_layer_sizes': (1,)}

In [20]:
reg_nn_cv.best_score_

-0.18801108357380336

In [21]:
nn = MLPRegressor(
        hidden_layer_sizes=(1,), 
        activation='relu', 
        solver='adam', 
        max_iter=10000,
        alpha=0.5,       
        shuffle=True,
        verbose=False,
        random_state = 1234).fit(X_train, y_train)

In [22]:
train_AUC = roc_auc_score(y_train, nn.predict(X_train))
print(f"""The AUC for train set is {round(train_AUC, 4)}.""")

test_AUC = roc_auc_score(y_test, nn.predict(X_test))
print(f"""The AUC for test set is {round(test_AUC, 4)}.""")

diff = (test_AUC - train_AUC)/train_AUC
print(f"""The difference is {round(diff*100, 2)}%.""")

The AUC for train set is 0.7618.
The AUC for test set is 0.8267.
The difference is 8.51%.
