In [21]:
import warnings
warnings.filterwarnings('ignore')

# Import packages
import pandas as pd
import numpy as np
import itertools
import math
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, PredefinedSplit, cross_val_score, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import roc_curve, precision_recall_curve, plot_confusion_matrix

from cycler import cycler
from livelossplot.outputs import MatplotlibPlot
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

In [22]:
################################ STEP 1: LOAD DATASET ################################
# Load dataset
data = pd.read_csv("car_data.csv")
# Visualize first five rows in the data
data.head()

Unnamed: 0,User ID,Gender,Age,AnnualSalary,Purchased
0,385,Male,35,20000,0
1,681,Male,40,43500,0
2,353,Male,49,74000,0
3,895,Male,40,107500,1
4,661,Male,25,79000,0


In [23]:
################################ STEP 2: DATA PRE-PROCESSING ################################

# Create DataFrame
df = pd.DataFrame(data)
# Drop User ID columns
df = df.drop(['User ID'], axis='columns') 
# Visualize first five rows in the updated data
df.head()

Unnamed: 0,Gender,Age,AnnualSalary,Purchased
0,Male,35,20000,0
1,Male,40,43500,0
2,Male,49,74000,0
3,Male,40,107500,1
4,Male,25,79000,0


In [24]:
# One-hot encoding for Gender column (Male/Female)
df = pd.get_dummies(df)

# Reorder columns for better visualization
df = df[['Age', 'AnnualSalary', 'Gender_Female', 'Gender_Male', 'Purchased']]
# Visualize first five rows in the updated data
df.head()

Unnamed: 0,Age,AnnualSalary,Gender_Female,Gender_Male,Purchased
0,35,20000,0,1,0
1,40,43500,0,1,0
2,49,74000,0,1,0
3,40,107500,0,1,1
4,25,79000,0,1,0


In [25]:
# Generate AgeGroup Feature
# Adult(18-39), MidAge(40-59), Senior(60+)
bins= [df.Age.min(),40,60,df.Age.max()+1]
labels = ['Adult','MidAge','Senior']
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)
df.head()

Unnamed: 0,Age,AnnualSalary,Gender_Female,Gender_Male,Purchased,AgeGroup
0,35,20000,0,1,0,Adult
1,40,43500,0,1,0,MidAge
2,49,74000,0,1,0,MidAge
3,40,107500,0,1,1,MidAge
4,25,79000,0,1,0,Adult


In [26]:
# Generate IncomeGroup Feature
# LowIncome(<32000), MidIncome(32000-99999), HighIncome(>=100000)
bins= [0,32000,100000,df.AnnualSalary.max()+1]
labels = ['LowIncome','MidIncome','HighIncome']
df['IncomeGroup'] = pd.cut(df['AnnualSalary'], bins=bins, labels=labels, right=False)
df.head()

Unnamed: 0,Age,AnnualSalary,Gender_Female,Gender_Male,Purchased,AgeGroup,IncomeGroup
0,35,20000,0,1,0,Adult,LowIncome
1,40,43500,0,1,0,MidAge,MidIncome
2,49,74000,0,1,0,MidAge,MidIncome
3,40,107500,0,1,1,MidAge,HighIncome
4,25,79000,0,1,0,Adult,MidIncome


In [27]:
# One-hot encoding for generated Age and Income groups
df = pd.get_dummies(df)

# Rename columns
df=df.rename(columns = {'AgeGroup_Adult':'Adult','AgeGroup_MidAge':'MidAge','AgeGroup_Senior':'Senior',
                       'IncomeGroup_LowIncome':'LowIncome','IncomeGroup_MidIncome':'MidIncome',
                       'IncomeGroup_HighIncome':'HighIncome'})
# Reorder columns for better visualization
df = df[['Age', 'AnnualSalary', 'Gender_Female', 'Gender_Male', 'Adult', 'MidAge', 'Senior',
        'LowIncome', 'MidIncome', 'HighIncome', 'Purchased']]
# Visualize first five rows in the updated data
df.head()

Unnamed: 0,Age,AnnualSalary,Gender_Female,Gender_Male,Adult,MidAge,Senior,LowIncome,MidIncome,HighIncome,Purchased
0,35,20000,0,1,1,0,0,1,0,0,0
1,40,43500,0,1,0,1,0,0,1,0,0
2,49,74000,0,1,0,1,0,0,1,0,0
3,40,107500,0,1,0,1,0,0,0,1,1
4,25,79000,0,1,1,0,0,0,1,0,0


In [28]:
# Split data into inputs and outputs
X = df.drop('Purchased', axis='columns')
y = df.Purchased

# Split data into training/testing sets with 70%/20%/10% respectively
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state = 1234)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=int(df.shape[0]*0.2), random_state = 1234)

# Create z-normalization function
def normalizeData(data, mean, std):
    normalized = (data-mean)/(std)
    return normalized

# Create denormalization function
def denormalizeData(normalized_data, mean, std):
    denormalized = (normalized_data*std)+(mean)
    return denormalized

# Mean and standard deviation of training set
Xtrain_mean = np.mean(X_train)
Xtrain_std = np.std(X_train)


# Normalize data
Xtrain_norm = normalizeData(X_train, Xtrain_mean, Xtrain_std)
Xval_norm = normalizeData(X_val, Xtrain_mean, Xtrain_std)
Xtest_norm = normalizeData(X_test, Xtrain_mean, Xtrain_std)

In [29]:
MLPModel = MLPClassifier(activation = 'relu', solver = 'sgd', alpha = 0.1, learning_rate_init = 0.01, 
                        hidden_layer_sizes = (25, 50, 25, 50, 50))

SGDModel = SGDClassifier()

In [30]:
cv_score1 = cross_val_score(MLPModel, Xtrain_norm, y_train, cv=10, scoring='accuracy')
print('Tuned MLP Model:', cv_score1.mean())
cv_score2 = cross_val_score(SGDModel, Xtrainnorm, y_train, cv=10, scoring='accuracy')
print('Default SGD Model', cv_score2.mean())

Tuned MLP Model: 0.9071428571428573


NameError: name 'Xtrain_train' is not defined