### Perspective PS8

In [1]:
# Neural network horse race: imports, session-wide settings and data load.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import itertools
import warnings
# NOTE(review): no category or module filter is given, so this silences every
# warning for the rest of the session (including any raised during model fitting).
warnings.filterwarnings("ignore") 
# Use seaborn's "darkgrid" style for the plots drawn below.
sns.set(style="darkgrid")
# Read the dataset from a path relative to the working directory.
dta = pd.read_csv('strongdrink.txt')

In [2]:
# Preview the first rows of the loaded data.
dta.head()

Unnamed: 0,cultivar,alco,malic,ash,alk,magn,tot_phen,flav,nonfl_phen,proanth,color_int,hue,OD280rat,proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [3]:
# Count missing values in each column.
dta.isnull().sum()

cultivar      0
alco          0
malic         0
ash           0
alk           0
magn          0
tot_phen      0
flav          0
nonfl_phen    0
proanth       0
color_int     0
hue           0
OD280rat      0
proline       0
dtype: int64

In [4]:
# a). scatterplot
%matplotlib notebook
for x,c in enumerate(['r','orange','b'], 1):
    plt.scatter(dta[dta['cultivar'] == x].alco, dta[dta['cultivar'] == x].color_int,\
                color = c, label = 'cultivar ={}'.format(x))
    plt.title('Alcohol and Color Intensity')
    plt.xlabel('alcohol')
    plt.ylabel('Color Intensity')
    plt.legend()

<IPython.core.display.Javascript object>

In [5]:
# b). fit a multinomial logistic model
# Features and target shared by every model fitted in this notebook.
Xvals = dta[['alco', 'malic', 'tot_phen', 'color_int']].values
yvals = dta['cultivar'].values

k = 4
clf_mlog = KFold(n_splits=k, random_state=22, shuffle=True)

# Candidate inverse-regularisation strengths: 0.01, 0.02, ..., 2.00.
penalties = [(step + 1) / 100 for step in range(200)]

cv_errors = []
for penalty in penalties:
    fold_errors = []
    for train_index, test_index in clf_mlog.split(Xvals):
        LogReg = LogisticRegression(multi_class='multinomial', fit_intercept=True,
                                    solver='newton-cg', C=penalty)
        LogReg.fit(Xvals[train_index], yvals[train_index])
        y_pred = LogReg.predict(Xvals[test_index])
        # "MSE" here is the share of misclassified test observations in the fold.
        fold_errors.append((yvals[test_index] != y_pred).mean())
    # Average the k fold error rates for this value of C.
    cv_errors.append(np.array(fold_errors).mean())

MSE_logit = pd.DataFrame({'C': np.array(penalties), 'MSE': np.array(cv_errors)})
MSE_logit.sort_values(['MSE']).head(5)

Unnamed: 0,C,MSE
43,0.44,0.068182
44,0.45,0.068182
42,0.43,0.068182
41,0.42,0.068182
40,0.41,0.068182


In [9]:
# c). fit a random forest, scoring each setting by its out-of-bag (OOB) error
# Grid: min_samples_leaf in 5, 10, ..., 75; max_depth in 1..15;
# n_estimators in 50, 100, ..., 250.
# Results are collected as plain records and turned into a DataFrame once,
# rather than written through chained indexing (df[col][row] = value), which
# pandas does not guarantee to write back to the frame, and rather than
# growing the frame with pd.concat on every iteration.
rf_records = []
for i, j, tree in itertools.product(range(15), range(15), range(5)):
    n_estimators = tree * 50 + 50
    min_samples_leaf = i * 5 + 5
    max_depth = j + 1
    rf = RandomForestClassifier(n_estimators=n_estimators,
                                min_samples_leaf=min_samples_leaf,
                                max_depth=max_depth, bootstrap=True,
                                oob_score=True, random_state=22)
    rf.fit(Xvals, yvals)
    rf_records.append({"n_estimator": n_estimators,
                       "min_sample": min_samples_leaf,
                       "max_depth": max_depth,
                       # OOB error rate = 1 - OOB accuracy.
                       "MSE": 1 - rf.oob_score_})

MSE_df = pd.DataFrame(rf_records,
                      columns=["n_estimator", "min_sample", "max_depth", "MSE"])

In [10]:
# Renumber the rows 0..n-1, then list the five settings with the lowest error.
MSE_df = MSE_df.reset_index(drop=True)
MSE_df.sort_values(by=['MSE']).head(5)

Unnamed: 0,MSE,max_depth,min_sample,n_estimator
96,0.068182,5.0,10.0,100.0
126,0.068182,11.0,10.0,100.0
116,0.068182,9.0,10.0,100.0
131,0.068182,12.0,10.0,100.0
111,0.068182,8.0,10.0,100.0


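The lowest out-of-bag error rate (reported as MSE) is 0.068, reached with min_sample = 10 and n_estimator = 100. Several max_depth values tie at this minimum (5, 8, 9, 11 and 12 in the rows shown), so max_depth = 5 is the shallowest of the tied settings listed.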

In [11]:
# d). fit the SVM
k = 4
clf_svm = KFold(n_splits=k, random_state=22, shuffle=True)
# The splitter is seeded with an integer, so every call to split() yields the
# same folds; build them once instead of once per grid point.
svm_folds = list(clf_svm.split(Xvals))

# Grid: gamma and C each take the values 0.05, 0.10, ..., 4.00.
# Results are collected as plain records and turned into a DataFrame once,
# rather than written through chained indexing (df[col][row] = value), which
# pandas does not guarantee to write back to the frame.
svm_records = []
for g, c in itertools.product(range(80), range(80)):
    gamma = g/20 + 0.05
    cost = c/20 + 0.05
    fold_errors = []
    for train_index, test_index in svm_folds:
        svc = SVC(kernel='rbf', gamma=gamma, C=cost)
        svc.fit(Xvals[train_index], yvals[train_index])
        y_pred = svc.predict(Xvals[test_index])
        # "MSE" here is the share of misclassified test observations in the fold.
        fold_errors.append((yvals[test_index] != y_pred).mean())
    svm_records.append({'Cost': cost,
                        'Gamma': gamma,
                        'MSE': np.array(fold_errors).mean()})

MSE_SVM = pd.DataFrame(svm_records, columns=['Cost', 'Gamma', 'MSE'])
MSE_SVM.sort_values(['MSE']).head(5)

Unnamed: 0,Cost,Gamma,MSE
2659,1.0,1.7,0.045455
2578,0.95,1.65,0.045455
2658,0.95,1.7,0.045455
224,3.25,0.15,0.051136
225,3.3,0.15,0.051136


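The minimum MSE (cross-validated misclassification rate) is 0.045, obtained at C = 1 and Gamma = 1.7. Two other settings tie at the same value: (C = 0.95, Gamma = 1.65) and (C = 0.95, Gamma = 1.7).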

In [12]:
# e). neural network
k = 4
clf_mlp = KFold(n_splits=k, random_state=22, shuffle=True)
# The splitter is seeded with an integer, so every call to split() yields the
# same folds; build them once instead of once per grid point.
mlp_folds = list(clf_mlp.split(Xvals))

activ = ('identity', 'logistic', 'tanh', 'relu')

# Grid: 4 activations x hidden layer sizes 50, 100, ..., 400 x alpha 0.05, 0.10, ..., 1.00.
# Results are collected as plain records and turned into a DataFrame once.
# This avoids chained-indexing writes (df[col][row] = value) and avoids storing
# activation names in a column that was initialised with float zeros.
mlp_records = []
for activation, h, al in itertools.product(activ, range(8), range(20)):
    hidden_units = 50 * (h + 1)
    alpha = al/20 + 0.05
    fold_errors = []
    for train_index, test_index in mlp_folds:
        mlp = MLPClassifier(activation=activation, solver='lbfgs',
                            alpha=alpha, random_state=25,
                            hidden_layer_sizes=(hidden_units,))
        mlp.fit(Xvals[train_index], yvals[train_index])
        y_pred = mlp.predict(Xvals[test_index])
        # "MSE" here is the share of misclassified test observations in the fold.
        fold_errors.append((yvals[test_index] != y_pred).mean())
    mlp_records.append({'activation': activation,
                        'hidden layer': hidden_units,
                        'alpha': alpha,
                        'MSE': np.array(fold_errors).mean()})

MSE_mlp = pd.DataFrame(mlp_records,
                       columns=['activation', 'hidden layer', 'alpha', 'MSE'])
MSE_mlp.sort_values(['MSE']).head(5)

Unnamed: 0,MSE,activation,alpha,hidden layer
532,0.034091,relu,0.65,150.0
585,0.039773,relu,0.3,300.0
562,0.039773,relu,0.15,250.0
553,0.039773,relu,0.7,200.0
551,0.039773,relu,0.6,200.0


With activation = 'relu', alpha = 0.65 and hidden_layer_sizes = 150, the MLP model attains the lowest MSE (cross-validated misclassification rate), 0.034091.

In [17]:
# f). Compare the five lowest error rates found for each model family
top_n = 5
logit, RF, SVM, mlp = (
    np.array(results.sort_values(['MSE'])['MSE'].head(top_n))
    for results in (MSE_logit, MSE_df, MSE_SVM, MSE_mlp)
)

# One column per model; row i holds that model's (i+1)-th smallest error rate.
Compare = pd.DataFrame({'Logit': logit,
                        'RF': RF,
                        'SVM': SVM,
                        'MLP': mlp})
Compare

Unnamed: 0,Logit,MLP,RF,SVM
0,0.068182,0.034091,0.068182,0.045455
1,0.068182,0.039773,0.068182,0.045455
2,0.068182,0.039773,0.068182,0.045455
3,0.068182,0.039773,0.068182,0.051136
4,0.068182,0.039773,0.068182,0.051136


From the table above we can see that the five smallest MSE values for the MLP model are all lower than those of the other three models. Hence, we conclude that the MLP is the best model in this case.