In [1]:
#Core modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
#modules used in part a
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, mean_squared_error #partb
#modules used in part b
from sklearn.ensemble import RandomForestClassifier
#modules used in part c
from sklearn import svm

In [2]:
# Read the Auto data set. '?' is parsed as a missing value; rows containing any
# missing value are dropped, and so is the 'name' column.
auto = pd.read_csv('Auto.csv', na_values='?').dropna().drop(['name'], axis=1)
# Binary target: 1 when mpg is at or above the sample median, 0 otherwise.
# The comparison is vectorized so the median is computed once, not once per row
# as it was inside the previous apply(lambda ...).
auto['mpg_high'] = (auto['mpg'] >= auto['mpg'].median()).astype(int)
auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,mpg_high
0,18.0,8,307.0,130.0,3504,12.0,70,1,0
1,15.0,8,350.0,165.0,3693,11.5,70,1,0
2,18.0,8,318.0,150.0,3436,11.0,70,1,0
3,16.0,8,304.0,150.0,3433,12.0,70,1,0
4,17.0,8,302.0,140.0,3449,10.5,70,1,0


In [3]:
# Target vector: the 0/1 mpg_high indicator as a NumPy array.
y = auto.loc[:, 'mpg_high'].values
# Feature matrix: the seven predictors. mpg is left out because mpg_high is
# derived directly from it.
X = auto.loc[:, ['cylinders', 'displacement', 'horsepower', 'weight',
                 'acceleration', 'year', 'origin']].values

## (a) Logit regression with k-fold cross-validation

In [4]:
# K-fold cross-validation setup for the logistic regression.
k = 4  # number of folds
# n_splits is taken from k so the splitter and the result arrays cannot disagree.
# shuffle + fixed random_state gives a reproducible random partition.
kf_log = KFold(n_splits=k, shuffle=True, random_state=15)
k_ind = 0  # fold counter, starting at the first fold
# One slot per fold for each reported quantity.
a_Error0 = np.zeros(k)  # error rate among observations predicted as 0
a_Error1 = np.zeros(k)  # error rate among observations predicted as 1
a_MSE = np.zeros(k)     # overall misclassification rate

In [5]:
# Fit and evaluate a logistic regression on every cross-validation fold.
#
# The result arrays are (re)allocated here and the fold index comes from
# enumerate(), so this cell gives the same answer each time it is run,
# whatever k_ind or the arrays held beforehand.
n_folds = kf_log.get_n_splits()
a_Error0 = np.zeros(n_folds)
a_Error1 = np.zeros(n_folds)
a_MSE = np.zeros(n_folds)
for k_ind, (train_index, test_index) in enumerate(kf_log.split(X)):
    print('k index=', k_ind)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    LogReg = LogisticRegression()
    LogReg.fit(X_train, y_train)
    y_pred = LogReg.predict(X_test)
    # One row per test observation: 0/1 misclassification flag and predicted label.
    fold_errors = pd.DataFrame({'error': (y_test != y_pred).astype(int),
                                'y_pred': y_pred})
    # Per-category error rates are conditional on the *predicted* label,
    # i.e. the share of wrong predictions among those predicted as 0 (resp. 1).
    a_Error0[k_ind] = fold_errors.loc[fold_errors['y_pred'] == 0, 'error'].mean()
    a_Error1[k_ind] = fold_errors.loc[fold_errors['y_pred'] == 1, 'error'].mean()
    # With 0/1 labels the MSE equals the overall misclassification rate.
    a_MSE[k_ind] = fold_errors['error'].mean()
    print('\n',classification_report(y_test,y_pred,digits=3))
    print('Error rate for category 0 is {}'.format(a_Error0[k_ind]))
    print('Error rate for category 1 is {}'.format(a_Error1[k_ind]))
    print('MSE for test set {} is {}\n'.format(k_ind, a_MSE[k_ind]))
print('\nk-fold results:')
print('Average error rate (category 0) is {}, std is {}'.format(a_Error0.mean(),a_Error0.std()))
print('Average error rate (category 1) is {}, std is {}'.format(a_Error1.mean(),a_Error1.std()))
print('Average MSE for this model is {}, std is {}'.format(a_MSE.mean(),a_MSE.std()))

k index= 0

              precision    recall  f1-score   support

          0      0.942     0.891     0.916        55
          1      0.870     0.930     0.899        43

avg / total      0.910     0.908     0.908        98

Error rate for category 0 is 0.057692307692307696
Error rate for category 1 is 0.13043478260869565
MSE for test set 0 is 0.09183673469387756

k index= 1

              precision    recall  f1-score   support

          0      0.878     0.915     0.896        47
          1      0.918     0.882     0.900        51

avg / total      0.899     0.898     0.898        98

Error rate for category 0 is 0.12244897959183673
Error rate for category 1 is 0.08163265306122448
MSE for test set 1 is 0.10204081632653061

k index= 2

              precision    recall  f1-score   support

          0      0.848     0.867     0.857        45
          1      0.885     0.868     0.876        53

avg / total      0.868     0.867     0.867        98

Error rate for category 0 is 0.15

## (b) Random Forest

Please see the following discussion for more details of this assignment:

https://github.com/UC-MACSS/persp-model_W18/issues/267

In [6]:
# Random forest evaluated on its out-of-bag (OOB) predictions rather than on
# a held-out fold.
RF = RandomForestClassifier(n_estimators=20, max_features=2, bootstrap=True,
                            oob_score=True, random_state=25)
RF.fit(X, y)
# Second column of the OOB decision function = estimated probability of class 1
# (labels are 0/1 and scikit-learn orders the columns by sorted class label).
oob_prediction_ = RF.oob_decision_function_[:, 1]
# With only 20 trees an observation can land in every bootstrap sample; it then
# has no OOB prediction and scikit-learn reports NaN for it. Exclude such rows
# explicitly instead of letting `nan >= 0.5` silently count as a class-0 prediction.
has_oob = ~np.isnan(oob_prediction_)
if not has_oob.all():
    print('{} observation(s) without an out-of-bag prediction were excluded'.format((~has_oob).sum()))
# Threshold the probability at 0.5 to obtain the predicted 0/1 label.
MSE_RF = pd.DataFrame({'pred': (oob_prediction_[has_oob] >= 0.5).astype(int),
                       'yvals': y[has_oob]})
# With 0/1 labels the MSE equals the misclassification rate.
b_MSE = mean_squared_error(MSE_RF['yvals'], MSE_RF['pred'])
# Per-category error rates are conditional on the *predicted* label.
MSE_RF_0 = MSE_RF[MSE_RF['pred'] == 0]
b_Error0 = mean_squared_error(MSE_RF_0['yvals'], MSE_RF_0['pred'])
MSE_RF_1 = MSE_RF[MSE_RF['pred'] == 1]
b_Error1 = mean_squared_error(MSE_RF_1['yvals'], MSE_RF_1['pred'])
print('Error rate (category 0) is {}'.format(b_Error0))
print('Error rate (category 1) is {}'.format(b_Error1))
print('MSE of the model is {}'.format(b_MSE))

Error rate (category 0) is 0.05789473684210526
Error rate (category 1) is 0.08415841584158416
MSE of the model is 0.07142857142857142


## (c) SVC

The SVC sometimes fits so poorly that, within a given fold, it never predicts one of the two categories at all. In that case the error rate for the category that is never predicted is undefined (NaN). Be careful when interpreting the results.

In [7]:
# K-fold cross-validation setup for the support vector classifier.
k = 4  # number of folds
# n_splits is taken from k so the splitter and the result arrays cannot disagree.
# Same shuffle/random_state settings as the logit splitter.
kf_svm = KFold(n_splits=k, shuffle=True, random_state=15)
k_ind = 0  # fold counter, starting at the first fold
# One slot per fold for each reported quantity.
c_Error0 = np.zeros(k)  # error rate among observations predicted as 0
c_Error1 = np.zeros(k)  # error rate among observations predicted as 1
c_MSE = np.zeros(k)     # overall misclassification rate

In [8]:
# Fit and evaluate an RBF-kernel SVC on every cross-validation fold.
#
# The result arrays are (re)allocated here and the fold index comes from
# enumerate(). The cell is therefore safe to re-run even though c_Error0 and
# c_Error1 are shortened at the end (NaN folds removed).
#
# NOTE(review): the features are fed to the SVC unscaled. RBF kernels are
# sensitive to feature scale, which likely explains the near-constant
# predictions below -- consider standardizing X (e.g. StandardScaler) if the
# assignment permits it.
n_folds = kf_svm.get_n_splits()
c_Error0 = np.full(n_folds, np.nan)
c_Error1 = np.full(n_folds, np.nan)
c_MSE = np.zeros(n_folds)
for k_ind, (train_index, test_index) in enumerate(kf_svm.split(X)):
    print('k index=', k_ind)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    svc = svm.SVC(kernel='rbf', gamma=0.2, C=1)
    svc.fit(X_train,y_train)
    y_pred = svc.predict(X_test)
    # One row per test observation: 0/1 misclassification flag and predicted label.
    fold_errors = pd.DataFrame({'error': (y_test != y_pred).astype(int),
                                'y_pred': y_pred})
    # Per-category error rates are conditional on the *predicted* label. When a
    # label is never predicted in a fold, the subset is empty and its mean is NaN.
    c_Error0[k_ind] = fold_errors.loc[fold_errors['y_pred'] == 0, 'error'].mean()
    c_Error1[k_ind] = fold_errors.loc[fold_errors['y_pred'] == 1, 'error'].mean()
    # With 0/1 labels the MSE equals the overall misclassification rate.
    c_MSE[k_ind] = fold_errors['error'].mean()
    # classification_report emits a warning when a label is never predicted
    # (its precision is undefined).
    print('\n',classification_report(y_test,y_pred,digits=3))
    if np.isnan(c_Error0[k_ind]):
        print('Error rate for category 0 is nan (since all predictions are 1)')
    else:
        print('Error rate for category 0 is {}'.format(c_Error0[k_ind]))
    if np.isnan(c_Error1[k_ind]):
        print('Error rate for this category is nan (since all predictions are 0)')
    else:
        print('Error rate for category 1 is {}'.format(c_Error1[k_ind]))
    print('MSE for test set {} is {}\n'.format(k_ind, c_MSE[k_ind]))
# Keep only folds where the category was predicted at least once. The averages
# below (and in the comparison table) are therefore taken over possibly fewer
# than n_folds values.
c_Error0 = c_Error0[~np.isnan(c_Error0)]
c_Error1 = c_Error1[~np.isnan(c_Error1)]
print('\nk-fold results:')
print('Average error rate (category 0) is {}, std is {}'.format(c_Error0.mean(),c_Error0.std()))
print('Average error rate (category 1) is {}, std is {}'.format(c_Error1.mean(),c_Error1.std()))
print('Average MSE for this model is {}, std is {}'.format(c_MSE.mean(),c_MSE.std()))

k index= 0

              precision    recall  f1-score   support

          0      1.000     0.036     0.070        55
          1      0.448     1.000     0.619        43

avg / total      0.758     0.459     0.311        98

Error rate for category 0 is 0.0
Error rate for category 1 is 0.5520833333333334
MSE for test set 0 is 0.5408163265306123

k index= 1

              precision    recall  f1-score   support

          0      0.480     1.000     0.648        47
          1      0.000     0.000     0.000        51

avg / total      0.230     0.480     0.311        98

Error rate for category 0 is 0.5204081632653061
Error rate for this category is nan (since all predictions are 0)
MSE for test set 1 is 0.5204081632653061

k index= 2

              precision    recall  f1-score   support

          0      0.469     1.000     0.638        45
          1      1.000     0.038     0.073        53

avg / total      0.756     0.480     0.332        98

Error rate for category 0 is 0.53125


  'precision', 'predicted', average, warn_for)


## (d) Comparison

In [9]:
# Side-by-side summary: one row per metric, one column per model.
# Logit and SVC entries are averages over folds; the RF entries are single
# out-of-bag estimates.
pd.DataFrame(
    [[a_MSE.mean(), b_MSE.mean(), c_MSE.mean()],
     [a_Error0.mean(), b_Error0.mean(), c_Error0.mean()],
     [a_Error1.mean(), b_Error1.mean(), c_Error1.mean()]],
    index=['MSE', 'Category 0', 'Category 1'],
    columns=['Logit', 'RF', 'SVC'])

Unnamed: 0,Logit,RF,SVC
MSE,0.107143,0.071429,0.507653
Category 0,0.094707,0.057895,0.381194
Category 1,0.118227,0.084158,0.184028


##### Random Forest Classifier seems to be the best model

Overall, the Random Forest Classifier (RF) yields the lowest MSE and the lowest error rate within each category. The support vector classifier (SVC) yields an incredibly high MSE and high error rates in each category. The result of the logit regression is reasonable, but not as good as that of the Random Forest Classifier.

##### But this is an unfair comparison since we are using different validation methods to calculate the MSE

One thing to note, however, is that in part (b) the MSE was computed from the random forest's out-of-bag (bootstrap) predictions, whereas in parts (a) and (c) it was computed with k-fold cross-validation. As we discussed in class, bootstrapping is very likely to yield a lower MSE than k-fold cross-validation, so it is unfair to argue that the logit regression is de facto worse than the Random Forest Classifier.

##### Therefore the random forest classifier is not necessarily better than the logit regression model

Finally, we reach the conclusion that the Random Forest Classifier is likely, but not necessarily, better than the logit regression. More analysis should be done to compare these two methods (for example, using bootstrapping to calculate the MSE of the logit model).