In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Auto data set; '?' marks a missing value in the raw file, so it is
# parsed as NaN and every row containing a NaN is dropped.
auto = (
    pd.read_csv('Auto.csv', na_values='?')
    .dropna()
)

In [3]:
# Binary response: mpg_high is 1 when a car's mpg is strictly above the sample
# median and 0 otherwise. The median is also stored as a constant column.
auto['mpg_median'] = auto['mpg'].median()
auto['mpg_high'] = (auto['mpg'] > auto['mpg_median']).astype('int64')
auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name,mpg_median,mpg_high
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu,22.75,0
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320,22.75,0
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite,22.75,0
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst,22.75,0
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino,22.75,0


# (a) Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report

In [5]:
# Design matrix and response as plain NumPy arrays.
feature_cols = ['cylinders', 'displacement', 'horsepower', 'weight',
                'acceleration', 'year', 'origin']
Xvals = auto[feature_cols].values
yvals = auto['mpg_high'].values

# Shuffled k-fold splitter with a fixed seed so the folds are reproducible.
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=15)
kf.get_n_splits(Xvals)

# One slot per fold: error rate among predicted 0s, among predicted 1s, and
# the overall misclassification rate.
Logit_err_0, Logit_err_1, Logit_MSE_VEC = (np.zeros(k) for _ in range(3))

In [6]:
# k-fold cross-validation of a logistic regression.
# For every fold we store:
#   Logit_MSE_VEC[k_ind] - overall misclassification rate on the test fold
#                          (for 0/1 labels this equals the MSE)
#   Logit_err_0[k_ind]   - share of wrong predictions among test rows predicted 0
#   Logit_err_1[k_ind]   - share of wrong predictions among test rows predicted 1
for k_ind, (train_index, test_index) in enumerate(kf.split(Xvals)):
    print('If k index=', k_ind + 1)
    X_train, X_test = Xvals[train_index], Xvals[test_index]
    y_train, y_test = yvals[train_index], yvals[test_index]
    LogReg = LogisticRegression(fit_intercept=True)
    LogReg.fit(X_train, y_train)
    y_pred = LogReg.predict(X_test)
    err = y_test != y_pred
    Logit_MSE_VEC[k_ind] = err.mean()
    # If a class is never predicted in this fold its error rate is undefined;
    # record NaN explicitly instead of dividing 0 by 0 (which also raises a
    # RuntimeWarning).
    n_pred_0 = (y_pred == 0).sum()
    n_pred_1 = (y_pred == 1).sum()
    Logit_err_0[k_ind] = ((y_pred == 0) & err).sum() / n_pred_0 if n_pred_0 else np.nan
    Logit_err_1[k_ind] = ((y_pred == 1) & err).sum() / n_pred_1 if n_pred_1 else np.nan
    print(classification_report(y_test, y_pred))
    print('The error rate(category 0) is', Logit_err_0[k_ind])
    print('The error rate(category 1) is', Logit_err_1[k_ind])
    print('The MSE is', Logit_MSE_VEC[k_ind])
print('\n k-Fold Reults:')
# nanmean skips folds whose per-class rate is undefined, so one such fold does
# not turn the whole average into NaN.
print('The average error rate(category 0) is', np.nanmean(Logit_err_0))
print('The average error rate(category 1) is', np.nanmean(Logit_err_1))
print('The average MSE of the model is', Logit_MSE_VEC.mean())

If k index= 1
             precision    recall  f1-score   support

          0       0.94      0.89      0.92        55
          1       0.87      0.93      0.90        43

avg / total       0.91      0.91      0.91        98

The error rate(category 0) is 0.057692307692307696
The error rate(category 1) is 0.13043478260869565
The MSE is 0.09183673469387756
If k index= 2
             precision    recall  f1-score   support

          0       0.88      0.91      0.90        47
          1       0.92      0.88      0.90        51

avg / total       0.90      0.90      0.90        98

The error rate(category 0) is 0.12244897959183673
The error rate(category 1) is 0.08163265306122448
The MSE is 0.10204081632653061
If k index= 3
             precision    recall  f1-score   support

          0       0.85      0.87      0.86        45
          1       0.88      0.87      0.88        53

avg / total       0.87      0.87      0.87        98

The error rate(category 0) is 0.15217391304347827


# (b) Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
# Random forest scored with out-of-bag (OOB) predictions instead of k-fold CV.
RF = RandomForestClassifier(
    n_estimators=20,
    max_features=2,
    bootstrap=True,
    oob_score=True,
    random_state=25,
)
RF.fit(Xvals, yvals)

# Second column of the OOB decision function, thresholded at 0.5 below to
# obtain a hard 0/1 prediction per observation.
oob_prediction = RF.oob_decision_function_[:, 1]
MSE_RF = pd.DataFrame({'pred': oob_prediction, 'yvals': yvals})
MSE_RF['pred'] = (MSE_RF['pred'] >= 0.5).astype('int64')
print(classification_report(MSE_RF['yvals'], MSE_RF['pred']))

# 1 where the OOB prediction disagrees with the true label, 0 otherwise.
error = MSE_RF['pred'].ne(MSE_RF['yvals']).astype('int64')

# Error rate within each predicted class: mistakes / number of rows predicted
# as that class.
pred_is_0 = MSE_RF['pred'].eq(0)
pred_is_1 = MSE_RF['pred'].eq(1)
RF_err_0 = error[pred_is_0].sum() / pred_is_0.sum()
RF_err_1 = error[pred_is_1].sum() / pred_is_1.sum()
print('The error rate(category 0) is', RF_err_0)
print('The error rate(category 1) is', RF_err_1)
print('The MSE of the model is', error.mean())

             precision    recall  f1-score   support

          0       0.94      0.91      0.93       196
          1       0.92      0.94      0.93       196

avg / total       0.93      0.93      0.93       392

The error rate(category 0) is 0.05789473684210526
The error rate(category 1) is 0.08415841584158416
The MSE of the model is 0.07142857142857142


# (c) SVM

In [9]:
from sklearn import svm

In [10]:
# k-fold cross-validation of an RBF-kernel SVM, reusing the splitter `kf`.
# For every fold we store:
#   SVC_MSE_VEC[k_ind] - overall misclassification rate on the test fold
#                        (for 0/1 labels this equals the MSE)
#   SVC_err_0[k_ind]   - share of wrong predictions among test rows predicted 0
#   SVC_err_1[k_ind]   - share of wrong predictions among test rows predicted 1
#
# NOTE(review): Xvals is fed to the RBF kernel unscaled. scikit-learn's SVM
# guide recommends standardizing features because SVMs are not scale
# invariant; consider wrapping the model in a StandardScaler pipeline and
# re-checking the results.
SVC_err_0 = np.zeros(k)
SVC_err_1 = np.zeros(k)
SVC_MSE_VEC = np.zeros(k)
for k_ind, (train_index, test_index) in enumerate(kf.split(Xvals)):
    print('\nIf k index=', k_ind + 1)
    X_train, X_test = Xvals[train_index], Xvals[test_index]
    y_train, y_test = yvals[train_index], yvals[test_index]
    svc = svm.SVC(kernel='rbf', gamma=0.2, C=1)
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)
    err = y_test != y_pred
    SVC_MSE_VEC[k_ind] = err.mean()
    # If a class is never predicted in this fold its error rate is undefined;
    # record NaN explicitly instead of dividing 0 by 0 (which also raises a
    # RuntimeWarning).
    n_pred_0 = (y_pred == 0).sum()
    n_pred_1 = (y_pred == 1).sum()
    SVC_err_0[k_ind] = ((y_pred == 0) & err).sum() / n_pred_0 if n_pred_0 else np.nan
    SVC_err_1[k_ind] = ((y_pred == 1) & err).sum() / n_pred_1 if n_pred_1 else np.nan
    print(classification_report(y_test, y_pred, digits=3))
    print('The error rate(category 0) is', SVC_err_0[k_ind])
    print('The error rate(category 1) is', SVC_err_1[k_ind])
    print('The MSE is', SVC_MSE_VEC[k_ind])
print('\nk-Fold Reults:')
# Average only over folds where the per-class rate is defined. Both classes
# are treated the same way, and the per-fold arrays keep one slot per fold
# (undefined folds stay NaN) instead of being overwritten by a filtered copy.
print('The average error rate(category 0) is', np.nanmean(SVC_err_0))
print('The average error rate(category 1) is', np.nanmean(SVC_err_1))
print('The average MSE of the model is', SVC_MSE_VEC.mean())


If k index= 1
             precision    recall  f1-score   support

          0      1.000     0.036     0.070        55
          1      0.448     1.000     0.619        43

avg / total      0.758     0.459     0.311        98

The error rate(category 0) is 0.0
The error rate(category 1) is 0.5520833333333334
The MSE is 0.5408163265306123

If k index= 2
             precision    recall  f1-score   support

          0      0.480     1.000     0.648        47
          1      0.000     0.000     0.000        51

avg / total      0.230     0.480     0.311        98

The error rate(category 0) is 0.5204081632653061
The error rate(category 1) is nan
The MSE is 0.5204081632653061

If k index= 3
             precision    recall  f1-score   support

          0      0.469     1.000     0.638        45
          1      1.000     0.038     0.073        53

avg / total      0.756     0.480     0.332        98

The error rate(category 0) is 0.53125
The error rate(category 1) is 0.0
The MSE is 0

  app.launch_new_instance()
  'precision', 'predicted', average, warn_for)


#### Note: in fold k=2 the SVM predicts class 0 for every test observation, so the error rate for category 1 is undefined (0/0 = NaN) in that fold. To avoid this problem, I dropped that fold and averaged the category-1 error rate over the remaining 3 folds only.

# (d) Comparison

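According to the results above, the random forest has the lowest error rates for both categories and the lowest MSE of the three models, so I would say it is the best fit for this dataset. SVM is the worst-fitting model, while the logit model is in between. I would assume the reason is that our feature variables are a mixture of categorical and continuous features, and there are potentially some outliers in the data, which might heavily influence the performance of the SVM. Another likely factor is that the features were not standardized before fitting: an RBF-kernel SVM is sensitive to feature scale, and variables such as weight (in the thousands) dominate the kernel distance.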