In [1]:
# 1. Classifier "horse" race
import pandas as pd

df = pd.read_csv("auto.csv", na_values = "?")

df.isna().sum()
df = df.dropna()
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
year            0
origin          0
name            0
dtype: int64

In [2]:
df = df.drop(columns = ["name"])

cutoff = df["mpg"].median()

def mpg_trans(col):
    mpg = col
    if mpg >= cutoff:
        return 1
    else:
        return 0
    
df["mpg_high"] = df["mpg"].apply(mpg_trans)

In [3]:
# (a)
df["horsepower"] = df["horsepower"].astype("int")
df["mpg_high"] = df["mpg_high"].astype("category")
Xvars = df[["cylinders", "displacement", "horsepower", "weight", "acceleration", "year", "origin"]].values
yvals = df[["mpg_high"]].values

In [4]:
import numpy as np

from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

k = 4
kf = KFold(n_splits = k, shuffle = True, random_state = 15)
kf.get_n_splits(Xvars)

MSE_vec_kf = np.zeros(k)
error_rate_0 = np.zeros(k)
error_rate_1 = np.zeros(k)

k_ind = int(0)

for train_index, test_index in kf.split(Xvars):
    
    X_train, X_test = Xvars[train_index], Xvars[test_index]
    y_train, y_test = yvals[train_index], yvals[test_index]
    
    LogReg = LogisticRegression(fit_intercept = True)
    LogReg.fit(X_train, y_train)
    y_pred = LogReg.predict(X_test)
    
    MSE_vec_kf[k_ind] = ((y_test - y_pred) ** 2).mean()
    
    y_test.shape = (len(y_test), )
    temp = pd.DataFrame({"y_test": list(y_test), "y_pred": list(y_pred)})
    error_0 = temp[(temp["y_pred"] == 0) & (temp["y_pred"] != temp["y_test"])]
    error_1 = temp[(temp["y_pred"] == 1) & (temp["y_pred"] != temp["y_test"])]
    
    error_rate_0[k_ind] = len(error_0)/len(temp[(temp["y_pred"] == 0)])
    error_rate_1[k_ind] = len(error_1)/len(temp[(temp["y_pred"] == 1)])
    
    print("k index =", k_ind)
    print("MSE for test set", k_ind, "is", MSE_vec_kf[k_ind])
    print("")
    print("Error rate for category 0 in test set", k_ind, "is", error_rate_0[k_ind])
    print("Error rate for category 1 in test set", k_ind, "is", error_rate_1[k_ind]) 
    print("")
    print(classification_report(y_test, y_pred, digits = 4))
    
    k_ind += 1

print("Average MSE k-fold =", MSE_vec_kf.mean())
print("Average MSE standard err =", MSE_vec_kf.std())
print("Average error rate for category 0 =", error_rate_0.mean())
print("Average error rate for category 0 standard err =", error_rate_0.std())
print("Average error rate for category 1 =", error_rate_1.mean())
print("Average error rate for category 1 standard err =", error_rate_1.std())

k index = 0
MSE for test set 0 is 0.49625156184922947

Error rate for category 0 in test set 0 is 0.057692307692307696
Error rate for category 1 in test set 0 is 0.13043478260869565

             precision    recall  f1-score   support

          0     0.9423    0.8909    0.9159        55
          1     0.8696    0.9302    0.8989        43

avg / total     0.9104    0.9082    0.9084        98

k index = 1
MSE for test set 1 is 0.5

Error rate for category 0 in test set 1 is 0.12244897959183673
Error rate for category 1 in test set 1 is 0.08163265306122448

             precision    recall  f1-score   support

          0     0.8776    0.9149    0.8958        47
          1     0.9184    0.8824    0.9000        51

avg / total     0.8988    0.8980    0.8980        98

k index = 2
MSE for test set 2 is 0.4975010412328197

Error rate for category 0 in test set 2 is 0.15217391304347827
Error rate for category 1 in test set 2 is 0.11538461538461539

             precision    recall  f1-sco

  y = column_or_1d(y, warn=True)


In [5]:
# (b)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

yvals.shape = (len(yvals), )
RandClass = RandomForestClassifier(n_estimators = 20, max_features = 2, random_state = 25,\
                                   bootstrap = True, oob_score = True)
RandClass.fit(Xvars, yvals)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=True, random_state=25, verbose=0, warm_start=False)

In [6]:
oob_prediction = RandClass.oob_decision_function_.T[1]
MSE_RandClass = pd.DataFrame({"pred": oob_prediction, "yvals": yvals})
MSE_RandClass["pred"] = MSE_RandClass["pred"].apply(lambda x: 1 if x >= 0.5 else 0)

def calculate_MSE(df):
    return mean_squared_error(df["yvals"], df["pred"])

MSE = calculate_MSE(MSE_RandClass)

MSE_RandClass_0 = MSE_RandClass[MSE_RandClass["pred"] < 0.5]
MSE_0 = calculate_MSE(MSE_RandClass_0)

MSE_RandClass_1 = MSE_RandClass[MSE_RandClass["pred"] >= 0.5]
MSE_1 = calculate_MSE(MSE_RandClass_1)

print('The MSE of the model is', MSE)
print('The error rate for category 0 is', MSE_0)
print('The error rate for category 1 is', MSE_1)

The MSE of the model is 0.07142857142857142
The error rate for category 0 is 0.05789473684210526
The error rate for category 1 is 0.08415841584158416


In [7]:
# (c)
from sklearn import svm

k = 4
kf = KFold(n_splits = k, shuffle = True, random_state = 15)
kf.get_n_splits(Xvars)

MSE_vec_kf_svm = np.zeros(k)
error_rate_0_svm = np.zeros(k)
error_rate_1_svm = np.zeros(k)

k_ind = int(0)

for train_index, test_index in kf.split(Xvars):
    
    X_train, X_test = Xvars[train_index], Xvars[test_index]
    y_train, y_test = yvals[train_index], yvals[test_index]
    
    svc = svm.SVC(kernel = "rbf", C = 1, gamma = 0.2)
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)
    
    MSE_vec_kf_svm[k_ind] = ((y_test - y_pred) ** 2).mean()
    
    y_test.shape = (len(y_test), )
    temp = pd.DataFrame({"y_test": list(y_test), "y_pred": list(y_pred)})
    
    error_0 = temp[(temp["y_pred"] == 0) & (temp["y_pred"] != temp["y_test"])]
    error_1 = temp[(temp["y_pred"] == 1) & (temp["y_pred"] != temp["y_test"])]
        
    try:
        error_rate_0_svm[k_ind] = len(error_0)/len(temp[(temp["y_pred"] == 0)])
    except:
        error_rate_0_svm[k_ind] = np.inf
        
    try:
        error_rate_1_svm[k_ind] = len(error_1)/len(temp[(temp["y_pred"] == 1)])
    except:
        error_rate_1_svm[k_ind] = np.inf
    
    print("k index =", k_ind)
    print("MSE for test set", k_ind, "is", MSE_vec_kf[k_ind])
    print("")
    print("Error rate for category 0 in test set", k_ind, "is", error_rate_0_svm[k_ind])
    print("Error rate for category 1 in test set", k_ind, "is", error_rate_1_svm[k_ind]) 
    print("")
    print(classification_report(y_test, y_pred, digits = 4))
    
    k_ind += 1

print("Average MSE k-fold =", MSE_vec_kf_svm.mean())
print("Average MSE standard err =", MSE_vec_kf_svm.std())
print("")

print("Average error rate for category 0 =", error_rate_0_svm.mean())
print("Average error rate for category 0 standard err =", error_rate_0_svm.std())
print("")

error_rate_1_svm = error_rate_1_svm[ error_rate_1_svm < 1]
print("Average error rate for category 1 =", error_rate_1_svm.mean())
print("Average error rate for category 1 standard err =", error_rate_1_svm.std())

k index = 0
MSE for test set 0 is 0.49625156184922947

Error rate for category 0 in test set 0 is 0.0
Error rate for category 1 in test set 0 is 0.5520833333333334

             precision    recall  f1-score   support

          0     1.0000    0.0364    0.0702        55
          1     0.4479    1.0000    0.6187        43

avg / total     0.7578    0.4592    0.3109        98

k index = 1
MSE for test set 1 is 0.5

Error rate for category 0 in test set 1 is 0.5204081632653061
Error rate for category 1 in test set 1 is inf

             precision    recall  f1-score   support

          0     0.4796    1.0000    0.6483        47
          1     0.0000    0.0000    0.0000        51

avg / total     0.2300    0.4796    0.3109        98

k index = 2
MSE for test set 2 is 0.4975010412328197

Error rate for category 0 in test set 2 is 0.53125
Error rate for category 1 in test set 2 is 0.0

             precision    recall  f1-score   support

          0     0.4688    1.0000    0.6383       

  'precision', 'predicted', average, warn_for)


In [8]:
# (d)
print("LogisticRegression:", "MSE mean:",  MSE_vec_kf.mean())
print("LogisticRegression:", "error rate mean for category 0:",  error_rate_0.mean())
print("LogisticRegression:", "error rate mean for category 1:",  error_rate_1.mean())
print("")

print("RandomForest:", "MSE mean:",  MSE)
print("RandomForest:", "error rate mean for category 0:",  MSE_0)
print("RandomForest:", "error rate mean for category 1:",  MSE_1)
print("")

print("SupportVectorMachines:", "MSE mean:",  MSE_vec_kf_svm.mean())
print("SupportVectorMachines:", "error rate mean for category 0:",  error_rate_0_svm.mean())
print("SupportVectorMachines:", "error rate mean for category 1:",  error_rate_1_svm.mean())
print("")

LogisticRegression: MSE mean: 0.4984381507705123
LogisticRegression: error rate mean for category 0: 0.09470670705864986
LogisticRegression: error rate mean for category 1: 0.11822664912727025

RandomForest: MSE mean: 0.07142857142857142
RandomForest: error rate mean for category 0: 0.05789473684210526
RandomForest: error rate mean for category 1: 0.08415841584158416

SupportVectorMachines: MSE mean: 0.5076530612244898
SupportVectorMachines: error rate mean for category 0: 0.38119411070879966
SupportVectorMachines: error rate mean for category 1: 0.1840277777777778



From the comparison above, the random forest model has the best result, including the the lowest MSE, the lowest error rate for category 0 as well as the lowest error rate for category 1. The second best model is logistic regression model, and the support vector machines model is the worst model.

Based on the calculation above, I think that the random forest model is the best predictor of mpg_high, although we should keep in mind that the random forest model is the only one which doesn't use k-fold cross validation.