**Zhiyu FU**

In [1]:
import numpy as np
from sklearn import svm
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
from sklearn.metrics import classification_report,mean_squared_error, precision_score

In [3]:
# Load the Auto data set; "?" entries are parsed as missing values (NaN).
auto = pd.read_csv("Auto.csv", na_values=["?"])

In [4]:
# Keep only complete rows: a row is removed if any of its columns is missing.
auto = auto.dropna(axis=0, how="any")

In [5]:
# Binary target: True when a car's mpg is at or above the sample median.
mpg_median = auto["mpg"].median()
auto.loc[:, "mpg_high"] = auto["mpg"] >= mpg_median

In [6]:
# Feature matrix: every column from "cylinders" through "origin"
# (label-based slicing with .loc includes both end points).
feature_frame = auto.loc[:, "cylinders":"origin"]
X = feature_frame.values
# Target vector: the boolean mpg_high indicator encoded as 0/1 integers.
y = auto["mpg_high"].astype("int").values

In [7]:
# Fix NumPy's global random state so stochastic steps are reproducible.
np.random.seed(seed=15)

# Q1

In [8]:
# Logistic regression classifier created with no arguments, i.e. with whatever
# default settings the installed scikit-learn version provides.
lr = LogisticRegression()

In [9]:
# Shuffled 4-fold splitter with a fixed seed so the folds are reproducible.
kf_log = KFold(random_state=15, shuffle=True, n_splits=4)

In [10]:
# Buffer for the pooled cross-validation predictions. It starts as a copy of y
# and is overwritten fold by fold with the held-out predictions.
y_est = y.copy()

In [11]:
# 4-fold cross-validation of the logistic regression. For every fold this
# prints precision/recall/F1, the MSE and the per-class error rate, where the
# error rate of class c is the share of wrong predictions among the test
# observations that were predicted as c.
n_folds = kf_log.get_n_splits()
# Rows are classes (0, 1), columns are folds. NaN marks "class never predicted
# in this fold", where the error rate is undefined, instead of dividing by zero.
log_error = np.full((2, n_folds), np.nan)
for ind, (train_index, test_index) in enumerate(kf_log.split(X), start=1):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    print("k = {}".format(ind))
    print(classification_report(y_test, y_pred))
    print("MSE = {}".format(mean_squared_error(y_test, y_pred)))
    error = y_pred != y_test
    for label in (0, 1):
        predicted_as_label = y_pred == label
        # Only compute the rate when the class was predicted at least once.
        if predicted_as_label.any():
            log_error[label, ind - 1] = error[predicted_as_label].mean()
    error_0, error_1 = log_error[:, ind - 1]
    print("The error rate for class 0 is {}".format(error_0))
    print("The error rate for class 1 is {}".format(error_1))
    # Keep the held-out predictions for the pooled evaluation.
    y_est[test_index] = y_pred

k = 1
             precision    recall  f1-score   support

          0       0.94      0.89      0.92        55
          1       0.87      0.93      0.90        43

avg / total       0.91      0.91      0.91        98

MSE = 0.09183673469387756
The error rate for class 0 is 0.057692307692307696
The error rate for class 1 is 0.13043478260869565
k = 2
             precision    recall  f1-score   support

          0       0.88      0.91      0.90        47
          1       0.92      0.88      0.90        51

avg / total       0.90      0.90      0.90        98

MSE = 0.10204081632653061
The error rate for class 0 is 0.12244897959183673
The error rate for class 1 is 0.08163265306122448
k = 3
             precision    recall  f1-score   support

          0       0.85      0.87      0.86        45
          1       0.88      0.87      0.88        53

avg / total       0.87      0.87      0.87        98

MSE = 0.1326530612244898
The error rate for class 0 is 0.15217391304347827
The error

The precision, recall, f1-score and MSE for each test set are as above.

In [12]:
# Pooled report: each observation is scored with the prediction made while it
# was held out.
print(classification_report(y_true=y, y_pred=y_est))

             precision    recall  f1-score   support

          0       0.91      0.88      0.89       196
          1       0.88      0.91      0.89       196

avg / total       0.89      0.89      0.89       392



Combining all test sets: for mpg_high == 0, the precision is 0.91, the recall is 0.88, and the f1-score is 0.89; for mpg_high == 1, the precision is 0.88, the recall is 0.91, and the f1-score is 0.89.
On average, the precision, recall, and f1-score are all 0.89.

In [13]:
# With 0/1 labels the MSE equals the share of misclassified observations.
mean_squared_error(y_true=y, y_pred=y_est)

0.10714285714285714

The MSE across all test sets is 0.107.

In [14]:
print("The average error rate for class 0 and class 1 are as follows, respectively.")
# Average each class's error rate over the folds. nanmean skips folds in which
# the rate is undefined (NaN); with no NaN present it equals the plain mean.
np.nanmean(log_error, axis=1)

The average error rate for class 0 and class are as follows, respectively.


array([ 0.09470671,  0.11822665])

# Q2

In [15]:
# Random forest: 20 bootstrapped trees, 2 candidate features per split, with
# out-of-bag scoring switched on and a fixed seed.
forest = RandomForestClassifier(
    n_estimators=20,
    max_features=2,
    bootstrap=True,
    oob_score=True,
    random_state=25,
)

In [16]:
# Fit on the full sample, then classify each observation from its out-of-bag
# class scores (column 0 vs. column 1 of oob_decision_function_).
forest.fit(X, y)
oob_proba = forest.oob_decision_function_
# Non-strict comparison: a tie between the two columns is labelled 1.
y_est2 = (oob_proba[:, 1] >= oob_proba[:, 0]).astype('int')

In [17]:
# Classification report for the labels derived from the out-of-bag scores.
print(classification_report(y_true=y, y_pred=y_est2))

             precision    recall  f1-score   support

          0       0.94      0.91      0.93       196
          1       0.92      0.94      0.93       196

avg / total       0.93      0.93      0.93       392



For mpg_high == 0, the precision is 0.94, the recall is 0.91, and the f1-score is 0.93; for mpg_high == 1, the precision is 0.92, the recall is 0.94, and the f1-score is 0.93.
On average, the precision, recall, and f1-score are all 0.93.

In [18]:
# MSE of the out-of-bag labels against the observed 0/1 target.
mean_squared_error(y_true=y, y_pred=y_est2)

0.071428571428571425

The MSE based on the out-of-bag predictions is 0.071.

In [19]:
# Per-class error rate of the out-of-bag labels: among the observations
# predicted as class c, the share whose true label differs.
error = y_est2 != y
predicted_0 = y_est2 == 0
predicted_1 = y_est2 == 1
# Guard against a class that is never predicted: the rate is then undefined
# (NaN) rather than the result of a division by zero.
error_0 = error[predicted_0].mean() if predicted_0.any() else np.nan
error_1 = error[predicted_1].mean() if predicted_1.any() else np.nan

In [20]:
# Report the two error rates with parallel wording for both categories.
print("The error rate for the 0 category is", error_0)
print("The error rate for the 1 category is", error_1)

The error rate 0 category is 0.0578947368421
The error rate for the 1 category is 0.0841584158416


# Q3

In [21]:
# Support vector classifier with an RBF kernel, C = 1 and gamma = 0.2.
# NOTE(review): this model is fitted on X without any feature scaling; RBF
# kernels are scale-sensitive, which may explain near-chance results —
# TODO confirm whether standardising the features is permitted here.
s = SVC(C=1, gamma=0.2, kernel="rbf")

In [22]:
# 4-fold cross-validation of the SVM with a shuffled, seeded splitter. For each
# fold the per-class error rate is stored: among the test observations
# predicted as class c, the share whose true label differs.
kf_svm = KFold(n_splits=4, shuffle=True, random_state=15)
# Rows are classes (0, 1), columns are folds. NaN marks a fold in which a class
# was never predicted, so its rate is undefined; this avoids a division by zero.
log_error = np.full((2, kf_svm.get_n_splits()), np.nan)
# Integer buffer so the pooled predictions share the dtype of y.
y_est3 = np.zeros(y.shape, dtype=y.dtype)
for ind, (train_index, test_index) in enumerate(kf_svm.split(X)):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    s.fit(X_train, y_train)
    y_pred = s.predict(X_test)
    error = y_pred != y_test
    for label in (0, 1):
        predicted_as_label = y_pred == label
        # Only compute the rate when the class was predicted at least once.
        if predicted_as_label.any():
            log_error[label, ind] = error[predicted_as_label].mean()
    error_0, error_1 = log_error[:, ind]
    # Keep the held-out predictions for the pooled evaluation.
    y_est3[test_index] = y_pred

  if sys.path[0] == '':


In [23]:
# Pooled report over the held-out SVM predictions of all folds.
print(classification_report(y_true=y, y_pred=y_est3))

             precision    recall  f1-score   support

          0       0.49      0.73      0.59       196
          1       0.49      0.26      0.33       196

avg / total       0.49      0.49      0.46       392



For mpg_high == 0, the precision is 0.49, the recall is 0.73, and the f1-score is 0.59; for mpg_high == 1, the precision is 0.49, the recall is 0.26, and the f1-score is 0.33.
On average, the precision and recall are 0.49 and the f1-score is 0.46.

In [24]:
# MSE of the pooled held-out SVM predictions against the observed target.
mean_squared_error(y_true=y, y_pred=y_est3)

0.50765306122448983

The MSE across all test sets is 0.5076.

In [25]:
# Average each class's error rate over the folds, ignoring NaN entries.
summary_message = "The average error rate for class 0 and class 1 are as follows, respectively."
print(summary_message)
np.nanmean(a=log_error, axis=1)

The average error rate for class 0 and class 1 are as follows, respectively.


array([ 0.38119411,  0.18402778])

# Q4

The random forest classifier has the best predictive power. It outperforms the other two in all measures of prediction (precision, recall, f1-score, error rate, and mean squared error).