# Problem Set #2
MACS 30250, Dr. Evans  
Minghao Yang

## 1. Parallel versus serial computation of a bootstrapped cross-validation (10 points)
### (a)

In [1]:
# Import the necessary libraries
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn import neighbors, linear_model
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
import time
import dask
from dask import delayed, compute

In [2]:
# Read the Auto data set into a dataframe; '?' entries are parsed as missing values
df1 = pd.read_csv(filepath_or_buffer='data/Auto.csv', na_values=['?'])
df1.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [3]:
# Count the missing entries in each column
df1.isna().sum(axis=0)

mpg             0
cylinders       0
displacement    0
horsepower      5
weight          0
acceleration    0
year            0
origin          0
name            0
dtype: int64

In [4]:
# Only five horsepower values are missing, so removing every row that
# contains a missing entry discards very little information
df1.dropna(axis=0, how='any', inplace=True)

In [5]:
# Display the sample median of mpg
print(df1.loc[:, 'mpg'].median())

22.75


In [6]:
# Indicator for mpg: 1 when mpg is at or above the sample median, 0 otherwise
df1['mpg_high'] = df1['mpg'].ge(df1['mpg'].median()).astype('int32')

In [7]:
# Indicator columns for origin: orgn1 marks origin == 1, orgn2 marks origin == 2
df1['orgn1'] = df1['origin'].eq(1).astype('int32')
df1['orgn2'] = df1['origin'].eq(2).astype('int32')

In [8]:
# Dependent variable: the high-mpg indicator
y = df1.loc[:, 'mpg_high']
# Independent variables: vehicle characteristics plus the two origin indicators
X = df1.loc[:, ['cylinders', 'displacement', 'horsepower', 'weight',
                'acceleration', 'year', 'orgn1', 'orgn2']]

In [9]:
# Serial computation
# Draw the 100 distinct split seeds from a seeded generator so the whole
# experiment reproduces exactly after "Restart Kernel -> Run All".
seeds = np.random.RandomState(25).choice(1000, 100, replace=False)
err_rate_seri = np.zeros(100)

# Only the split / fit / predict / score work is timed. Printing is done after
# the timer stops so that console I/O does not inflate the serial timing.
start_time_ser = time.perf_counter()
for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=seeds[i])
    LogReg = LogisticRegression(solver='lbfgs', max_iter=5000, n_jobs=None)
    results = LogReg.fit(X_train, y_train)
    y_pred = results.predict(X_test)
    con_mat = confusion_matrix(y_test, y_pred)
    # Misclassified = everything off the diagonal. Written as (total - trace)
    # so it also works when the matrix is not 2x2 (e.g. a split whose test set
    # and predictions contain a single class); identical to
    # con_mat[0][1] + con_mat[1][0] in the 2x2 case.
    err_rate = (con_mat.sum() - con_mat.trace()) / con_mat.sum()
    err_rate_seri[i] = err_rate
end_time_ser = time.perf_counter()

# Report the per-bootstrap error rates, their average, and the elapsed time
for i in range(100):
    print("The error rate for bootstrap {} is {:.3f}%".format(i + 1, 100 * err_rate_seri[i]))
print("The average error rate from serial computation is {:.3f}%".format(100 * err_rate_seri.mean()))
print("The serial computation time is {:.3f} s".format(end_time_ser - start_time_ser))

The error rate for bootstrap 1 is 11.594%
The error rate for bootstrap 2 is 9.420%
The error rate for bootstrap 3 is 11.594%
The error rate for bootstrap 4 is 7.246%
The error rate for bootstrap 5 is 15.217%
The error rate for bootstrap 6 is 10.870%
The error rate for bootstrap 7 is 10.145%
The error rate for bootstrap 8 is 13.768%
The error rate for bootstrap 9 is 10.870%
The error rate for bootstrap 10 is 7.246%
The error rate for bootstrap 11 is 9.420%
The error rate for bootstrap 12 is 15.942%
The error rate for bootstrap 13 is 10.145%
The error rate for bootstrap 14 is 10.870%
The error rate for bootstrap 15 is 14.493%
The error rate for bootstrap 16 is 11.594%
The error rate for bootstrap 17 is 10.870%
The error rate for bootstrap 18 is 10.145%
The error rate for bootstrap 19 is 9.420%
The error rate for bootstrap 20 is 10.145%
The error rate for bootstrap 21 is 10.145%
The error rate for bootstrap 22 is 8.696%
The error rate for bootstrap 23 is 8.696%
The error rate for bootstra

### (b)

In [10]:
# Write a function as required by the problem
def err_rate_calculator(boot_num, seed, X, y):
    """Fit a logistic regression on one random train/test split and score it.

    The data are split with ``train_test_split`` (35% held out for testing),
    a ``LogisticRegression`` (lbfgs solver, up to 5000 iterations) is fitted
    on the training part, and the share of misclassified test observations
    is computed from the confusion matrix.

    Parameters
    ----------
    boot_num :
        Identifier of this bootstrap draw. It is not used in the computation
        and is returned unchanged so the caller can label the result.
    seed :
        Value passed as ``random_state`` to ``train_test_split``.
    X :
        Feature data passed to ``train_test_split``.
    y :
        Labels passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(boot_num, err_rate)`` where ``err_rate`` is the fraction of test
        observations that were misclassified.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=seed)
    LogReg = LogisticRegression(solver='lbfgs', max_iter=5000)
    results = LogReg.fit(X_train, y_train)
    y_pred = results.predict(X_test)
    con_mat = confusion_matrix(y_test, y_pred)
    # Misclassified = all off-diagonal counts. Using (total - trace) instead of
    # con_mat[0][1] + con_mat[1][0] gives the same value for a 2x2 matrix, and
    # stays correct when the matrix is 1x1 (single class in the test split)
    # or larger than 2x2 (more than two classes).
    total = con_mat.sum()
    err_rate = (total - con_mat.trace()) / total
    return boot_num, err_rate

In [11]:
# Parallel computation: queue the 100 bootstrap fits lazily with dask.delayed,
# then evaluate them with the multiprocessing scheduler on 4 workers
start_time_par = time.perf_counter()
err_rate_par_lazy = [
    delayed(err_rate_calculator)(i + 1, seeds[i], X, y)
    for i in range(100)
]
err_rate_par_tuple = compute(
    *err_rate_par_lazy,
    scheduler=dask.multiprocessing.get,
    num_workers=4,
)
end_time_par = time.perf_counter()

# Each result is a (bootstrap number, error rate) pair; index them by bootstrap number
err_rate_par_dict = dict(err_rate_par_tuple)
for i in range(100):
    print("The error rate for bootstrap {} is {:.3f}%".format(i + 1, 100 * err_rate_par_dict[i + 1]))

avg_err_rate_par = np.mean(list(err_rate_par_dict.values()))
elapsed_par = end_time_par - start_time_par
print("The average error rate from parallel computation is {:.3f}%".format(100 * avg_err_rate_par))
print("The parallel computation time is {:.3f} s".format(elapsed_par))

The error rate for bootstrap 1 is 11.594%
The error rate for bootstrap 2 is 9.420%
The error rate for bootstrap 3 is 11.594%
The error rate for bootstrap 4 is 7.246%
The error rate for bootstrap 5 is 15.217%
The error rate for bootstrap 6 is 10.870%
The error rate for bootstrap 7 is 10.145%
The error rate for bootstrap 8 is 13.768%
The error rate for bootstrap 9 is 10.870%
The error rate for bootstrap 10 is 7.246%
The error rate for bootstrap 11 is 9.420%
The error rate for bootstrap 12 is 15.942%
The error rate for bootstrap 13 is 10.145%
The error rate for bootstrap 14 is 10.870%
The error rate for bootstrap 15 is 14.493%
The error rate for bootstrap 16 is 11.594%
The error rate for bootstrap 17 is 10.870%
The error rate for bootstrap 18 is 10.145%
The error rate for bootstrap 19 is 9.420%
The error rate for bootstrap 20 is 10.145%
The error rate for bootstrap 21 is 10.145%
The error rate for bootstrap 22 is 8.696%
The error rate for bootstrap 23 is 8.696%
The error rate for bootstra