# Homework 3: PCA/Hyperparameter/CV

In [1]:
from scipy.io import arff
import pandas as pd
import numpy as np

In [2]:
# Read the ARFF file; element 0 of the returned pair holds the records.
data = arff.loadarff('4year.arff')
df = pd.DataFrame(data[0])

# Turn the raw byte-string 'class' column into a boolean label: the column is
# removed and its comparison against b'1' is appended as 'bankruptcy'.
df['bankruptcy'] = df.pop('class') == b'1'

# Rename the 64 attribute columns to X01..X64; the label keeps its name.
df.columns = list(map('X{0:02d}'.format, range(1, 65))) + ['bankruptcy']

In [3]:
# Fill in NA values with the mean of the corresponding column.
# NOTE(review): the imputer is fitted on every row (and on the label column)
# before the train/test split below, so column means include test rows --
# consider fitting it on the training split only.
from sklearn.impute import SimpleImputer

imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
imp_mean.fit(df.values)
X_imp = imp_mean.transform(df.values)

# Preview of the (un-imputed) DataFrame.
df.head()

Unnamed: 0,X01,X02,X03,X04,X05,X06,X07,X08,X09,X10,...,X56,X57,X58,X59,X60,X61,X62,X63,X64,bankruptcy
0,0.15929,0.4624,0.07773,1.1683,-44.853,0.46702,0.18948,0.82895,1.1223,0.3833,...,0.10899,0.41557,0.89101,0.001422,7.7928,4.9914,119.81,3.0465,3.056,False
1,-0.12743,0.46243,0.26917,1.7517,7.597,0.000925,-0.12743,1.1625,1.2944,0.53757,...,-0.089372,-0.23704,1.0625,0.15041,5.4327,3.4629,100.97,3.615,3.4725,False
2,0.070488,0.2357,0.52781,3.2393,125.68,0.16367,0.086895,2.8718,1.0574,0.67689,...,0.054286,0.10413,0.94571,0.0,7.107,3.3808,76.076,4.7978,4.7818,False
3,0.13676,0.40538,0.31543,1.8705,19.115,0.50497,0.13676,1.4539,1.1144,0.58938,...,0.10263,0.23203,0.89737,0.073024,6.1384,4.2241,88.299,4.1337,4.6484,False
4,-0.11008,0.69793,0.18878,1.2713,-15.344,0.0,-0.11008,0.43282,1.735,0.30207,...,0.43988,-0.3644,0.57153,0.0,18.801,2.7925,146.39,2.4934,15.036,False


In [4]:
# Split the imputed array into features / label, then into train / test sets.
from sklearn.model_selection import train_test_split

x = X_imp[:, :-1]   # every column except the last one: the features
y = X_imp[:, -1]    # last column: the bankruptcy label

# Hold out 30% of the rows for testing; stratify so both splits keep the
# same label proportions.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

In [5]:
# Standardize the features using statistics learned from the training split.
from sklearn.preprocessing import StandardScaler

stdsc = StandardScaler().fit(x_train)
X_train_std = stdsc.transform(x_train)
# The test split is scaled with the training statistics (no refit).
X_test_std = stdsc.transform(x_test)

# Extract 3 features using PCA method.

In [6]:
from sklearn.decomposition import PCA

# Keep the first three principal components.
# random_state is pinned because svd_solver='auto' can select the randomized
# solver depending on the input shape and scikit-learn version; without a
# seed the projection would then vary slightly from run to run.
pca = PCA(n_components=3, random_state=1)

# Fit the projection on the training split only; the test split reuses it.
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

# Share of the total variance explained by each retained component.
pca.explained_variance_ratio_


array([0.17982618, 0.10829193, 0.07797324])

# Apply LR / SVM / decision tree, implementing each method as a pipeline.

In [7]:
#LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Pipeline: standardize -> 3-component PCA -> L1-penalised logistic regression.
# solver='liblinear' is given explicitly because the default solver of recent
# scikit-learn releases ('lbfgs') does not support penalty='l1' and raises.
# random_state is pinned on the stochastic steps for reproducible scores.
pipe_lr = make_pipeline(
    StandardScaler(),
    PCA(n_components=3, random_state=1),
    LogisticRegression(penalty='l1', solver='liblinear', random_state=1),
)
pipe_lr.fit(x_train, y_train)

# Mean accuracy on the data the pipeline was fitted on vs. the held-out split.
print('Training accuracy:%.3f' % pipe_lr.score(x_train, y_train))
print('Test accuracy:%.3f' % pipe_lr.score(x_test, y_test))

Training accuracy:0.947
Test accuracy:0.946




In [8]:
#SVM
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

# Pipeline: standardize -> 3-component PCA -> SVM with a sigmoid kernel.
# Stored as pipe_svc so the name describes the estimator it holds.
# PCA's random_state is pinned so repeated runs give the same projection.
pipe_svc = make_pipeline(
    StandardScaler(),
    PCA(n_components=3, random_state=1),
    SVC(kernel='sigmoid', C=1, random_state=1),
)
pipe_svc.fit(x_train, y_train)

# Mean accuracy on the data the pipeline was fitted on vs. the held-out split.
print('Training accuracy:%.3f' % pipe_svc.score(x_train, y_train))
print('Test accuracy:%.3f' % pipe_svc.score(x_test, y_test))



Training accuracy:0.920
Test accuracy:0.912


In [9]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Pipeline: standardize -> 3-component PCA -> depth-4 Gini decision tree.
# Stored as pipe_tree so the name describes the estimator it holds.
# PCA's random_state is pinned so repeated runs give the same projection.
pipe_tree = make_pipeline(
    StandardScaler(),
    PCA(n_components=3, random_state=1),
    DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=1),
)
pipe_tree.fit(x_train, y_train)

# Mean accuracy on the data the pipeline was fitted on vs. the held-out split.
print('Training accuracy:%.3f' % pipe_tree.score(x_train, y_train))
print('Test accuracy:%.3f' % pipe_tree.score(x_test, y_test))

Training accuracy:0.948
Test accuracy:0.946


# Use grid search to find optimal hyperparameters applying 10-fold cross-validation.

In [10]:
#LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# solver='liblinear' handles both penalties searched below; the default solver
# of recent scikit-learn releases ('lbfgs') rejects penalty='l1', which would
# make every l1 candidate fail during the search.
# random_state is pinned on the stochastic steps for reproducible CV scores.
pipe_lr = make_pipeline(StandardScaler(),
                        PCA(n_components=3, random_state=1),
                        LogisticRegression(solver='liblinear', random_state=1))

# Inverse regularisation strengths to try, on a log scale.
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

# Two sub-grids: every C value combined with an l2 penalty, then with l1.
param_grid = [{'logisticregression__C': param_range, 'logisticregression__penalty': ['l2']},
              {'logisticregression__C': param_range, 'logisticregression__penalty': ['l1']}]

# Exhaustive search scored by accuracy with 10-fold cross-validation,
# using all available cores.
# NOTE(review): the recorded scores of every model in this notebook sit
# around 0.947; if that matches the majority-class share, accuracy is not an
# informative metric here -- TODO confirm the class balance and consider a
# different `scoring`.
gs = GridSearchCV(estimator=pipe_lr,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1)
gs = gs.fit(x_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.9474759264662971
{'logisticregression__C': 0.0001, 'logisticregression__penalty': 'l1'}




In [11]:
#SVM
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Pipeline: standardize -> 3-component PCA -> SVM.
# PCA's random_state is pinned so every CV fold is reproducible even when
# svd_solver='auto' resolves to the randomized solver.
pipe_svc = make_pipeline(StandardScaler(),
                         PCA(n_components=3, random_state=1),
                         SVC(random_state=1))

# Candidate values, on a log scale, shared by C and (for rbf) gamma.
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

# Two sub-grids: the sigmoid kernel varies C only; the rbf kernel varies
# both C and gamma.
param_grid = [{'svc__C': param_range,
               'svc__kernel': ['sigmoid']},
              {'svc__C': param_range,
               'svc__gamma': param_range,
               'svc__kernel': ['rbf']}]

# Exhaustive search scored by accuracy with 10-fold cross-validation,
# using all available cores.
gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1)
gs = gs.fit(x_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.9476218266705574
{'svc__C': 1.0, 'svc__gamma': 10.0, 'svc__kernel': 'rbf'}


In [12]:
# Decision Tree
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Pipeline: standardize -> 3-component PCA -> Gini decision tree.
# The max_depth given here is only a starting value; the grid search below
# overrides it for every candidate.
# PCA's random_state is pinned so every CV fold is reproducible.
pipe_tree = make_pipeline(StandardScaler(),
                          PCA(n_components=3, random_state=1),
                          DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=1))

# Tree depths to try.
param_range = [2, 3, 4, 5, 6, 7]

# Reuse param_range instead of repeating the literal list, so the two
# cannot drift apart.
param_grid = [{'decisiontreeclassifier__max_depth': param_range}]

# Exhaustive search scored by accuracy with 10-fold cross-validation,
# using all available cores.
gs = GridSearchCV(estimator=pipe_tree,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1)
gs = gs.fit(x_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.9474759264662971
{'decisiontreeclassifier__max_depth': 2}


In [13]:
# Helper cell: list every parameter name the LR pipeline exposes. These are
# the '<step>__<param>' keys that can be used in a GridSearchCV param_grid.
from sklearn.model_selection import GridSearchCV

pipe_lr = make_pipeline(
    StandardScaler(),
    PCA(n_components=3),
    LogisticRegression(),
)
pipe_lr.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'standardscaler', 'pca', 'logisticregression', 'standardscaler__copy', 'standardscaler__with_mean', 'standardscaler__with_std', 'pca__copy', 'pca__iterated_power', 'pca__n_components', 'pca__random_state', 'pca__svd_solver', 'pca__tol', 'pca__whiten', 'logisticregression__C', 'logisticregression__class_weight', 'logisticregression__dual', 'logisticregression__fit_intercept', 'logisticregression__intercept_scaling', 'logisticregression__l1_ratio', 'logisticregression__max_iter', 'logisticregression__multi_class', 'logisticregression__n_jobs', 'logisticregression__penalty', 'logisticregression__random_state', 'logisticregression__solver', 'logisticregression__tol', 'logisticregression__verbose', 'logisticregression__warm_start'])

In [1]:
import numpy as np

In [None]:
# Scratch cell (never executed): a bare reference to the numpy.random module.
# It assigns nothing and nothing else in the notebook uses it.
np.random