# Homework #3: PCA/Hyperparameter/CV
Data source: http://archive.ics.uci.edu/ml/datasets/Polish+companies+bankruptcy+data

In [1]:
from scipy.io import arff
import pandas as pd
import numpy as np

In [2]:
data = arff.loadarff('4year.arff')
df = pd.DataFrame(data[0])

In [3]:
df['bankruptcy'] = (df['class']==b'1')
df.drop(columns=['class'], inplace=True)
df.columns = ['X{0:02d}'.format(k) for k in range(1,65)] + ['bankruptcy']

In [4]:
for i in range(len(df['bankruptcy'])):
    df.loc[i, 'bankruptcy'] = 0 if df.loc[i,'bankruptcy'] == False else 1

In [5]:
df.describe()

Unnamed: 0,X01,X02,X03,X04,X05,X06,X07,X08,X09,X10,...,X55,X56,X57,X58,X59,X60,X61,X62,X63,X64
count,9791.0,9791.0,9791.0,9749.0,9771.0,9791.0,9791.0,9773.0,9792.0,9791.0,...,9792.0,9771.0,9791.0,9776.0,9791.0,9178.0,9760.0,9771.0,9749.0,9561.0
mean,0.043019,0.596404,0.130959,8.1366,64.65164,-0.059273,0.059446,19.884016,1.882296,0.38904,...,7686.33,-0.992263,0.035022,1.133287,0.856053,118.156064,25.19443,2015.157,8.660813,35.949619
std,0.359321,4.587122,4.559074,290.647281,14759.39,6.812754,0.533344,698.697015,17.67465,4.590299,...,76052.61,77.007971,8.945365,8.038201,26.393305,3230.316692,1099.260821,117146.1,60.838202,483.318623
min,-12.458,0.0,-445.91,-0.045319,-379460.0,-486.82,-12.458,-1.8482,-0.032371,-445.91,...,-713220.0,-7522.1,-597.42,-30.892,-284.38,0.0,-12.656,-14965.0,-0.02439,-1.5e-05
25%,0.001321,0.263145,0.020377,1.047,-51.217,-0.000578,0.003004,0.4283,1.006675,0.29444,...,21.84,0.003121,0.008768,0.885722,0.0,5.356325,4.2677,43.234,2.9388,2.0129
50%,0.041364,0.46774,0.19929,1.5918,-0.055576,0.0,0.04882,1.0887,1.1613,0.51045,...,950.33,0.043679,0.098026,0.958305,0.002129,9.482,6.28355,74.729,4.8489,4.0416
75%,0.11113,0.689255,0.41067,2.8804,55.732,0.065322,0.12694,2.691,1.970225,0.71429,...,4694.55,0.11717,0.24268,0.996163,0.21179,19.506,9.9382,123.345,8.3638,9.4135
max,20.482,446.91,22.769,27146.0,1034100.0,322.2,38.618,53209.0,1704.8,12.602,...,6123700.0,112.02,226.76,668.75,1661.0,251570.0,108000.0,10779000.0,5662.4,21153.0


In [6]:
sum(df.bankruptcy == True)

515

In [7]:
df.fillna(df.mean(), inplace=True)
df.isna().sum()
X_imp = df.values

In [8]:
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

X, y = X_imp[:, :-1], X_imp[:, -1]

X_train, X_test, y_train, y_test =\
    train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

In [9]:
import sklearn.preprocessing as skpre

stdsc = skpre.StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
print(X_train_std.shape)
X_test_std = stdsc.transform(X_test)
print(X_test_std.shape)

(6854, 64)
(2938, 64)


## Extract 3 features using PCA method

In [10]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [11]:
pca = PCA(n_components=3)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

### LR

In [12]:
lr = LogisticRegression(C=0.01, random_state=0, penalty='l1')
lr.fit(X_train_pca, y_train)
print('Training accuracy: %.4f' % lr.score(X_train_pca, y_train))
print('Test accuracy: %.4f' % lr.score(X_test_pca, y_test))

Training accuracy: 0.9473
Test accuracy: 0.9462


### SVM

In [13]:
svm = SVC(C=0.01, random_state=0, kernel='rbf')
svm.fit(X_train_pca,y_train)
print('Training accuracy: %.4f' % svm.score(X_train_pca, y_train))
print('Test accuracy: %.4f'% svm.score(X_test_pca, y_test))

Training accuracy: 0.9475
Test accuracy: 0.9472


### Decision Tree

In [14]:
tree = DecisionTreeClassifier(criterion='gini', max_depth=2)
tree.fit(X_train_pca,y_train)
print('Training accuracy: %.4f' % tree.score(X_train_pca, y_train))
print('Test accuracy: %.4f' % tree.score(X_test_pca, y_test))

Training accuracy: 0.9475
Test accuracy: 0.9472


## PCA using pipeline


### LR

In [15]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe_lr = make_pipeline(StandardScaler(), PCA(n_components=3), 
                        LogisticRegression(C=0.01, random_state=0, penalty='l1'))
pipe_lr.fit(X_train, y_train)
print('Training accuracy: %.4f' % pipe_lr.score(X_train, y_train))
print('Test accuracy: %.4f' % pipe_lr.score(X_test, y_test))

Training accuracy: 0.9473
Test accuracy: 0.9462


### SVM

In [16]:
pipe_svm = make_pipeline(StandardScaler(), PCA(n_components=3),
                         SVC(C=0.01, random_state=0, kernel='rbf'))
pipe_svm.fit(X_train, y_train)
print('Training accuracy: %.4f' % pipe_svm.score(X_train, y_train))
print('Test accuracy: %.4f' % pipe_svm.score(X_test, y_test))

Training accuracy: 0.9475
Test accuracy: 0.9472


### Decision Tree

In [17]:
pipe_tree = make_pipeline(StandardScaler(), PCA(n_components=3),
                          DecisionTreeClassifier(criterion='gini', max_depth=2))
pipe_tree.fit(X_train, y_train)
print('Training accuracy: %.4f' % pipe_tree.score(X_train, y_train))
print('Test accuracy: %.4f' % pipe_tree.score(X_test, y_test))

Training accuracy: 0.9475
Test accuracy: 0.9472


## 5 fold Cross-Validation

In [18]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

### LR

In [19]:
kfold = StratifiedKFold(n_splits=5).split(X_train, y_train)
pipe_lr_cv = make_pipeline(StandardScaler(), PCA(n_components=3), 
                        LogisticRegression(C=0.01, random_state=0, penalty='l1'))
score_lr_cv = []
for k, (train, test) in enumerate(kfold):
    pipe_lr_cv.fit(X_train[train], y_train[train])
    score = pipe_lr_cv.score(X_train[test], y_train[test])
    score_lr_cv.append(score)

print('CV accuracy of LR:', score_lr_cv)

CV accuracy of LR: [0.9474835886214442, 0.9474835886214442, 0.9474835886214442, 0.9474835886214442, 0.9467153284671533]


In [20]:
cv_score_lr = cross_val_score(estimator=pipe_lr, X=X_train, y=y_train, cv=5)
print('CV accuracy of LR:', cv_score_lr)

CV accuracy of LR: [0.94748359 0.94748359 0.94748359 0.94748359 0.94671533]


### SVM

In [21]:
kfold = StratifiedKFold(n_splits=5).split(X_train, y_train)
pipe_svm_cv = make_pipeline(StandardScaler(), PCA(n_components=3),
                         SVC(C=0.01, random_state=0, kernel='rbf'))
score_svm_cv = []
for k, (train, test) in enumerate(kfold):
    pipe_svm_cv.fit(X_train[train], y_train[train])
    score = pipe_svm_cv.score(X_train[test], y_train[test])
    score_svm_cv.append(score)
print('CV accuracy of SVM:', score_svm_cv)

CV accuracy of SVM: [0.9474835886214442, 0.9474835886214442, 0.9474835886214442, 0.9474835886214442, 0.9474452554744526]


In [22]:
cv_score_svm = cross_val_score(estimator=pipe_svm, X=X_train, y=y_train, cv=5)
print('CV accuracy of SVM:', cv_score_svm)

CV accuracy of SVM: [0.94748359 0.94748359 0.94748359 0.94748359 0.94744526]


### Decision Tree

In [23]:
kfold = StratifiedKFold(n_splits=5).split(X_train, y_train)
pipe_tree_cv = make_pipeline(StandardScaler(), PCA(n_components=3),
                          DecisionTreeClassifier(criterion='gini', max_depth=2))
score_tree_cv = []
for k, (train, test) in enumerate(kfold):
    pipe_tree_cv.fit(X_train[train], y_train[train])
    score = pipe_tree_cv.score(X_train[test], y_train[test])
    score_tree_cv.append(score)
print('CV accuracy of Decision Tree:', score_tree_cv)

CV accuracy of Decision Tree: [0.9474835886214442, 0.9474835886214442, 0.9474835886214442, 0.9474835886214442, 0.9474452554744526]


In [24]:
cv_score_tree = cross_val_score(estimator=pipe_tree, X=X_train, y=y_train, cv=5)
print('CV accuracy of Decision Tree:', cv_score_tree)

CV accuracy of Decision Tree: [0.94748359 0.94748359 0.94748359 0.94748359 0.94744526]


## Grid search

### LR

In [25]:
from sklearn.model_selection import GridSearchCV

para_range = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
para_grid = [{'logisticregression__C': para_range, 
              'logisticregression__penalty': ['l1','l2']}]
gs_lr = GridSearchCV(estimator=pipe_lr, param_grid=para_grid, scoring='accuracy', cv=5)
gs_lr = gs_lr.fit(X_train, y_train)

In [26]:
print('%.4f' % gs_lr.best_score_)

0.9475


In [27]:
print(gs_lr.best_params_)

{'logisticregression__C': 0.0001, 'logisticregression__penalty': 'l1'}


### SVM

In [None]:
para_range = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
# para_grid = [{'svc__C': para_range, 'svc__kernel': ['linear', 'rbf']}]
para_grid = [{'svc__C': para_range, 'svc__kernel': ['linear']}]
gs_svm = GridSearchCV(estimator=pipe_svm, param_grid=para_grid, scoring='accuracy', cv=5)
gs_svm = gs_svm.fit(X_train, y_train)

In [None]:
print('%.4f' % gs_svm.best_score_)

In [None]:
print(gs_svm.best_params_)

### Decision Tree

In [28]:
para_depth = np.arange(1, 10)
para_grid = [{'decisiontreeclassifier__max_depth': para_depth,
               'decisiontreeclassifier__criterion': ['gini', 'entropy']}]
gs_tree = GridSearchCV(estimator=pipe_tree, param_grid=para_grid, scoring='accuracy', cv=5)
gs_tree = gs_tree.fit(X_train, y_train)

In [29]:
print('%.4f' % gs_tree.best_score_)

0.9475


In [30]:
print(gs_tree.best_params_)

{'decisiontreeclassifier__criterion': 'gini', 'decisiontreeclassifier__max_depth': 1}
