# Classification Mid-term Project: Wisconsin Breast Cancer
- Dataset from UCI repository
- https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Original)

## 0. Load libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.tools.plotting import scatter_matrix
from sklearn import cross_validation
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline

## 1. Load the dataset

In [None]:
# Remote location of the raw data file.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "breast-cancer-wisconsin/breast-cancer-wisconsin.data"
)
# Column labels applied to the file, in column order.
names = [
    'Code',
    'Clump-Thickness',
    'Cell-Size',
    'Cell-Shape',
    'Adhesion',
    'Single-Cell-Size',
    'Bare-Nuclei',
    'Chromatin',
    'Nucleoli',
    'Mitoses',
    'Class',
]
# Explicit column names are supplied, so the first line of the file is read as data.
dataset = pd.read_csv(url, names=names, header=None)

In [None]:
# Preview the first five rows of the freshly loaded data (head() defaults to 5 rows).
dataset.head()

## 2. Data preprocessing 

In [None]:
# '?' marks a missing value in 'Bare-Nuclei': turn it into NaN, then make the column numeric.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
dataset['Bare-Nuclei'] = dataset['Bare-Nuclei'].replace('?', np.nan).astype(float)
# Remove the 'Code' column, then drop every row that still has a missing value.
# The result is rebound instead of mutating the frame in place three times.
dataset = dataset.drop('Code', axis=1).dropna(axis=0, how='any')

In [None]:
# Preview the first five rows after preprocessing (head() defaults to 5 rows).
dataset.head()

## 3. Data summarization

In [None]:
# shape: (number of rows, number of columns) of the preprocessed dataset
print(dataset.shape)

In [None]:
# types: dtype of every column of the preprocessed dataset
print(dataset.dtypes)

In [None]:
# descriptions, shown with 3 decimal places
# The fully qualified key is used: the bare 'precision' pattern raises an OptionError
# on pandas versions that define more than one option whose name contains "precision".
pd.set_option('display.precision', 3)
pd.set_option('display.width', 200)
print(dataset.describe())

In [None]:
# class distribution: number of rows for each distinct value of 'Class'
print(dataset.groupby('Class').size())

In [None]:
# correlation: pairwise Pearson correlation between all columns
print(dataset.corr(method='pearson'))

## 4. Data visualization

In [None]:
# Set the default (width, height) for every figure pyplot creates from here on
plt.rcParams['figure.figsize'] = (15,15)

In [None]:
# histograms: one panel per column of the dataset
dataset.hist()
plt.show()

In [None]:
# Density estimate of every column, one panel each on a 4x3 grid with independent x axes.
dataset.plot.density(subplots=True, layout=(4, 3), sharex=False)
plt.show()

In [None]:
# Box-and-whisker plot of every column, one panel each on a 4x3 grid with independent axes.
dataset.plot.box(subplots=True, layout=(4, 3), sharex=False, sharey=False)
plt.show()

In [None]:
# scatter plot matrix: pairwise scatter plots of the dataset columns
scatter_matrix(dataset)
plt.show()

In [None]:
# correlation matrix drawn as a heat map
corr_matrix = dataset.corr()
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(corr_matrix, vmin=-1, vmax=1, interpolation='none')
fig.colorbar(cax)
# Tick positions and labels come from the matrix itself rather than from the global
# `names` list, which later cells rebind to model names; re-running this cell after
# those cells would otherwise mislabel the axes.
ticks = np.arange(len(corr_matrix.columns))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(corr_matrix.columns)
ax.set_yticklabels(corr_matrix.index)
plt.show()

## 5.  Preparation

In [None]:
# Split-out validation dataset
array = dataset.values
X = array[:, 0:9]  # first nine columns: the features
Y = array[:, 9]    # tenth column: 'Class'
validation_size = 0.20
seed = 15
# train_test_split from sklearn.model_selection replaces the deprecated
# sklearn.cross_validation version; the arguments are unchanged.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=validation_size, random_state=seed)

In [None]:
# Settings shared by the cross-validation cells that follow.
scoring = 'accuracy'              # metric passed to the CV helpers
num_folds = 10                    # number of CV folds
seed = 7                          # rebinds the seed used for the train/test split above
num_instances = X_train.shape[0]  # number of training rows

## 6. Learn models

### Base models

In [None]:
# Candidate classifiers, all with default hyper-parameters, as (label, estimator) pairs.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('RF', RandomForestClassifier()),
]

In [None]:
# Display the current list of (label, estimator) pairs.
models

In [None]:
# Score every model in `models` with k-fold cross-validation on the training split.
results = []
names = []
# sklearn.model_selection replaces the deprecated sklearn.cross_validation module.
# The folds are contiguous (shuffling is off), so no random_state is passed.
kfold = KFold(n_splits=num_folds)
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)  # per-fold scores, kept for the box plot
    names.append(name)
    msg = "[%s]\tmean: %f\tstd: %f" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Box plot of the per-fold scores collected in `results`, one box per entry of `names`.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()


### Base models with standardization

In [None]:
# Same six classifiers, each preceded by a StandardScaler step inside a Pipeline.
models = [
    ('ScaledLR', Pipeline(steps=[('Scaler', StandardScaler()), ('LR', LogisticRegression())])),
    ('ScaledLDA', Pipeline(steps=[('Scaler', StandardScaler()), ('LDA', LinearDiscriminantAnalysis())])),
    ('ScaledKNN', Pipeline(steps=[('Scaler', StandardScaler()), ('KNN', KNeighborsClassifier())])),
    ('ScaledCART', Pipeline(steps=[('Scaler', StandardScaler()), ('CART', DecisionTreeClassifier())])),
    ('ScaledNB', Pipeline(steps=[('Scaler', StandardScaler()), ('NB', GaussianNB())])),
    ('ScaledRF', Pipeline(steps=[('Scaler', StandardScaler()), ('RF', RandomForestClassifier())])),
]

In [None]:
# Display the current list of (label, pipeline) pairs.
models

In [None]:
# Score every pipeline in `models` with k-fold cross-validation on the training split.
results = []
names = []
# sklearn.model_selection replaces the deprecated sklearn.cross_validation module.
# The folds are contiguous (shuffling is off), so no random_state is passed.
kfold = KFold(n_splits=num_folds)
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)  # per-fold scores, kept for the box plot
    names.append(name)
    msg = "[%s]\tmean: %f\tstd: %f" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Box plot of the per-fold scores collected in `results`, one box per entry of `names`.
fig, ax = plt.subplots()
fig.suptitle('Scaled Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

### Base models with normalization

In [None]:
# Same six classifiers, each preceded by a MinMaxScaler step that rescales features to [0, 1].
models = []
models.append(('ScaledLR', Pipeline([('Scaler', MinMaxScaler(feature_range=(0, 1))),('LR', LogisticRegression())])))
models.append(('ScaledLDA', Pipeline([('Scaler', MinMaxScaler(feature_range=(0, 1))),('LDA', LinearDiscriminantAnalysis())])))
models.append(('ScaledKNN', Pipeline([('Scaler', MinMaxScaler(feature_range=(0, 1))),('KNN', KNeighborsClassifier())])))
models.append(('ScaledCART', Pipeline([('Scaler', MinMaxScaler(feature_range=(0, 1))),('CART', DecisionTreeClassifier())])))
models.append(('ScaledNB', Pipeline([('Scaler', MinMaxScaler(feature_range=(0, 1))),('NB', GaussianNB())])))
models.append(('ScaledRF', Pipeline([('Scaler', MinMaxScaler(feature_range=(0, 1))),('RF', RandomForestClassifier())])))

# Score every pipeline with k-fold cross-validation on the training split.
results = []
names = []
# sklearn.model_selection replaces the deprecated sklearn.cross_validation module.
# The folds are contiguous (shuffling is off), so no random_state is passed.
kfold = KFold(n_splits=num_folds)
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)  # per-fold scores, kept for the box plot
    names.append(name)
    msg = "[%s]\tmean: %f\tstd: %f" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Box plot of the per-fold scores collected in `results`, one box per entry of `names`.
fig, ax = plt.subplots()
fig.suptitle('Scaled Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

### Parameter tuning: k-NN classifier

In [None]:
# Tune the number of neighbours of the standardized k-NN pipeline.
# The scaler is a step of the pipeline, so the raw training features are passed to fit().
# The previous call referenced `rescaledX`, whose definition was commented out, and
# raised a NameError.
model = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsClassifier())])

neighbors = [1,3,5,7,9,11,13,15,17,19,21]
# With a Pipeline, a parameter is addressed as '<step name>__<parameter name>' so the
# search knows which step it belongs to.
param_grid = {'KNN__n_neighbors': neighbors}

# sklearn.model_selection.KFold replaces the deprecated sklearn.cross_validation.KFold;
# the folds are contiguous (shuffling is off), so no random_state is passed.
kfold = KFold(n_splits=num_folds)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X_train, Y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# cv_results_ replaces the deprecated grid_scores_ attribute.
cv_table = grid_result.cv_results_
for mean, std, params in zip(cv_table['mean_test_score'], cv_table['std_test_score'], cv_table['params']):
    print("%f (%f) with: %r" % (mean, std, params))

In [None]:
# Keep the best estimator of the most recent grid search (the k-NN pipeline search when
# the cells are run in order); later cells rebind `grid_result`.
fine_tuned_scaled_KNN = grid_result.best_estimator_
print(fine_tuned_scaled_KNN)

## Exercise 1
- GridSearchCV를 이용하여 RandomForestClassifier의 성능을 높여보세요.
- 변화를 줄 parameter: n_estimators, max_features
- 각 parameter의 후보군은 각자 세팅해보시기 바랍니다.

In [None]:
# Grid search over the number of trees and the max_features rule of a random forest.
model = RandomForestClassifier()

n_estimators_set = [5, 10, 15, 20, 25, 30, 35, 40]
max_features_set = ["sqrt", "log2", None]
param_grid = {'n_estimators': n_estimators_set,
              'max_features': max_features_set}

# sklearn.model_selection.KFold replaces the deprecated sklearn.cross_validation.KFold;
# the folds are contiguous (shuffling is off), so no random_state is passed.
kfold = KFold(n_splits=num_folds)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X_train, Y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# cv_results_ replaces the deprecated grid_scores_ attribute.
cv_table = grid_result.cv_results_
for mean, std, params in zip(cv_table['mean_test_score'], cv_table['std_test_score'], cv_table['params']):
    print("%f (%f) with: %r" % (mean, std, params))

In [None]:
# Keep the best estimator of the most recent grid search (the random-forest search when
# the cells are run in order).
fine_tuned_RF = grid_result.best_estimator_
# NOTE(review): GridSearchCV was created without a `refit` argument, so this estimator has
# presumably already been fit on X_train; this call trains it again — confirm it is intended.
fine_tuned_RF.fit(X_train, Y_train)
# Last expression of the cell: shows the fitted forest's feature-importance array.
fine_tuned_RF.feature_importances_

In [None]:
# Column labels of the dataset except the last one ('Class'), i.e. the feature names.
dataset.columns[0:-1]

In [None]:
# Pair each importance value with its feature column, then draw them as a bar chart.
importances = pd.Series(
    data=fine_tuned_RF.feature_importances_,
    index=dataset.columns[:-1],
)
importances.plot.bar()

## Exercise 2
- 지금까지의 모델링은 Train set을 cross-validation한 부분까지 완성되어 있습니다.
- 만들어진 모델을 Test set에 적용하여 예측성능을 평가해주세요.

In [None]:
# The two tuned estimators to be evaluated on the held-out test split.
tuned_models = [
    ('Tuned_scaled_KNN', fine_tuned_scaled_KNN),
    ('Tuned_RF', fine_tuned_RF),
]

In [None]:
# Display the (label, estimator) pairs that will be scored on the test split.
tuned_models

In [None]:
# Predict the test split with each tuned model and record its label, accuracy and
# confusion matrix in three parallel lists.
cms, scores, names = [], [], []
for name, model in tuned_models:
    Y_test_hat = model.predict(X_test)
    names.append(name)
    scores.append(accuracy_score(Y_test, Y_test_hat))
    cms.append(confusion_matrix(Y_test, Y_test_hat))

In [None]:
# Print the test accuracy and confusion matrix recorded for each tuned model.
for name, score, cm in zip(names, scores, cms):
    print('\n[%s]' % name)
    print('- test accuracy: %f' % score)
    print('- cm :\n', cm)

## Support Vector Classifier

In [None]:
from sklearn.svm import SVC

In [None]:
# IPython-only introspection: shows the documentation of SVC (not valid plain Python).
?SVC

In [None]:
# Grid search over the regularisation strength, kernel, polynomial degree and gamma of an SVC.
model = SVC()

C_set = [1, 10, 50, 100]
kernel_set = ["linear", "poly", "rbf"]
degree_set = [2, 3, 4, 5]
gamma_set = [0.01, 0.05, 0.1, 0.5]

param_grid = dict(C=C_set,
                  kernel=kernel_set,
                  degree=degree_set,
                  gamma=gamma_set)

# sklearn.model_selection.KFold replaces the deprecated sklearn.cross_validation.KFold;
# the folds are contiguous (shuffling is off), so no random_state is passed.
kfold = KFold(n_splits=num_folds)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X_train, Y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# cv_results_ replaces the deprecated grid_scores_ attribute.
cv_table = grid_result.cv_results_
for mean, std, params in zip(cv_table['mean_test_score'], cv_table['std_test_score'], cv_table['params']):
    print("%f (%f) with: %r" % (mean, std, params))

## Ensembles

Scikit-learn에서 기본적으로 제공하는 앙상블 모델
- BaggingClassifier
- ExtraTreesClassifier
- AdaBoostClassifier
- GradientBoostingClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [None]:
# Start the list of ensemble candidates with 50 bagged decision trees.
ensembles = [
    ('SimpleBaggingTree',
     BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=50)),
]

In [None]:
# Add an extra-trees ensemble: 50 trees, 5 candidate features per split.
ensembles.append(('ExtraTrees', ExtraTreesClassifier(n_estimators=50, max_features=5)))

In [None]:
# Add an AdaBoost ensemble with 50 boosting rounds.
ensembles.append(('AdaBoost', AdaBoostClassifier(n_estimators=50)))

In [None]:
# Add a gradient-boosting ensemble with 50 boosting stages.
ensembles.append(('GradientBoosting', GradientBoostingClassifier(n_estimators=50)))

In [None]:
# Display the (label, estimator) pairs collected in `ensembles`.
ensembles

In [None]:
# Score every ensemble with k-fold cross-validation on the training split.
# Start from empty lists: `results` and `names` still hold entries from earlier sections,
# and appending to them would leave the two lists misaligned.
results = []
names = []
# sklearn.model_selection replaces the deprecated sklearn.cross_validation module.
# The folds are contiguous (shuffling is off), so no random_state is passed.
kfold = KFold(n_splits=num_folds)
for name, model in ensembles:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "[%s]\tmean: %f\tstd: %f" % (name, cv_results.mean(), cv_results.std())
    print(msg)

- Voting classifier: 여러 개의 모델을 하나의 데이터셋에 학습한 후, 투표

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Members of the voting ensemble as (label, estimator) pairs.
estimators = [
    ('Logit', LogisticRegression(penalty='l2', C=1)),
    ('KNN', KNeighborsClassifier(n_neighbors=3)),
    ('CART', DecisionTreeClassifier(max_depth=4)),
    ('NB', GaussianNB()),
    ('RF', RandomForestClassifier(n_estimators=50)),
]

In [None]:
# Combine the members with voting='soft'.
voting_ensemble = VotingClassifier(estimators=estimators, voting='soft')

In [None]:
# Cross-validate the voting ensemble on the training split.
# sklearn.model_selection replaces the deprecated sklearn.cross_validation module.
# The folds are contiguous (shuffling is off), so no random_state is passed.
kfold = KFold(n_splits=num_folds)
cv_results = cross_val_score(voting_ensemble, X_train, Y_train, cv=kfold, scoring=scoring)
results.append(cv_results)
# Keep `names` aligned with `results`, as the other evaluation cells do.
names.append('voting_ensemble')
# The format string already adds the brackets, so the label itself must not carry them
# (the output used to read "[[voting_ensemble]]").
msg = "[%s]\tmean: %f\tstd: %f" % ('voting_ensemble', cv_results.mean(), cv_results.std())
print(msg)

- XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# XGBoost classifier: 100 boosting rounds, learning rate 0.05, nthread=-1.
xgb = XGBClassifier(n_estimators=100, learning_rate=0.05, nthread=-1)

In [None]:
# Candidate values searched for the XGBoost model.
param_grid = dict(
    max_depth=[3, 4, 5],
    gamma=[0, 0.5, 1],
)

In [None]:
# Grid search over `param_grid` for the XGBoost classifier.
# sklearn.model_selection.KFold replaces the deprecated sklearn.cross_validation.KFold;
# the folds are contiguous (shuffling is off), so no random_state is passed.
kfold = KFold(n_splits=num_folds)
# `scoring` is passed explicitly so this search uses the same metric as the other
# grid-search cells.
grid = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X_train, Y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# cv_results_ replaces the deprecated grid_scores_ attribute.
cv_table = grid_result.cv_results_
for mean, std, params in zip(cv_table['mean_test_score'], cv_table['std_test_score'], cv_table['params']):
    print("%f (%f) with: %r" % (mean, std, params))

- Stacking: 구현되어 있지 않아서, 직접 짜서 써야 합니다.

앞서 parameter search를 수행한 두 모델 fine_tuned_scaled_KNN과 fine_tuned_RF를 사용하도록 합시다

Training phase

In [None]:
# Base learners for the stacking example: the two tuned models kept from the grid searches.
# Note: this rebinds `estimators`, which earlier held the (label, model) pairs given to
# the voting classifier.
estimators = [fine_tuned_scaled_KNN, fine_tuned_RF]

In [None]:
# Predictions of every base learner on the training features, in base-learner order.
# NOTE(review): the base learners appear to have been fit on X_train as well, so these
# would be in-sample predictions — confirm that is acceptable for training the meta-model.
predicted_values = [estimator.predict(X_train) for estimator in estimators]

In [None]:
# Meta-features for training: one row per sample, one column per base learner's prediction.
new_X_train = np.column_stack((predicted_values[0], predicted_values[1]))

In [None]:
# Fit the logistic-regression meta-learner on the stacked predictions (fit() returns the
# estimator itself), then predict the same training rows.
meta_estimator = LogisticRegression(penalty='l2', C=1).fit(new_X_train, Y_train)
Y_train_hat = meta_estimator.predict(new_X_train)

In [None]:
# Confusion matrix of the meta-learner's predictions against the training labels.
confusion_matrix(Y_train, Y_train_hat)

Test phase

In [None]:
# Predictions of every base learner on the test features, in base-learner order.
predicted_values = [estimator.predict(X_test) for estimator in estimators]

In [None]:
# Meta-features for testing: one row per sample, one column per base learner's prediction.
new_X_test = np.column_stack((predicted_values[0], predicted_values[1]))

In [None]:
# Stacked prediction: the meta-learner applied to the base learners' test-set predictions.
Y_test_hat = meta_estimator.predict(new_X_test)

In [None]:
# Confusion matrix of the stacked model's predictions against the test labels.
confusion_matrix(Y_test, Y_test_hat)