In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [41]:
# Load Dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
names = ['id', 'clump_thickness', 'uniform_cell_size', 'uniform_cell_shape',
       'marginal_adhesion', 'single_epithelial_size', 'bare_nuclei',
       'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class']
df = pd.read_csv(url, names=names)
df.replace('?',-99999, inplace=True)
print(df.axes)
df.drop(['id'], 1, inplace=True)

[RangeIndex(start=0, stop=699, step=1), Index([u'id', u'clump_thickness', u'uniform_cell_size', u'uniform_cell_shape',
       u'marginal_adhesion', u'single_epithelial_size', u'bare_nuclei',
       u'bland_chromatin', u'normal_nucleoli', u'mitoses', u'class'],
      dtype='object')]


In [3]:
df.head()

Unnamed: 0,id,clump_thickness,uniform_cell_size,uniform_cell_shape,marginal_adhesion,single_epithelial_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [52]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import cross_validation

In [91]:
X = np.array(df.drop(['class'], 1))
y = np.array(df['class'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [92]:
scalar = StandardScaler()

from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=5)

from sklearn import svm
svm = svm.SVC()

from sklearn.ensemble import RandomForestClassifier
rf1 = RandomForestClassifier(n_estimators=100, max_depth=2)

models = {"knn":neigh, "svm":svm, "random_forest":rf}

In [93]:
kfold = KFold(n_splits=10, random_state = 42)
for key in models.keys():
    cv_results = cross_val_score(models[key], X_train, y_train, cv=kfold, scoring='accuracy')
    msg = "%s: %f (%f)" % (key, cv_results.mean(), cv_results.std())
    print(msg)

knn: 0.971396 (0.024208)
svm: 0.958896 (0.021169)
random_forest: 0.964188 (0.017957)


In [94]:
param_grid = [{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
              {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}]

rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'n_estimators': [3, 10], 'max_features': [2, 3, 4], 'bootstrap': [False]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [95]:
print "Best model:",grid_search.best_estimator_

Best model: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [96]:
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

0.955 (+/-0.041) for {'max_features': 2, 'n_estimators': 3}
0.964 (+/-0.032) for {'max_features': 2, 'n_estimators': 10}
0.975 (+/-0.026) for {'max_features': 2, 'n_estimators': 30}
0.953 (+/-0.042) for {'max_features': 4, 'n_estimators': 3}
0.970 (+/-0.024) for {'max_features': 4, 'n_estimators': 10}
0.970 (+/-0.024) for {'max_features': 4, 'n_estimators': 30}
0.957 (+/-0.031) for {'max_features': 6, 'n_estimators': 3}
0.962 (+/-0.029) for {'max_features': 6, 'n_estimators': 10}
0.970 (+/-0.024) for {'max_features': 6, 'n_estimators': 30}
0.964 (+/-0.011) for {'max_features': 8, 'n_estimators': 3}
0.966 (+/-0.021) for {'max_features': 8, 'n_estimators': 10}
0.964 (+/-0.020) for {'max_features': 8, 'n_estimators': 30}
0.959 (+/-0.031) for {'max_features': 2, 'n_estimators': 3, 'bootstrap': False}
0.968 (+/-0.014) for {'max_features': 2, 'n_estimators': 10, 'bootstrap': False}
0.962 (+/-0.040) for {'max_features': 3, 'n_estimators': 3, 'bootstrap': False}
0.962 (+/-0.035) for {'max_feat

In [97]:
feature_importances = grid_search.best_estimator_.feature_importances_
names = df.columns.values
sorted(zip(feature_importances, names), reverse=True)

[(0.2634334314411394, 'uniform_cell_size'),
 (0.21471463529689652, 'uniform_cell_shape'),
 (0.16186572866340476, 'bare_nuclei'),
 (0.10226503558501471, 'bland_chromatin'),
 (0.08526962991805422, 'normal_nucleoli'),
 (0.08374184647100459, 'single_epithelial_size'),
 (0.06127138245253099, 'clump_thickness'),
 (0.02239958289532065, 'marginal_adhesion'),
 (0.005038727276634207, 'mitoses')]

In [131]:
final_model = RandomForestClassifier(max_features=4, n_estimators=30)
final_model.fit(X_train, y_train)
print "Test set accuracy:",final_model.score(X_test, y_test)

Test set accuracy: 0.9642857142857143


In [136]:
from sklearn.metrics import confusion_matrix, classification_report
pred = final_model.predict(X_test)
print "confusion_matrix"
print confusion_matrix(y_test, pred)
print "classification_report"
print classification_report(y_test, pred)
 

confusion_matrix
[[90  4]
 [ 1 45]]
classification_report
             precision    recall  f1-score   support

          2       0.99      0.96      0.97        94
          4       0.92      0.98      0.95        46

avg / total       0.97      0.96      0.96       140

