# Importing Libraries

In [7]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Loading data and specifying variables

In [8]:
# Load the breast cancer dataset once; the returned object is kept because
# later cells read cancer_data.feature_names.
cancer_data = load_breast_cancer()

# Take the feature matrix and the labels from the object loaded above instead
# of reading the dataset a second time.
# Equivalent alternative: (X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)
X_cancer = cancer_data.data

y_cancer = cancer_data.target

# A single formatted argument inside print(...) runs under both Python 2 and
# Python 3; the two spaces after "=" reproduce the original comma-separated
# print output exactly.
print("Size of X_cancer =  {}".format(X_cancer.shape))

print("Size of y_cancer =  {}".format(y_cancer.shape))

Size of X_cancer =  (569, 30)
Size of y_cancer =  (569,)


## Data Split

In [9]:
from sklearn.model_selection import train_test_split as SPLIT

# Hold out part of the data for testing; the fixed random_state keeps the
# train/test partition identical on every run.
X_train, X_test, y_train, y_test = SPLIT(
    X_cancer,
    y_cancer,
    random_state = 0,
)

# 1) Decision Tree

## Model Development 

In [10]:
from sklearn.tree import DecisionTreeClassifier

# Baseline model: a decision tree with no depth or leaf-size limits.
# random_state is pinned (matching the other models in this notebook) because
# scikit-learn permutes the candidate features at each split, so an unseeded
# tree can differ from one run to the next.
my_tree_classifier = DecisionTreeClassifier(random_state = 0).fit(X_train, y_train)

## Prediction and Evaluation

In [11]:
from sklearn.metrics import accuracy_score

# Predict on both partitions so the train and test accuracies can be compared.
y_pred = my_tree_classifier.predict(X_test)

y_pred_train = my_tree_classifier.predict(X_train)

# A single formatted argument inside print(...) runs under both Python 2 and
# Python 3; the two spaces after "=" reproduce the original comma-separated
# print output exactly.
print("Accuracy Score on Test Data =  {}".format(accuracy_score(y_test, y_pred)))
print("Accuracy Score on Train Data =  {}".format(accuracy_score(y_train, y_pred_train)))

Accuracy Score on Test Data =  0.8951048951048951
Accuracy Score on Train Data =  1.0


## Pre-pruning to avoid Overfitting

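It seems that we have an overfitting problem: the tree is perfectly accurate on the training data but noticeably less accurate on the test data.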

So, we are going to use *max_depth* and *min_samples_leaf* to restrict the complexity of the tree.



In [49]:
# Pre-pruned tree: limit the depth and require a minimum number of samples in
# every leaf, then fit it on the training partition.
my_tree_classifier_2 = DecisionTreeClassifier(
    max_depth = 4,
    min_samples_leaf = 8,
    random_state = 0,
)

my_tree_classifier_2 = my_tree_classifier_2.fit(X_train, y_train)

## Prediction and Evaluation

In [50]:
# Predict with the pre-pruned tree on both partitions.
y_pred = my_tree_classifier_2.predict(X_test)

y_pred_train = my_tree_classifier_2.predict(X_train)

# A single formatted argument inside print(...) runs under both Python 2 and
# Python 3; the two spaces after "=" reproduce the original comma-separated
# print output exactly.
print("Accuracy Score on Test Data =  {}".format(accuracy_score(y_test, y_pred)))
print("Accuracy Score on Train Data =  {}".format(accuracy_score(y_train, y_pred_train)))

Accuracy Score on Test Data =  0.937062937063
Accuracy Score on Train Data =  0.964788732394


# 2) Random Forest

## Model Development

In [44]:
# RandomForestClassifier lives in the sklearn.ensemble module

from sklearn.ensemble import RandomForestClassifier

# k is the value passed as max_features to the forest
k = 8

# Build the forest and fit it on the training partition in one step
# (fit returns the fitted estimator itself).
My_Random_Forest = RandomForestClassifier(
    max_features = k,
    random_state = 0,
).fit(X_train, y_train)

## Prediction and Evaluation

In [46]:
# Predict with the random forest on both partitions.
y_pred = My_Random_Forest.predict(X_test)

y_pred_train = My_Random_Forest.predict(X_train)

# A single formatted argument inside print(...) runs under both Python 2 and
# Python 3; the two spaces after "=" reproduce the original comma-separated
# print output exactly.
print("Accuracy Score on Test Data =  {}".format(accuracy_score(y_test, y_pred)))
print("Accuracy Score on Train Data =  {}".format(accuracy_score(y_train, y_pred_train)))

Accuracy Score on Test Data =  0.986013986014
Accuracy Score on Train Data =  0.995305164319


### How to set k (max_features)?

A good rule of thumb is to set **k = sqrt(number_of_features)**, and then try a few higher and lower values.

In [45]:
# Try it with different k

# A single formatted argument inside print(...) runs under both Python 2 and
# Python 3; the two spaces after "=" reproduce the original comma-separated
# print output exactly.
print("Features of our Data =  {}".format(cancer_data.feature_names))
# print("") emits the blank separator line on both Python 2 and Python 3;
# a bare `print` would be a silent no-op expression on Python 3.
print("")
print("Size of features =  {}".format(len(cancer_data.feature_names)))

# Try with different n_estimators as well

Features of our Data =  ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']

Size of features =  30
