# Importing libraries

In [None]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

In [None]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from io import StringIO 
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, Normalizer, scale
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report, accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split, KFold, StratifiedShuffleSplit, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Importing Dataset

In [None]:
# Load the dataset once and reuse the returned object; the previous version
# called load_breast_cancer() three separate times (data, names, target).
cancer = load_breast_cancer()
df = pd.DataFrame(cancer['data'], columns=cancer['feature_names'])
df['y'] = cancer['target']  # target labels are appended as the last column

# Data Preprocessing

In [None]:
# Print a summary of the frame built above (pandas DataFrame.info).
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Sum of the target column. Presumably the labels are 0/1, which would make
# this the number of samples labelled 1 — verify against the dataset docs.
df['y'].sum()

In [None]:
# Every column except the target 'y' is treated as a model feature.
feature_cols = df.columns.drop('y')

In [None]:
# Select by column name instead of by position, so the feature/target split
# stays correct even if columns are later appended after 'y' or reordered.
X = df.loc[:, df.columns != 'y']  # Features
y = df['y']  # Target variable

# Splitting the dataset

In [None]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Decision Tree

In [None]:
# Baseline decision tree with default hyper-parameters.
# A fixed random_state makes the fitted tree (and the accuracy printed below)
# reproducible, consistent with the seeded train/test split and random forest.
cfr = DecisionTreeClassifier(random_state=0)

# fit() returns the estimator itself, so no re-assignment is needed.
cfr.fit(X_train, y_train)

# Hard class predictions on the held-out test set.
y_pred_cfr = cfr.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred_cfr))

In [None]:
# Render the fitted baseline tree: export DOT text into an in-memory buffer,
# parse it with pydotplus, save a PNG to disk and display it inline.
dot_data = StringIO()
export_graphviz(
    cfr,
    out_file=dot_data,
    filled=True,
    rounded=True,
    special_characters=True,
    # Pass a plain list: feature_cols is a pandas Index, and some scikit-learn
    # releases validate feature_names strictly and reject non-list inputs.
    feature_names=list(feature_cols),
    class_names=['0', '1'],
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('dtc.png')
Image(graph.create_png())

In [None]:
# Evaluate the baseline decision tree on the held-out test set.
# Hard class labels feed the accuracy score and classification report.
y_pred_cfr = cfr.predict(X_test)

# ROC analysis needs a continuous score per sample. Passing hard 0/1 labels to
# roc_curve collapses the curve to a single operating point, so the resulting
# AUC does not measure ranking quality. Use the predicted probability of the
# positive class (column 1 of predict_proba) instead.
y_score_cfr = cfr.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score_cfr)
roc_auc = auc(fpr, tpr)

print('Accuracy Score is %.4f' % accuracy_score(y_test, y_pred_cfr))
print('Classification Report : \n', classification_report(y_test, y_pred_cfr))

# ROC curve with the chance-level diagonal drawn for reference.
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'y', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Tuning the hyper-parameters

In [None]:
# Decision tree with hand-picked hyper-parameters: entropy as the split
# criterion, depth capped at 10 and at least 10 samples required per leaf.
# A fixed random_state makes the fitted tree (and the accuracy printed below)
# reproducible, consistent with the seeded train/test split and random forest.
cfr2 = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_leaf=10,
    random_state=0,
)

# fit() returns the estimator itself, so no re-assignment is needed.
cfr2.fit(X_train, y_train)

# Hard class predictions on the held-out test set.
y_pred_cfr2 = cfr2.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred_cfr2))

In [None]:
# Render the tuned tree: export DOT text into an in-memory buffer, parse it
# with pydotplus, save a PNG to disk and display it inline.
dot_data = StringIO()
export_graphviz(
    cfr2,
    out_file=dot_data,
    filled=True,
    rounded=True,
    special_characters=True,
    # Pass a plain list: feature_cols is a pandas Index, and some scikit-learn
    # releases validate feature_names strictly and reject non-list inputs.
    feature_names=list(feature_cols),
    class_names=['0', '1'],
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('dtc1.png')
Image(graph.create_png())

In [None]:
# Evaluate the tuned decision tree on the held-out test set.
# Hard class labels feed the accuracy score and classification report.
y_pred_cfr2 = cfr2.predict(X_test)

# ROC analysis needs a continuous score per sample. Passing hard 0/1 labels to
# roc_curve collapses the curve to a single operating point, so the resulting
# AUC does not measure ranking quality. Use the predicted probability of the
# positive class (column 1 of predict_proba) instead.
y_score_cfr2 = cfr2.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score_cfr2)
roc_auc = auc(fpr, tpr)

print('Accuracy Score is %.4f' % accuracy_score(y_test, y_pred_cfr2))
print('Classification Report : \n', classification_report(y_test, y_pred_cfr2))

# ROC curve with the chance-level diagonal drawn for reference.
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'y', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Random Forest

In [None]:
# Random forest with a fixed seed for reproducible results.
# NOTE(review): warm_start=True presumably only has an effect when fit() is
# called again on this same object, which the visible code never does —
# confirm whether it is still wanted.
rfc = RandomForestClassifier(
    warm_start=True,
    random_state=0,
)
rfc.fit(X_train, y_train)

In [None]:
# Test-set class predictions from the random forest.
# NOTE(review): pred_test_rfc is not read again in the visible part of this
# file; the evaluation cell recomputes the predictions under another name.
pred_test_rfc = rfc.predict(X_test)

In [None]:
# Evaluate the random forest on the held-out test set.
# Hard class labels feed the accuracy score and classification report.
y_pred_rfc = rfc.predict(X_test)

# ROC analysis needs a continuous score per sample. Passing hard 0/1 labels to
# roc_curve collapses the curve to a single operating point, so the resulting
# AUC does not measure ranking quality. Use the predicted probability of the
# positive class (column 1 of predict_proba) instead.
y_score_rfc = rfc.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score_rfc)
roc_auc = auc(fpr, tpr)

print('Accuracy Score is %.4f' % accuracy_score(y_test, y_pred_rfc))
print('Classification Report : \n', classification_report(y_test, y_pred_rfc))

# ROC curve with the chance-level diagonal drawn for reference.
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'y', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Gradient Boosting Method

In [None]:
# Gradient-boosted trees with a learning rate of 0.1.
# A fixed random_state keeps the fitted ensemble reproducible, consistent with
# the seeded train/test split and random forest used elsewhere in this file.
gbm = GradientBoostingClassifier(learning_rate=0.1, random_state=0)
gbm.fit(X_train, y_train)

# Hard class predictions on the held-out test set.
y_pred_gbm = gbm.predict(X_test)

In [None]:
# Evaluate the gradient-boosting model on the held-out test set.
# Hard class labels feed the accuracy score and classification report.
y_pred_gbm = gbm.predict(X_test)

# ROC analysis needs a continuous score per sample. Passing hard 0/1 labels to
# roc_curve collapses the curve to a single operating point, so the resulting
# AUC does not measure ranking quality. Use the predicted probability of the
# positive class (column 1 of predict_proba) instead.
y_score_gbm = gbm.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score_gbm)
roc_auc = auc(fpr, tpr)

print('Accuracy Score is %.4f' % accuracy_score(y_test, y_pred_gbm))
print('Classification Report : \n', classification_report(y_test, y_pred_gbm))

# ROC curve with the chance-level diagonal drawn for reference.
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'y', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# K Nearest Neighbour

In [None]:
# k-nearest-neighbours classifier using the 7 closest training samples.
# fit() returns the estimator itself, so construction and fitting are chained.
# NOTE(review): the features are passed in unscaled; distance-based models are
# usually sensitive to feature scale — consider standardising them first.
knc = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)

# Hard class predictions on the held-out test set.
y_pred_knc = knc.predict(X_test)

In [None]:
# Evaluate the k-nearest-neighbours model on the held-out test set.
# Hard class labels feed the accuracy score and classification report.
y_pred_knc = knc.predict(X_test)

# ROC analysis needs a continuous score per sample. Passing hard 0/1 labels to
# roc_curve collapses the curve to a single operating point, so the resulting
# AUC does not measure ranking quality. Use the predicted probability of the
# positive class (column 1 of predict_proba) instead.
y_score_knc = knc.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score_knc)
roc_auc = auc(fpr, tpr)

print('Accuracy Score is %.4f' % accuracy_score(y_test, y_pred_knc))
print('Classification Report : \n', classification_report(y_test, y_pred_knc))

# ROC curve with the chance-level diagonal drawn for reference.
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'y', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()