In [1]:
#Import libraries

import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split 
import pandas as pd
from sklearn.impute import SimpleImputer

In [2]:
# Load the breast-cancer-wisconsin dataset

url="https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
missing_values=["?"]
df=pd.read_csv(url,header=None,na_values=missing_values)
columns=["id", "ClumpThickness", "Uniformity_of_Cell_Size", "Uniformity_of_Cell_Shape", "Marginal_Adhesion" 
         ,"Single_Epithelial_Cell_Size" ,"Bare_Nuclei" ,"Bland_Chromatin" ,"Normal_Nucleoli", "Mitoses","Class"]
df.columns=columns

In [3]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           699 non-null    int64  
 1   ClumpThickness               699 non-null    int64  
 2   Uniformity_of_Cell_Size      699 non-null    int64  
 3   Uniformity_of_Cell_Shape     699 non-null    int64  
 4   Marginal_Adhesion            699 non-null    int64  
 5   Single_Epithelial_Cell_Size  699 non-null    int64  
 6   Bare_Nuclei                  683 non-null    float64
 7   Bland_Chromatin              699 non-null    int64  
 8   Normal_Nucleoli              699 non-null    int64  
 9   Mitoses                      699 non-null    int64  
 10  Class                        699 non-null    int64  
dtypes: float64(1), int64(10)
memory usage: 60.2 KB


Unnamed: 0,id,ClumpThickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1.0,3,1,1,2
1,1002945,5,4,4,5,7,10.0,3,2,1,2
2,1015425,3,1,1,1,2,2.0,3,1,1,2
3,1016277,6,8,8,1,3,4.0,3,7,1,2
4,1017023,4,1,1,3,2,1.0,3,1,1,2


In [4]:
df.Uniformity_of_Cell_Size==df.Uniformity_of_Cell_Size

0      True
1      True
2      True
3      True
4      True
       ... 
694    True
695    True
696    True
697    True
698    True
Name: Uniformity_of_Cell_Size, Length: 699, dtype: bool

In [5]:
#Split Dataset

Y=df.Class
X=df.drop(["id","Class","Uniformity_of_Cell_Shape"],axis=1)

X_training,X_test,Y_training,Y_test=train_test_split(X,Y,test_size=0.33)

In [6]:
#Impute missing values

from sklearn.impute import SimpleImputer

simpleImpute=SimpleImputer()
X_training=simpleImpute.fit_transform(X_training)
X_test=simpleImpute.transform(X_test)

In [7]:
#Standardization 

from sklearn.preprocessing import StandardScaler
ss=StandardScaler()
X_training_stand=ss.fit_transform(X_training)
X_test_stand=ss.transform(X_test)

# Decision Tree

In [8]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier

decisionTree=DecisionTreeClassifier()
decisionTree.fit(X_training,Y_training)

Y_test_pred=decisionTree.predict(X_test)

In [9]:
# Accuracy Score

from sklearn.metrics import accuracy_score

print(accuracy_score(Y_test,Y_test_pred))

0.948051948051948


In [10]:
# Drawing and Exporting the Decision Tree

import graphviz 
from sklearn.tree import export_graphviz
from sklearn import tree


feature_names=[ "ClumpThickness", "Uniformity_of_Cell_Size", "Marginal_Adhesion" 
         ,"Single_Epithelial_Cell_Size" ,"Bare_Nuclei" ,"Bland_Chromatin" ,"Normal_Nucleoli", "Mitoses"]




dot_data = tree.export_graphviz(decisionTree, out_file=None, 
                                feature_names = feature_names,class_names=['2','4'],
                                filled=True, rounded=True,  
                                special_characters=True)  
graph = graphviz.Source(dot_data)  
graph

#This will create an iris.pdf file with the rule path

graph.render("DecisionTree_Breast_Cancer")

'DecisionTree_Breast_Cancer.pdf'

# Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

randomForest=RandomForestClassifier()
randomForest.fit(X_training,Y_training)
y_pred=randomForest.predict(X_test)

print(accuracy_score(Y_test,y_pred))
pd.DataFrame(
    confusion_matrix(Y_test, y_pred),
    columns=['0', '1'],
    index=['0', '1']
)

0.9696969696969697


Unnamed: 0,0,1
0,143,5
1,2,81


# Gradient Boosting

In [12]:
from sklearn.ensemble import GradientBoostingClassifier

gradientBoostingClassifier=GradientBoostingClassifier()
gradientBoostingClassifier.fit(X_training,Y_training)

Y_test_pred_GBoost=gradientBoostingClassifier.predict(X_test)
print(accuracy_score(Y_test,Y_test_pred_GBoost))
pd.DataFrame(
    confusion_matrix(Y_test, Y_test_pred_GBoost),
    columns=['0', '1'],
    index=['0', '1']
)

0.961038961038961


Unnamed: 0,0,1
0,143,5
1,4,79


# Support Vector Machine (SVM)

In [13]:
from sklearn.svm import  SVC

#Standardizing
from sklearn.preprocessing import StandardScaler

ss=StandardScaler()
X_training_stand=ss.fit_transform(X_training)
X_test_stand=ss.transform(X_test)

Linear SVM

In [14]:
svm=SVC(kernel="linear")
svm.fit(X_training_stand,Y_training)
Y_test_pred_svm=svm.predict(X_test_stand)
print(accuracy_score(Y_test,Y_test_pred_svm))

pd.DataFrame(
    confusion_matrix(Y_test, Y_test_pred_svm),
    columns=['0', '1'],
    index=['0', '1']
)

0.961038961038961


Unnamed: 0,0,1
0,143,5
1,4,79


Non-Linear SVM

In [15]:
# To perform a nonlinear svm we need a nonlinear kernel (e.g. polynomial kernel) 

svm_rbf=SVC(kernel="poly")
svm_rbf.fit(X_training_stand,Y_training)
Y_test_pred_svm_rbf=svm_rbf.predict(X_test_stand)
print(accuracy_score(Y_test,Y_test_pred_svm_rbf))

pd.DataFrame(
    confusion_matrix(Y_test, Y_test_pred_svm_rbf),
    columns=['0', '1'],
    index=['0', '1']
)

0.9437229437229437


Unnamed: 0,0,1
0,145,3
1,10,73
