In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

### Loading data and removing non-relevant columns

In [2]:
df = pd.read_csv('breast_cancer_data.csv')
df = df.drop('Unnamed: 32', axis = 1)
df = df.drop('id', axis = 1)

In [3]:
#we generate labels, denoted by Y (diagnosis column) and the rest of the data will be denoted by X
labels = df['diagnosis']
Y = [1 if ele == "M" else 0 for ele in labels] #convert labels M and B to binary(1 and 0)
df = df.drop('diagnosis', axis=1)#drop the diagnosis column from the dataframe

In [4]:
#shuffling the dataframe
df.sample(frac=1)
X = df

#splitting data into train and test
#this is done to test on samples that the model has not been trained on, it leads to less bias
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

### Logistic Regression

In [5]:
samples = []
clf = LogisticRegression().fit(X_train, Y_train)
Y_predicted_lr = clf.predict(X_test)
lr_accuracy = np.logical_xor(Y_predicted_lr, Y_test)
lr_accuracy = np.logical_not(lr_accuracy)
print("Train accuracy: ",clf.score(X_train, Y_train))
print("Test accuracy: ",clf.score(X_test, Y_test))
lr_accuracy_int = [1 if ele == True else 0 for ele in lr_accuracy]

Train accuracy:  0.9560439560439561
Test accuracy:  0.9385964912280702


### Decision Tree Classifier

In [6]:
clf_gini = DecisionTreeClassifier(criterion="gini").fit(X_train, Y_train)
Y_predicted_dt = clf_gini.predict(X_test)
dt_accuracy = np.logical_xor(Y_predicted_dt, Y_test)
dt_accuracy = np.logical_not(dt_accuracy)
print("Train accuracy: ",clf_gini.score(X_train, Y_train))
print("Test accuracy: ",clf_gini.score(X_test, Y_test))
dt_accuracy_int = [1 if ele == True else 0 for ele in dt_accuracy]

Train accuracy:  1.0
Test accuracy:  0.9298245614035088


### Random Forest Classifier 

In [7]:
samples = []
clf_random_forest = RandomForestClassifier().fit(X_train, Y_train)
Y_predicted_rf = clf_random_forest.predict(X_test)
rf_accuracy = np.logical_xor(Y_predicted_rf, Y_test)
rf_accuracy = np.logical_not(rf_accuracy)
print("Train accuracy: ",clf_random_forest.score(X_train, Y_train))
print("Test accuracy: ",clf_random_forest.score(X_test, Y_test))
rf_accuracy_int = [1 if ele == True else 0 for ele in rf_accuracy]

Train accuracy:  0.9956043956043956
Test accuracy:  0.956140350877193


### SVM

In [8]:
clf_svm = svm.SVC().fit(X_train, Y_train)
Y_predicted_svm = clf_svm.predict(X_test)
svm_accuracy = np.logical_xor(Y_predicted_svm, Y_test)
svm_accuracy = np.logical_not(svm_accuracy)
print("Train accuracy: ",clf_svm.score(X_train, Y_train))
print("Test accuracy: ",clf_svm.score(X_test, Y_test))
svm_accuracy_int = [1 if ele == True else 0 for ele in svm_accuracy]

Train accuracy:  1.0
Test accuracy:  0.6140350877192983
