In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import f1_score

In [22]:
import warnings
warnings.filterwarnings('ignore')

### Loading data and removing non-relevant columns

In [23]:
df = pd.read_csv('breast_cancer_data.csv')
df = df.drop('Unnamed: 32', axis = 1)
df = df.drop('id', axis = 1)

In [24]:
#we generate labels, denoted by Y (diagnosis column) and the rest of the data will be denoted by X
labels = df['diagnosis']
Y = [1 if ele == "M" else 0 for ele in labels] #convert labels M and B to binary(1 and 0)
df = df.drop('diagnosis', axis=1)#drop the diagnosis column from the dataframe

In [25]:
#shuffling the dataframe
df.sample(frac=1)
X = df

#splitting data into train and test
#this is done to test on samples that the model has not been trained on, it leads to less bias
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

### Logistic Regression

In [30]:
clf = LogisticRegression().fit(X_train, Y_train)
Y_predicted_lr = clf.predict(X_test)
print("Test F1-score: ", f1_score(Y_predicted_lr, Y_test))

Test F1-score:  0.945945945945946


### Decision Tree Classifier

In [31]:
clf_gini = DecisionTreeClassifier(criterion="gini").fit(X_train, Y_train)
Y_predicted_dt = clf_gini.predict(X_test)
print("Test F1-score: ", f1_score(Y_predicted_dt, Y_test))

Test F1-score:  0.8571428571428572


### Random Forest Classifier 

In [32]:
clf_random_forest = RandomForestClassifier().fit(X_train, Y_train)
Y_predicted_rf = clf_random_forest.predict(X_test)
print("Test F1-score: ", f1_score(Y_predicted_rf, Y_test))

Test F1-score:  0.945945945945946


### SVM

In [33]:
clf_svm = svm.SVC().fit(X_train, Y_train)
Y_predicted_svm = clf_svm.predict(X_test)
print("Test F1-score: ", f1_score(Y_predicted_svm, Y_test))

Test F1-score:  0.0
