In [1]:
# setup + import
from utils import *
import os
import sklearn
import pandas as pd
import pickle

# Load the training data and the column-type mapping produced by preprocessing.
data = pd.read_csv('../data_processed/breast-cancer-diagnostic.shuf.lrn.csv')

# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that were produced by this project.
# The context manager closes the file even if unpickling raises.
with open('../data_processed/breast-cancer_column_types.pkl', 'rb') as pickle_file:
    feature_structure = pickle.load(pickle_file)

# Preview the first rows. This has to be the cell's last expression to be
# rendered; in the middle of the cell the result was silently discarded.
data.head()

In [2]:
# Feature columns in a fixed order: binary, categorical, continuous, ordinal.
feature_columns = (
    feature_structure['bin']
    + feature_structure['cat']
    + feature_structure['cont']
    + feature_structure['ord']
)
TARGET = feature_structure["target"]

# Design matrix and target vector used by every experiment below.
X = data[feature_columns]
y = data[TARGET]

# Overview of which columns were assigned to which type (one row per key of
# feature_structure). Keys are materialised as a list so the frame does not
# depend on how pandas treats a dict view.
values = list(feature_structure.values())
pd.DataFrame({"type": list(feature_structure.keys()), "columns": values})

Unnamed: 0,type,columns
0,bin,[]
1,cat,[]
2,ord,[]
3,cont,"[radiusMean, textureMean, perimeterMean, ar..."
4,target,class


In [6]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Baseline candidates (one SVC, one k-NN, one decision tree) for the
# multi-model comparison further down.
classifiers0 = [
    SVC(
        kernel='linear',
        random_state=42,
        decision_function_shape='ovo',
        probability=True,
    ),
    KNeighborsClassifier(n_neighbors=3, n_jobs=-1),
    DecisionTreeClassifier(random_state=42),
]

# Single-model run: k-NN with its default n_neighbors, turned into a model by
# get_pipeline together with the column-type mapping.
classifier1 = [KNeighborsClassifier(n_jobs=-1)]
model1 = get_pipeline(feature_structure, classifier1[0])

# Evaluate the same model twice: once with a holdout split and once with
# cross-validation.
# NOTE(review): the trailing 42 passed to perform_holdout is presumably the
# random seed of the split - confirm against utils.
res_holdout, model_holdout = perform_holdout(X, y, model1, 42)
res_cv, model_cv = perform_cv(X, y, model1)

# Collect both runs into one results structure and show it as a table.
results = append_results(
    {},
    model_holdout,
    model_cv,
    res_holdout,
    res_cv,
)
pd.DataFrame(results)

Unnamed: 0,model,accuracy,precision,recall,f1-score,timing
0,KNeighborsClassifier(n_jobs=-1)_Holdout,0.982456,0.982972,0.982456,0.9824,0.013
1,KNeighborsClassifier(n_jobs=-1)_CV,0.964912,0.966011,0.964912,0.964515,0.107997


Evaluating multiple models

In [8]:
# Run every baseline classifier in classifiers0 through evaluate_models
# (not defined in this notebook; presumably provided by the utils star-import).
# Note that it receives the full `data` frame rather than the X / y split.
# The resulting table has one "_Holdout" and one "_CV" row per classifier.
evaluate_models(data, feature_structure, classifiers0)

Unnamed: 0,model,accuracy,precision,recall,f1-score,timing
0,"SVC(decision_function_shape='ovo', kernel='lin...",1.0,1.0,1.0,1.0,0.010004
1,"SVC(decision_function_shape='ovo', kernel='lin...",0.978947,0.979292,0.978947,0.978852,0.06951
2,"KNeighborsClassifier(n_jobs=-1, n_neighbors=3)...",0.982456,0.982972,0.982456,0.9824,0.01
3,"KNeighborsClassifier(n_jobs=-1, n_neighbors=3)_CV",0.964912,0.965999,0.964912,0.964377,0.075022
4,DecisionTreeClassifier(random_state=42)_Holdout,1.0,1.0,1.0,1.0,0.012026
5,DecisionTreeClassifier(random_state=42)_CV,0.922807,0.923679,0.922807,0.921534,0.062996


Comparison of SVC over kernel functions:

In [15]:

# One SVC per kernel function; only the kernel differs between the candidates,
# the random_state is fixed to 42 for all of them.
classifiersSVC = [
    SVC(kernel=kernel_name, random_state=42)
    for kernel_name in ('linear', 'rbf', 'poly', 'sigmoid')
]

# Evaluate all kernels and display the comparison table.
resultsSVC = evaluate_models(data, feature_structure, classifiersSVC)
resultsSVC

Unnamed: 0,model,accuracy,precision,recall,f1-score,timing
0,"SVC(kernel='linear', random_state=42)_Holdout",1.0,1.0,1.0,1.0,0.017001
1,"SVC(kernel='linear', random_state=42)_CV",0.978947,0.979292,0.978947,0.978852,0.058
2,SVC(random_state=42)_Holdout,1.0,1.0,1.0,1.0,0.006998
3,SVC(random_state=42)_CV,0.975439,0.975902,0.975439,0.975201,0.058002
4,"SVC(kernel='poly', random_state=42)_Holdout",0.929825,0.937411,0.929825,0.92864,0.006
5,"SVC(kernel='poly', random_state=42)_CV",0.926316,0.93435,0.926316,0.923461,0.059002
6,"SVC(kernel='sigmoid', random_state=42)_Holdout",1.0,1.0,1.0,1.0,0.005
7,"SVC(kernel='sigmoid', random_state=42)_CV",0.947368,0.948766,0.947368,0.946938,0.061327


Comparison of KNeighborsClassifier over k (number of neighbors)

In [12]:
# k-NN candidates for k = 2 .. 12 (the upper bound of range is exclusive).
classifierKneighbors = [
    KNeighborsClassifier(n_neighbors=neighbor_count, n_jobs=-1)
    for neighbor_count in range(2, 13)
]

# Evaluate every k and display the comparison table.
resultsKNeighbors = evaluate_models(data, feature_structure, classifierKneighbors)
resultsKNeighbors

Unnamed: 0,model,accuracy,precision,recall,f1-score,timing
0,"KNeighborsClassifier(n_jobs=-1, n_neighbors=2)...",0.964912,0.966917,0.964912,0.964666,0.010999
1,"KNeighborsClassifier(n_jobs=-1, n_neighbors=2)_CV",0.947368,0.950313,0.947368,0.946136,0.083998
2,"KNeighborsClassifier(n_jobs=-1, n_neighbors=3)...",0.982456,0.982972,0.982456,0.9824,0.009998
3,"KNeighborsClassifier(n_jobs=-1, n_neighbors=3)_CV",0.964912,0.965999,0.964912,0.964377,0.075
4,"KNeighborsClassifier(n_jobs=-1, n_neighbors=4)...",0.947368,0.951754,0.947368,0.946758,0.010001
5,"KNeighborsClassifier(n_jobs=-1, n_neighbors=4)_CV",0.954386,0.956431,0.954386,0.95357,0.078
6,KNeighborsClassifier(n_jobs=-1)_Holdout,0.982456,0.982972,0.982456,0.9824,0.01
7,KNeighborsClassifier(n_jobs=-1)_CV,0.964912,0.966011,0.964912,0.964515,0.076
8,"KNeighborsClassifier(n_jobs=-1, n_neighbors=6)...",0.982456,0.982972,0.982456,0.9824,0.01
9,"KNeighborsClassifier(n_jobs=-1, n_neighbors=6)_CV",0.964912,0.965925,0.964912,0.964468,0.073003


Decision Tree comparison over max_depth

In [17]:
# Decision trees with max_depth = 3 .. 14 (the upper bound of range is
# exclusive); random_state is fixed to 42 for every tree.
classifierDecisionTree = [
    DecisionTreeClassifier(max_depth=tree_depth, random_state=42)
    for tree_depth in range(3, 15)
]

# Evaluate every depth and display the comparison table.
resultsDecisionTree = evaluate_models(data, feature_structure, classifierDecisionTree)
resultsDecisionTree

Unnamed: 0,model,accuracy,precision,recall,f1-score,timing
0,"DecisionTreeClassifier(max_depth=3, random_sta...",0.982456,0.982972,0.982456,0.9824,0.008999
1,"DecisionTreeClassifier(max_depth=3, random_sta...",0.933333,0.93628,0.933333,0.932908,0.067
2,"DecisionTreeClassifier(max_depth=4, random_sta...",0.982456,0.982972,0.982456,0.9824,0.007
3,"DecisionTreeClassifier(max_depth=4, random_sta...",0.926316,0.929981,0.926316,0.925769,0.06
4,"DecisionTreeClassifier(max_depth=5, random_sta...",1.0,1.0,1.0,1.0,0.007
5,"DecisionTreeClassifier(max_depth=5, random_sta...",0.919298,0.92063,0.919298,0.917634,0.061002
6,"DecisionTreeClassifier(max_depth=6, random_sta...",0.982456,0.982972,0.982456,0.9824,0.008
7,"DecisionTreeClassifier(max_depth=6, random_sta...",0.915789,0.916569,0.915789,0.914473,0.062007
8,"DecisionTreeClassifier(max_depth=7, random_sta...",1.0,1.0,1.0,1.0,0.007
9,"DecisionTreeClassifier(max_depth=7, random_sta...",0.922807,0.923679,0.922807,0.921534,0.063


# Competition classification

In [None]:
from sklearn.svm import SVC
import pandas as pd

# Load the competition test set and select the same feature columns that were
# used for training (feature_columns, X and y come from the cells above).
test_data = pd.read_csv('../data_processed/breast-cancer-diagnostic.shuf.tes.csv')
T = test_data[feature_columns]

# Final model: linear-kernel SVC wrapped by get_pipeline.
classifier = SVC(
    kernel='linear',
    random_state=42,
    decision_function_shape='ovo',
    probability=True,
)
model = get_pipeline(feature_structure, classifier)

# Train via the cross-validation helper and keep the model it returns.
# NOTE(review): whether model_cv is fitted on the full training data or on the
# last fold only depends on perform_cv in utils - confirm before submitting.
res_cv, model_cv = perform_cv(X, y, model)

# Predict the test set.
pred = model_cv.predict(T)

# Write one "<ID>,<prediction>" line per test row; no header row is written.
with open('../data_processed/kaggle-breast-cancer-prediction.csv', 'w', newline='') as file:
    file.writelines(
        f"{id_value},{pred_value}\n"
        for id_value, pred_value in zip(test_data["ID"], pred)
    )
