<a href="https://colab.research.google.com/github/Nicordaro/MML/blob/main/MML_Thesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Mathematics in Machine Learning
# Tesina
Nicolò Cordaro s272145

---



### Organize imports and dataset definition

In [None]:
!pip install dtreeviz
import os
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import shutil
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer
from sklearn.inspection import permutation_importance
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from dtreeviz.trees import *

# F1-based scorer object; handed to the GridSearchCV cells through their `scoring` argument.
scoring = make_scorer(f1_score)

# Silence library warnings so the notebook output stays readable.
import warnings
warnings.filterwarnings("ignore")

# FutureWarning is additionally ignored explicitly (registered once; the
# original cell repeated this import/call pair verbatim).
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

# Seed passed as `random_state` to the train/test split, SMOTE, PCA and the model constructors.
SEED = 42

def confmat(y_test, pred):
    """Draw the confusion matrix of ``pred`` against ``y_test`` as an annotated heatmap.

    Args:
        y_test: ground-truth labels.
        pred: predicted labels, aligned with ``y_test``.

    The heatmap is drawn on the current matplotlib axes; nothing is returned.
    """
    cf_matrix = confusion_matrix(y_test, pred)
    tick_labels = ["benign", "malign"]
    ax = sns.heatmap(
        cf_matrix,
        annot=True,  # the keyword is "annot", not "annote"
        fmt='g',
        cmap=sns.color_palette("ch:start=.2,rot=-.3", as_cmap=True),
        xticklabels=tick_labels,
        yticklabels=tick_labels,
    )
    # Widen the y-limits by half a cell on each side.
    # NOTE(review): looks like the workaround for the matplotlib release that
    # cropped the first/last heatmap rows — confirm it is still needed.
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)
    # confusion_matrix puts the true labels on the rows (heatmap y-axis) and
    # the predictions on the columns (heatmap x-axis).
    ax.set(xlabel='Predicted Label', ylabel='True Label')

In [None]:
# Notebook-wide switches.
# tune: when True, the GridSearchCV cells run and their best parameters replace the hard-coded ones.
tune = False
# viz: when True, the swarm-plot and pair-plot cells are rendered.
viz = False
# pca_flag: when True, the train/test split uses the PCA-projected features and the feature-importance plots are skipped.
pca_flag = False

In [None]:
# Clone github repository with dataset
# deleting if already present(Only for development purposes, colab limitations)
!rm -rf MML
!rm -rf dataset

if not os.path.isdir('./MML'):
  !git clone https://github.com/Nicordaro/MML
    
source_dir = './MML/dataset'
target_dir = './dataset'
    
file_names = os.listdir(source_dir)
os.mkdir(target_dir)
for file_name in file_names:
  print(file_name)
  shutil.move(source_dir+"/"+file_name, target_dir+"/"+file_name)

In [None]:
# Short column identifier -> descriptive attribute name.
# The keys are used as the column names when the CSV is read.
names = {"sampcode":"Sample code number", 
"clump": "Clump Thickness", 
"uni_size":"Uniformity of Cell Size", 
"uni_shape":"Uniformity of Cell Shape", 
"adhesion":"Marginal Adhesion", 
"epith_size":"Single Epithelial Cell Size", 
"nuclei":"Bare Nuclei", 
"chromatin":"Bland Chromatin", 
"nucleoli":"Normal Nucleoli", 
"mitoses":"Mitoses", 
"class":"Class"}

In [None]:
# Read the data file, naming the columns after the keys of `names`.
dataset_DF = pd.read_csv(target_dir+"/breast-cancer-wisconsin.data", names=list(names.keys()))
# Recode the class column (2 -> "benign", 4 -> "malign").
# The column is replaced by name: writing strings into the integer column through
# `iloc[:, 10] = ...` is an in-place set with an incompatible dtype on recent pandas.
dataset_DF["class"] = dataset_DF["class"].replace({2: "benign", 4: "malign"})
dataset_DF.info()

# Inspection & Preprocessing

In [None]:
# List the distinct values of every column except the first one.
for column_name in dataset_DF.columns[1:]:
  distinct_values = sorted(dataset_DF[column_name].unique())
  print(f"Attribute '{column_name}'\t contains: \t{distinct_values}")

In [None]:
# Flag, per column, whether the "?" placeholder occurs anywhere.
# (Original note: non-numerical values are present in the Bare Nuclei feature.)
dataset_DF.isin(['?']).any()

In [None]:
# Drop the rows whose "nuclei" value is the non-numeric "?" placeholder.
dataset_DF = dataset_DF.drop(dataset_DF[dataset_DF["nuclei"]=="?"].index)
# Convert the object column to numeric.
# The column is replaced by name: on pandas >= 2.0 `df.iloc[:, 6] = ...` writes
# into the existing object array, so the dtype would stay `object`.
dataset_DF["nuclei"] = pd.to_numeric(dataset_DF["nuclei"])

In [None]:
# List the distinct values of every column except the first one (re-run after the cleaning step).
for column_name in dataset_DF.columns[1:]:
  distinct_values = sorted(dataset_DF[column_name].unique())
  print(f"Attribute '{column_name}'\t contains: \t{distinct_values}")

In [None]:
# Column dtypes and non-null counts of the current frame.
dataset_DF.info()

In [None]:
# Preview the first three rows.
dataset_DF.head(3)

In [None]:
# Drop the "sampcode" feature: it is not useful for the analysis carried out here.
dataset_DF = dataset_DF.drop("sampcode", axis=1)
dataset_DF.describe()

In [None]:
# Summary statistics restricted to the benign occurrences in the dataset
dataset_DF.loc[dataset_DF['class'] == "benign"].describe()

In [None]:
# Summary statistics restricted to the malign occurrences in the dataset
dataset_DF.loc[dataset_DF['class'] == "malign"].describe()

In [None]:
# Numeric copy of the frame: the first nine columns plus the class encoded as 0 (benign) / 1 (malign).
# `map` + `astype(int)` yields an integer column explicitly (and fails loudly on an
# unexpected label) instead of relying on `replace` silently downcasting an object
# column, which recent pandas deprecates.
class_codes = dataset_DF["class"].map({"benign": 0, "malign": 1}).astype(int)
encoded = pd.concat([dataset_DF.iloc[:, 0:9], class_codes], axis=1)
encoded

# Visualization

In [None]:
# Bar chart of the number of samples per class, with the count written above each bar.
fig, ax1 = plt.subplots(figsize=(20,10), facecolor = "white")
col_dict = {"benign":"#81c784", "malign": "#e57373"}
# Pass the column through `x=`/`data=`: recent seaborn reads a positional first
# argument as `data`, not as the x variable. Draw on the axes created above.
graph = sns.countplot(x="class", data=dataset_DF, palette=col_dict, ax=ax1)
for p in graph.patches:
    height = p.get_height()
    graph.text(p.get_x()+p.get_width()/2., height + 0.1, height, ha="center")

In [None]:
# Lower-triangular correlation heatmap of the encoded frame (upper triangle and diagonal masked out).
corr_matrix = encoded.corr()
upper_mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
f, ax = plt.subplots(figsize=(15,15), facecolor="white")
sns.heatmap(corr_matrix, mask=upper_mask, annot=True, fmt='.2f', linewidths=0.5, cmap='Reds', square=True, ax=ax);

In [None]:
# One panel per feature: per-class probability histograms with a KDE overlay.
fig, axes = plt.subplots(3, 3, figsize = (20, 20), facecolor = "white")
class_colors = {"benign": "#81c784", "malign": "#e57373"}

for panel, col in zip(axes.flat, dataset_DF.columns[:9]):
  for label, color in class_colors.items():
    sns.histplot(dataset_DF.loc[dataset_DF['class'] == label, col], stat="probability", label=label, color=color, element="bars", discrete=True, kde=True, ax=panel)
  panel.set_title(col)
  # The features are graded 1..10, so the ticks must include 10
  # (np.arange(1, 10) stops at 9).
  panel.set_xticks(np.arange(1, 11, 1))
  panel.legend(loc='upper right')
In [None]:
if viz:
  # 3x3 grid of swarm plots: one feature per panel, split by class.
  plt.subplots(3, 3, figsize = (20, 20), facecolor = "white")
  feature_cols = dataset_DF.columns[:9]
  for position, feature in enumerate(feature_cols, start=1):
    plt.subplot(3, 3, position)
    sns.swarmplot(x="class", y=feature, data=dataset_DF, size=2, palette=col_dict);
    plt.xlabel("Type of Cancer", size=14)

In [None]:
if viz:
  # Pairwise plots of all columns coloured by class, with KDE curves on the diagonal.
  # `col_dict` is the class -> colour mapping defined in the count-plot cell.
  sns.set_style("ticks")
  sns.pairplot(dataset_DF, hue = "class", diag_kind="kde", height=2, palette=col_dict)

# Processing

In [None]:
#splitting data and target dataframe
x = dataset_DF.reset_index(drop=True)
x_unscaled = x
seed = 42
x_unscaled

In [None]:
# Standardise the first nine columns with StandardScaler.
# NOTE(review): the scaler is fitted on the whole dataset, before the train/test
# split, so test-set statistics influence the scaling — consider fitting on the
# training split only.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x.iloc[:, :9])
x_scaled

In [None]:
# Wrap the scaled array in a DataFrame and re-attach the class column (column 9 of the cleaned frame).
feature_columns = ['clump', 'uni_size', 'uni_shape', 'adhesion', 'epith_size', 'nuclei', 'chromatin', 'nucleoli', 'mitoses']
scaled_DF = pd.DataFrame(x_scaled, columns=feature_columns)
labels_column = dataset_DF.iloc[:, 9].reset_index(drop=True)
scaled_DF_with_labels = pd.concat([scaled_DF, labels_column], axis=1)

In [None]:
# Display the standardised feature frame.
scaled_DF

In [None]:
# Box plots of the standardised data (the frame is passed wide-form through `data`).
plt.figure(figsize=(15,11))
ax = sns.boxplot(data=scaled_DF_with_labels, palette='colorblind')

In [None]:
# Feature matrix: the nine standardised columns.
x = scaled_DF_with_labels.iloc[:, :9]
# Target: benign -> 0, malign -> 1.
# `map` + `astype(int)` guarantees an integer target (and fails loudly on an unexpected
# label) instead of relying on `replace` silently downcasting an object column; an
# object-dtype target is not accepted by the scikit-learn classifiers.
y = scaled_DF_with_labels.iloc[:, 9].map({"benign": 0, "malign": 1}).astype(int)

In [None]:

from sklearn.decomposition import PCA

# Fit a full PCA and plot the per-component and cumulative explained-variance ratios.
pca = PCA(random_state=SEED).fit(x)
component_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(component_variance)
# Plot against 1..n so that the x-axis really is the number of components
# (plotting the bare arrays would start the axis at 0).
component_counts = np.arange(1, len(component_variance) + 1)

plt.figure(figsize=(15,11))
plt.plot(component_counts, cumulative_variance)
plt.plot(component_counts, component_variance)
plt.xlabel('number of components')
plt.ylabel('proportion of variance explained');
plt.xticks(component_counts)
plt.yticks(np.arange(0, 1.1, 0.1))
plt.grid()
plt.legend(['Cumulative Variance', 'Component Variance'])
print(cumulative_variance)


In [None]:
# Project the features onto five principal components.
# NOTE(review): the projection is fitted on the whole dataset, before the train/test split.
pca = PCA(n_components = 5)
x_pca = pca.fit_transform(x)

In [None]:
# Wrap the projected array in a DataFrame (default integer column labels) and display it.
x_pca_DF = pd.DataFrame(data=x_pca)
x_pca_DF

In [None]:
# Without PCA the performance is worse, because:
# PCA is a change of variables onto orthogonal directions that capture the correlations in the data.
# Dropping the directions that explain little variance is like removing noise: only the significant information is kept.

In [None]:
# Display the feature matrix.
x

In [None]:
# Display the encoded target.
y

# Classification

In [None]:
# Stratified 70/30 split; the PCA projection replaces the scaled features when pca_flag is set.
features = x_pca_DF if pca_flag else x
X_train_val, X_test, y_train_val, y_test = train_test_split(
  features, y, test_size=0.30, stratify=y, random_state=SEED
)

In [None]:
# Per-model test metrics. Each model's test cell appends, in order: accuracy, F1, precision, recall.
# The final-report cells read indices 0-3, so re-running a test cell without re-running
# this one leaves the older values in those positions.
final_report = {'LoRe': [], 'DTree': [], 'RF': [], 'KNN': [], 'SVM': []}

## Logistic Regression
---

In [None]:
# Stratified 10-fold cross-validation of logistic regression on the train/validation split.
# The classes are imbalanced, so StratifiedKFold is used: every fold keeps approximately
# the class proportions of the complete set.
# The shuffle is seeded so that the reported mean score is reproducible.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

scores = []
for train_index, test_index in kf.split(X_train_val,y_train_val):
    X_train, X_val = X_train_val.iloc[train_index], X_train_val.iloc[test_index]
    y_train, y_val = y_train_val.iloc[train_index], y_train_val.iloc[test_index]

    # SMOTE oversampling is applied to the training fold only.
    sm = SMOTE(random_state=SEED)
    X_train, y_train = sm.fit_resample(X_train, y_train)

    # Classification
    loRe = LogisticRegression(solver='liblinear', multi_class='ovr', C=50)
    loRe.fit(X_train, y_train)
    pred = loRe.predict(X_val)
    # scikit-learn metrics take (y_true, y_pred).
    scores.append(f1_score(y_val, pred))
print(f"mean score {np.mean(scores)}")

In [None]:
# Results on the test set: refit on the SMOTE-balanced train/validation data, evaluate on X_test.
clf = LogisticRegression(solver='liblinear', multi_class='ovr', C=50)

sm = SMOTE(random_state=SEED)
X_res, y_res = sm.fit_resample(X_train_val, y_train_val)
clf.fit(X_res,y_res)

pred = clf.predict(X_test)
# scikit-learn metrics expect (y_true, y_pred): with the arguments reversed,
# precision and recall trade places.
test_metrics = {
  "Accuracy": accuracy_score(y_test, pred),
  "F1": f1_score(y_test, pred),
  "Precision": precision_score(y_test, pred),
  "Recall": recall_score(y_test, pred),
}
# Overwrite rather than append, so re-running the cell cannot leave stale values at indices 0-3.
final_report['LoRe'] = list(test_metrics.values())
for metric_name, metric_value in test_metrics.items():
  print(f"{metric_name} = {metric_value}")
plt.figure(figsize=(15,11), )
confmat(y_test,pred)
print(classification_report(y_test, pred))

In [None]:
if not pca_flag:
  # Logistic-regression coefficients, used here as feature-importance scores.
  importance = clf.coef_[0]
  # Kept under its own name: rebinding `names` would clobber the column-name
  # dict that the dataset-loading cell reads.
  feature_labels = ['clump', 'uni_size', 'uni_shape', 'adhesion', 'epith_size', 'nuclei', 'chromatin', 'nucleoli', 'mitoses']
  plt.figure(figsize=(15,11))
  # summarize feature importance
  for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))
  # plot feature importance
  plt.bar(range(len(importance)), importance)
  plt.xticks(range(len(feature_labels)), feature_labels)

Decision Tree CLF
---

In [None]:
# Stratified 10-fold cross-validation of a depth-3 decision tree on the train/validation split.
# StratifiedKFold keeps approximately the class proportions of the complete set in every fold.
# Both the shuffle and the tree are seeded so that the reported mean score is reproducible.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

scores = []
for train_index, test_index in kf.split(X_train_val,y_train_val):
    X_train, X_val = X_train_val.iloc[train_index], X_train_val.iloc[test_index]
    y_train, y_val = y_train_val.iloc[train_index], y_train_val.iloc[test_index]

    # SMOTE oversampling is applied to the training fold only.
    sm = SMOTE(random_state=SEED)
    X_train, y_train = sm.fit_resample(X_train, y_train)

    # Classification
    dTree = DecisionTreeClassifier(max_depth=3, random_state=SEED)
    dTree.fit(X_train, y_train)
    pred = dTree.predict(X_val)
    # scikit-learn metrics take (y_true, y_pred).
    scores.append(f1_score(y_val, pred))
print(f"mean score {np.mean(scores)}")

In [None]:
# Results on the test set: refit on the SMOTE-balanced train/validation data, evaluate on X_test.
# The tree is seeded so that tie-breaking between equally good splits is reproducible.
clf = DecisionTreeClassifier(max_depth=3, random_state=SEED)

sm = SMOTE(random_state=SEED)
X_res, y_res = sm.fit_resample(X_train_val, y_train_val)
clf.fit(X_res,y_res)

pred = clf.predict(X_test)
# scikit-learn metrics expect (y_true, y_pred): with the arguments reversed,
# precision and recall trade places.
test_metrics = {
  "Accuracy": accuracy_score(y_test, pred),
  "F1": f1_score(y_test, pred),
  "Precision": precision_score(y_test, pred),
  "Recall": recall_score(y_test, pred),
}
# Overwrite rather than append, so re-running the cell cannot leave stale values at indices 0-3.
final_report['DTree'] = list(test_metrics.values())
for metric_name, metric_value in test_metrics.items():
  print(f"{metric_name} = {metric_value}")
plt.figure(figsize=(15,11))

confmat(y_test, pred)

In [None]:
# Labels for the tree plots.
# With pca_flag set the model is trained on principal components, so the original
# feature names would mislabel the split nodes; generic component names are used instead.
if pca_flag:
  feature_names = [f"PC{i + 1}" for i in range(X_train_val.shape[1])]
else:
  feature_names = ['clump', 'uni_size', 'uni_shape', 'adhesion', 'epith_size', 'nuclei', 'chromatin', 'nucleoli', 'mitoses']
class_names = ['benign', 'malign']

# Static rendering of the fitted tree with scikit-learn's plot_tree.
plt.figure(figsize=(28,43))
plot_tree(clf, feature_names=feature_names, class_names=class_names)

In [None]:
# Render the fitted tree with dtreeviz and save it as SVG.
# The result is bound to `tree_viz`, not `viz`: `viz` is the boolean switch that
# gates the optional plots, and rebinding it would make those cells run on a re-run.
# NOTE(review): the predictions `pred` are passed as the target values, not `y_test` — confirm this is intended.
tree_viz = dtreeviz(clf, X_test, pred, target_name="target", feature_names=feature_names, class_names=class_names, orientation='LR', scale=2)

tree_viz.save('decision_tree.svg')

In [None]:
if not pca_flag:
  # Impurity-based feature importances of the fitted tree.
  importance = clf.feature_importances_
  # Kept under its own name: rebinding `names` would clobber the column-name
  # dict that the dataset-loading cell reads.
  feature_labels = ['clump', 'uni_size', 'uni_shape', 'adhesion', 'epith_size', 'nuclei', 'chromatin', 'nucleoli', 'mitoses']
  plt.figure(figsize=(15,11))
  # summarize feature importance
  for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))
  # plot feature importance
  plt.bar(range(len(importance)), importance)
  plt.xticks(range(len(feature_labels)), feature_labels)

Random Forest CLF
---

In [None]:
if tune:
  # Hyper-parameters to tune.
  estimators_list = [50, 70, 90, 100, 110, 130, 140]
  # "auto" is left out: for classifiers it was an alias of "sqrt", and
  # scikit-learn >= 1.3 rejects it.
  features_list = ["log2", "sqrt"]
  sample_leafs_list = [2, 4, 8, 10]
  hyperparameters = dict(n_estimators=estimators_list, max_features=features_list, min_samples_leaf = sample_leafs_list)
  # Base random forest (seeded so the search is reproducible).
  rfc = RandomForestClassifier(max_depth=3, n_jobs=-1, random_state=SEED)
  # Exhaustive 10-fold grid search scored with F1.
  clf = GridSearchCV(rfc, hyperparameters, cv=10, n_jobs=-1, scoring = scoring)
  # Search on the train/validation split only: fitting on the full data would let the
  # held-out test rows influence the chosen parameters, and `X_train_val` already holds
  # the representation (scaled or PCA) that the later cells train on.
  best_model = clf.fit(X_train_val, y_train_val)
  # Print the best hyper-parameters.
  print('Best estimator:', best_model.best_params_['n_estimators'])
  print('Best feature:', best_model.best_params_['max_features'])
  print('Best sample leaf:', best_model.best_params_['min_samples_leaf'])

In [None]:
# Stratified 10-fold cross-validation of the random forest on the train/validation split.
# The classes are imbalanced, so StratifiedKFold is used: every fold keeps approximately
# the class proportions of the complete set.
# The shuffle is seeded so that the reported mean score is reproducible.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

# Hyper-parameters: hard-coded defaults, overridden by the grid-search result when tuning is on.
rf_params = dict(max_depth=3, n_estimators=90, min_samples_leaf=10, max_features='sqrt')
if tune:
  tuned = best_model.best_estimator_.get_params()
  rf_params.update({key: tuned[key] for key in ('n_estimators', 'min_samples_leaf', 'max_features')})

scores = []
for train_index, test_index in kf.split(X_train_val,y_train_val):
    X_train, X_val = X_train_val.iloc[train_index], X_train_val.iloc[test_index]
    y_train, y_val = y_train_val.iloc[train_index], y_train_val.iloc[test_index]

    # SMOTE oversampling is applied to the training fold only.
    sm = SMOTE(random_state=SEED)
    X_train, y_train = sm.fit_resample(X_train, y_train)

    # Classification
    clf = RandomForestClassifier(random_state=SEED, n_jobs=-1, **rf_params)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_val)
    # scikit-learn metrics take (y_true, y_pred).
    scores.append(f1_score(y_val, pred))
print(f"mean score {np.mean(scores)}")

In [None]:
# Results on the test set: refit on the SMOTE-balanced train/validation data, evaluate on X_test.
# The tuned model keeps max_depth=3 and n_jobs=-1 as well, so it is the same
# configuration that the grid search and the cross-validation evaluated.
rf_params = dict(max_depth=3, n_estimators=90, min_samples_leaf=10, max_features='sqrt')
if tune:
  tuned = best_model.best_estimator_.get_params()
  rf_params.update({key: tuned[key] for key in ('n_estimators', 'min_samples_leaf', 'max_features')})
clf = RandomForestClassifier(random_state=SEED, n_jobs=-1, **rf_params)

sm = SMOTE(random_state=SEED)
X_res, y_res = sm.fit_resample(X_train_val, y_train_val)
clf.fit(X_res,y_res)

pred = clf.predict(X_test)
# scikit-learn metrics expect (y_true, y_pred): with the arguments reversed,
# precision and recall trade places.
test_metrics = {
  "Accuracy": accuracy_score(y_test, pred),
  "F1": f1_score(y_test, pred),
  "Precision": precision_score(y_test, pred),
  "Recall": recall_score(y_test, pred),
}
# Overwrite rather than append, so re-running the cell cannot leave stale values at indices 0-3.
final_report['RF'] = list(test_metrics.values())
for metric_name, metric_value in test_metrics.items():
  print(f"{metric_name} = {metric_value}")
plt.figure(figsize=(15,11))
confmat(y_test, pred)

In [None]:
if not pca_flag:
  # Impurity-based feature importances of the fitted forest.
  importance = clf.feature_importances_
  # Kept under its own name: rebinding `names` would clobber the column-name
  # dict that the dataset-loading cell reads.
  feature_labels = ['clump', 'uni_size', 'uni_shape', 'adhesion', 'epith_size', 'nuclei', 'chromatin', 'nucleoli', 'mitoses']
  plt.figure(figsize=(15,11))
  # summarize feature importance
  for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))
  # plot feature importance
  plt.bar(range(len(importance)), importance)
  plt.xticks(range(len(feature_labels)), feature_labels)

KNeighbors CLF
---

In [None]:
if tune:
  # Hyper-parameters to tune.
  leaf_size = list(range(1,50))
  n_neighbors = list(range(1,30))
  algorithm_list = ['auto', 'ball_tree', 'kd_tree', 'brute']
  p=[1,2,3]
  hyperparameters = dict(leaf_size=leaf_size, algorithm = algorithm_list, n_neighbors=n_neighbors, p=p)
  # Base KNN classifier.
  knn = KNeighborsClassifier()
  # Exhaustive 10-fold grid search scored with F1.
  clf = GridSearchCV(knn, hyperparameters, cv=10, n_jobs=-1, scoring= scoring)
  # Search on the train/validation split only: fitting on the full data would let the
  # held-out test rows influence the chosen parameters, and `X_train_val` already holds
  # the representation (scaled or PCA) that the later cells train on.
  best_model = clf.fit(X_train_val, y_train_val)
  # Print the best hyper-parameters.
  print('Best leaf_size:', best_model.best_params_['leaf_size'])
  print('Best p:', best_model.best_params_['p'])
  print('Best n_neighbors:', best_model.best_params_['n_neighbors'])
In [None]:
# Stratified 10-fold cross-validation of KNN on the train/validation split.
# The classes are imbalanced, so StratifiedKFold is used: every fold keeps approximately
# the class proportions of the complete set.
# The shuffle is seeded so that the reported mean score is reproducible.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

# Hyper-parameters: hard-coded defaults, replaced by the grid-search result when tuning is on.
knn_params = dict(leaf_size=1, p=1, n_neighbors=15)
if tune:
  tuned = best_model.best_estimator_.get_params()
  knn_params = {key: tuned[key] for key in ('leaf_size', 'p', 'n_neighbors')}

scores = []
for train_index, test_index in kf.split(X_train_val,y_train_val):
    X_train, X_val = X_train_val.iloc[train_index], X_train_val.iloc[test_index]
    y_train, y_val = y_train_val.iloc[train_index], y_train_val.iloc[test_index]

    # SMOTE oversampling is applied to the training fold only.
    sm = SMOTE(random_state=SEED)
    X_train, y_train = sm.fit_resample(X_train, y_train)

    # Classification
    knn = KNeighborsClassifier(**knn_params)
    knn.fit(X_train, y_train)
    pred = knn.predict(X_val)
    # scikit-learn metrics take (y_true, y_pred).
    scores.append(f1_score(y_val, pred))
print(f"mean score {np.mean(scores)}")

In [None]:
# Results on the test set: refit on the SMOTE-balanced train/validation data, evaluate on X_test.
knn_params = dict(leaf_size=1, p=1, n_neighbors=15)
if tune:
  tuned = best_model.best_estimator_.get_params()
  knn_params = {key: tuned[key] for key in ('leaf_size', 'p', 'n_neighbors')}
clf = KNeighborsClassifier(**knn_params)

sm = SMOTE(random_state=SEED)
X_res, y_res = sm.fit_resample(X_train_val, y_train_val)
clf.fit(X_res,y_res)

# Predict with the model fitted just above; `knn` is the leftover estimator
# from the last cross-validation fold and was trained on nine tenths of the data.
pred = clf.predict(X_test)
# scikit-learn metrics expect (y_true, y_pred): with the arguments reversed,
# precision and recall trade places.
test_metrics = {
  "Accuracy": accuracy_score(y_test, pred),
  "F1": f1_score(y_test, pred),
  "Precision": precision_score(y_test, pred),
  "Recall": recall_score(y_test, pred),
}
# Overwrite rather than append, so re-running the cell cannot leave stale values at indices 0-3.
final_report['KNN'] = list(test_metrics.values())
for metric_name, metric_value in test_metrics.items():
  print(f"{metric_name} = {metric_value}")
plt.figure(figsize=(15,11))
confmat(y_test, pred)

In [None]:
if not pca_flag:
  plt.figure(figsize=(15,11))
  # KNN has no built-in importances, so permutation importance is used, computed
  # on the resampled training data. Seeded so the shuffles are reproducible.
  results = permutation_importance(clf, X_res, y_res, scoring='accuracy', random_state=SEED)
  # get importance
  importance = results.importances_mean
  # Kept under its own name: rebinding `names` would clobber the column-name
  # dict that the dataset-loading cell reads.
  feature_labels = ['clump', 'uni_size', 'uni_shape', 'adhesion', 'epith_size', 'nuclei', 'chromatin', 'nucleoli', 'mitoses']
  # summarize feature importance
  for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))
  # plot feature importance
  plt.xticks(range(len(feature_labels)), feature_labels)
  plt.bar(range(len(importance)), importance)

Support Vector Machine CLF
---

In [None]:
if tune:
  # Hyper-parameters to tune.
  C_list = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
  # NOTE(review): gamma is not used by the linear kernel, so these values only
  # multiply the number of fits; the list is kept so that 'Best gamma' is still reported.
  G_list = [0.0001,0.001,0.005,0.01,0.05,0.1,0.5,1,10,100,1000]
  kernel_list = ['linear']
  hyperparameters = dict(C=C_list, gamma=G_list, kernel = kernel_list)
  # Base support-vector classifier.
  svc_CLF = SVC()
  # Exhaustive 10-fold grid search scored with F1.
  clf = GridSearchCV(svc_CLF, hyperparameters, cv=10, n_jobs=-1, scoring = scoring)
  # Search on the train/validation split only: fitting on the full data would let the
  # held-out test rows influence the chosen parameters, and `X_train_val` already holds
  # the representation (scaled or PCA) that the later cells train on.
  best_model = clf.fit(X_train_val, y_train_val)
  # Print the best hyper-parameters.
  print('Best C:', best_model.best_params_['C'])
  print('Best gamma:', best_model.best_params_['gamma'])
  print('Best kernel:', best_model.best_params_['kernel'])
In [None]:
# Stratified 10-fold cross-validation of the SVM on the train/validation split.
# The classes are imbalanced, so StratifiedKFold is used: every fold keeps approximately
# the class proportions of the complete set.
# The shuffle is seeded so that the reported mean score is reproducible.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

# Hyper-parameters: hard-coded defaults, replaced by the grid-search result when tuning is on.
svm_params = dict(C=1, gamma=0.0001, kernel='linear')
if tune:
  tuned = best_model.best_estimator_.get_params()
  svm_params = {key: tuned[key] for key in ('C', 'gamma', 'kernel')}

scores = []
for train_index, test_index in kf.split(X_train_val,y_train_val):
    X_train, X_val = X_train_val.iloc[train_index], X_train_val.iloc[test_index]
    y_train, y_val = y_train_val.iloc[train_index], y_train_val.iloc[test_index]

    # SMOTE oversampling is applied to the training fold only.
    sm = SMOTE(random_state=SEED)
    X_train, y_train = sm.fit_resample(X_train, y_train)

    # Classification
    clf = SVC(**svm_params)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_val)
    # scikit-learn metrics take (y_true, y_pred).
    scores.append(f1_score(y_val, pred))
print(f"mean score {np.mean(scores)}")

In [None]:
# Results on the test set: refit on the SMOTE-balanced train/validation data, evaluate on X_test.
svm_params = dict(C=1, gamma=0.0001, kernel='linear')
if tune:
  tuned = best_model.best_estimator_.get_params()
  svm_params = {key: tuned[key] for key in ('C', 'gamma', 'kernel')}
clf = SVC(**svm_params)

sm = SMOTE(random_state=SEED)
X_res, y_res = sm.fit_resample(X_train_val, y_train_val)
clf.fit(X_res,y_res)

pred = clf.predict(X_test)
# scikit-learn metrics expect (y_true, y_pred): with the arguments reversed,
# precision and recall trade places.
test_metrics = {
  "Accuracy": accuracy_score(y_test, pred),
  "F1": f1_score(y_test, pred),
  "Precision": precision_score(y_test, pred),
  "Recall": recall_score(y_test, pred),
}
# Overwrite rather than append, so re-running the cell cannot leave stale values at indices 0-3.
final_report['SVM'] = list(test_metrics.values())
for metric_name, metric_value in test_metrics.items():
  print(f"{metric_name} = {metric_value}")
plt.figure(figsize=(15,11))
confmat(y_test, pred)

In [None]:
if not pca_flag:
  # Weights of the linear SVM, used here as feature-importance scores
  # (SVC exposes `coef_` only for the linear kernel).
  importance = clf.coef_[0]
  # Kept under its own name: rebinding `names` would clobber the column-name
  # dict that the dataset-loading cell reads.
  feature_labels = ['clump', 'uni_size', 'uni_shape', 'adhesion', 'epith_size', 'nuclei', 'chromatin', 'nucleoli', 'mitoses']
  plt.figure(figsize=(15,11))
  # summarize feature importance
  for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))
  # plot feature importance (the ticks are set once, not on every loop iteration)
  plt.bar(range(len(importance)), importance)
  plt.xticks(range(len(feature_labels)), feature_labels)

# Final Report

In [None]:
# Display the raw per-model metric lists.
final_report

In [None]:
# Split the report into one list per metric, in model order.
# Each model's list is laid out as [accuracy, f1, precision, recall].
models = list(final_report)
accuracies = [final_report[model][0] for model in models]
f1s = [final_report[model][1] for model in models]
precisions = [final_report[model][2] for model in models]
recalls = [final_report[model][3] for model in models]

In [None]:
# Metrics table: one row per model, one column per metric.
metric_columns = {
  'accuracy': accuracies,
  'f1': f1s,
  'precision': precisions,
  'recall': recalls,
}
scores = pd.DataFrame(metric_columns, index=models)
scores

In [None]:
# One bar chart per metric, comparing the models, with each value written above its bar.
for metric in ['accuracy', 'f1', 'precision','recall']:
  plt.figure(figsize=(15,11))
  ax = sns.barplot(x=models,y=scores[metric],palette='icefire')
  for bar in ax.patches:
    bar_height = bar.get_height()
    # The original passed a stray second argument ('f') to str.format, which was ignored.
    ax.annotate(f"{bar_height:.4f}", (bar.get_x() + bar.get_width() / 2., bar_height), ha='center', va='center', xytext=(0, 10), textcoords='offset points')

  plt.show()