# Breast Cancer Analysis with Support Vector Machine

A simple script that uses a Support Vector Machine (SVM) to predict whether a breast cancer tumor is malignant or benign.

### Imports

In [1]:
import sklearn
import numpy
import time
from sklearn import svm, metrics
from sklearn.model_selection import GridSearchCV
from pandas import read_csv 
from multiprocessing import Pool
import matplotlib.pyplot as plt
import seaborn as sns
from run_svm import run_svm

### Config Parameters

In [2]:
# Repetition count handed to run_svm; the baseline and final accuracy runs
# use LOOP_COUNT * MULTIPLIER repetitions instead.
LOOP_COUNT = 10
MULTIPLIER = 10

# plot_data() only draws the pair plot when this is True.
SHOW_PLOT = False

# Separator line printed by print_pretty() after its arguments.
LINE = "---------------------------------------------------"

# Hyper-parameter grid searched by find_best_params().
# 'poly' is not listed in KERNEL_LIST because it is searched separately,
# combined with every degree in DEGREE_LIST.
KERNEL_LIST = ['rbf', 'sigmoid', 'linear']
C_LIST = [0.1, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]
# NOTE: 'auto' is a possible second entry here; it is currently left out.
GAMMA_LIST = ['scale']
DEGREE_LIST = list(range(1, 11))

### print_pretty()
Function for printing output followed by a separator line.

In [3]:
def print_pretty(*args):
    """Print every argument on its own line, then the separator ``LINE``.

    Args:
        *args: Objects to print, one per line, in the order given.
    """
    # Treat the separator as one more item so a single loop covers both.
    for item in (*args, LINE):
        print(item)

### load_data()
Function for loading data from a csv and saving it in a data-frame for further use.

In [4]:
def load_data(path='breast-cancer.csv'):
    """Load the breast-cancer CSV and split it into features and target.

    Args:
        path: Location of the CSV file. The file must contain an ``id``
            column and a ``diagnosis`` column holding ``'M'`` or ``'B'``.
            Defaults to ``'breast-cancer.csv'`` in the working directory.

    Returns:
        A tuple ``(data_frame, data, target)``:
        ``data_frame`` is the frame exactly as read from the file,
        ``data`` is that frame without the ``id`` and ``diagnosis`` columns,
        ``target`` is the ``diagnosis`` column encoded as 1 for ``'M'`` and
        0 for ``'B'``.

    Raises:
        ValueError: If ``diagnosis`` contains a value other than ``'M'`` or
            ``'B'`` (including missing values).
    """
    data_frame = read_csv(path)

    # The id column carries no predictive information and the diagnosis is
    # the label, so neither belongs in the feature matrix.
    data = data_frame.drop(columns=['diagnosis', 'id'])

    # map() yields an integer Series directly. replace() on a string column
    # relies on implicit downcasting of the object result, which newer pandas
    # versions deprecate.
    target = data_frame['diagnosis'].map({'M': 1, 'B': 0})
    if target.isna().any():
        unexpected = sorted(
            set(data_frame.loc[target.isna(), 'diagnosis'].astype(str)))
        raise ValueError(
            "Unexpected diagnosis label(s): {}".format(", ".join(unexpected)))

    return data_frame, data, target

### find_best_params()
Uses GridSearchCV for finding the best parameters for the SVM (Hyper-Parameter-Tuning)

In [5]:
def find_best_params(data, target, test_size=0.25, random_state=None):
    """Search the SVM hyper-parameter grid and return the best combination.

    Two grids are searched with ``GridSearchCV``: the ``'poly'`` kernel
    combined with every degree in ``DEGREE_LIST``, and the kernels in
    ``KERNEL_LIST`` without a degree. Both use ``C_LIST`` and ``GAMMA_LIST``.
    The search is fitted on a shuffled training split only; the held-out
    part of the split is not used here.

    Args:
        data: Feature matrix.
        target: Labels matching the rows of ``data``.
        test_size: Share of the samples held out of the search. Defaults to
            0.25, the value this function always used.
        random_state: Seed for the shuffle. ``None`` (the default) keeps the
            split random on every call; pass an int for a reproducible
            search.

    Returns:
        The ``best_params_`` dict of the fitted grid search.
    """
    # Import explicitly instead of reaching through ``sklearn.model_selection``,
    # which is only available as an attribute when something else has already
    # imported that submodule.
    from sklearn.model_selection import train_test_split

    grid_params = [
        {
            'kernel': ['poly'],
            'C': C_LIST,
            'gamma': GAMMA_LIST,
            'degree': DEGREE_LIST,
        },
        {
            'kernel': KERNEL_LIST,
            'C': C_LIST,
            'gamma': GAMMA_LIST,
        },
    ]
    clf = GridSearchCV(svm.SVC(), grid_params, n_jobs=-1, verbose=3)

    # Only the training part of the split is needed for the search.
    data_train, _, target_train, _ = train_test_split(
        data,
        target,
        test_size=test_size,
        shuffle=True,
        random_state=random_state,
    )
    clf.fit(data_train, target_train)

    return clf.best_params_

### plot_data()
Function for plotting the first 5 columns of the given data-frame.

In [6]:
def plot_data(data_frame):
    """Show a seaborn pair plot of five mean features, coloured by diagnosis.

    Does nothing unless the module-level ``SHOW_PLOT`` flag is truthy.

    Args:
        data_frame: Frame containing the ``diagnosis`` column and the five
            feature columns listed below.
    """
    if not SHOW_PLOT:
        return

    plotted_columns = [
        "radius_mean",
        "texture_mean",
        "perimeter_mean",
        "area_mean",
        "smoothness_mean",
    ]
    sns.pairplot(data_frame, hue="diagnosis", vars=plotted_columns)
    plt.show()

### drop_columns()
Tests the accuracy with each feature dropped in turn. If the accuracy is lower than the default one, the column is added to a list, which is returned at the end.

In [7]:
def drop_columns(data, target, default_acc, best_params):
    """Evaluate the SVM once per feature, each time with that feature removed.

    One ``run_svm`` call is prepared per column of ``data`` and the calls are
    distributed over a process pool. Every call receives the data without
    that column, ``target``, ``default_acc``, the column name, ``False``,
    ``LOOP_COUNT`` and ``best_params`` as positional arguments.

    Args:
        data: Feature matrix; iterating it must yield the column labels.
        target: Labels matching the rows of ``data``.
        default_acc: Reference accuracy obtained with all features.
        best_params: SVM parameters forwarded to ``run_svm``.

    Returns:
        A list with every non-``None`` value returned by ``run_svm``, in
        column order. Which columns ``run_svm`` reports is decided inside
        ``run_svm`` itself (not visible here); the caller treats the values
        as column labels to drop.
    """
    jobs = [
        (data.drop(labels=[column], axis=1), target,
         default_acc, column, False, LOOP_COUNT, best_params)
        for column in data
    ]

    with Pool() as p:
        results = p.starmap(run_svm, jobs)

    # run_svm returns None for columns it does not report; filter those out.
    return [result for result in results if result is not None]

### Main Function
Runs all necessary code

In [8]:
# Script entry point. The guard keeps the pipeline from re-running when the
# module is imported, e.g. by the worker processes started in drop_columns().
if __name__ == "__main__":

    # Wall-clock start, used for the elapsed time reported at the end.
    begin = time.time()

    print_pretty("\n\n\nStarting")

    # Read the csv file and split it into features and target.
    data_frame, data, target = load_data()

    # Plot example data from the first 5 columns (only when SHOW_PLOT is set).
    plot_data(data_frame)

    # Hyper-parameter tuning with GridSearchCV.
    best_params = find_best_params(data, target)
    print_pretty(best_params)

    # Baseline: accuracy with all features.
    default_acc = run_svm(
        data,
        target,
        return_mean=True,
        repeats=LOOP_COUNT * MULTIPLIER,
        params=best_params,
    )
    print_pretty("Accuracy with all features: ", default_acc)

    # Columns to remove before the final accuracy run.
    dropped_columns = drop_columns(data, target, default_acc, best_params)
    print_pretty("Dropped Columns: ", dropped_columns)

    final_data = data.drop(labels=dropped_columns, axis=1)

    # Accuracy with the reported columns removed.
    final_acc = run_svm(
        final_data,
        target,
        return_mean=True,
        repeats=MULTIPLIER * LOOP_COUNT,
        params=best_params,
    )
    print_pretty("Accuracy with dropped features: ", final_acc)

    end = time.time()
    print_pretty("Done. Time Taken: ", end - begin)




Starting
---------------------------------------------------
Fitting 5 folds for each of 143 candidates, totalling 715 fits
{'C': 5, 'gamma': 'scale', 'kernel': 'linear'}
---------------------------------------------------
