## Breast cancer prediagnosis - machine learning

![alt text](https://lh5.googleusercontent.com/tQSIQYMMyNy4X-EsYF6WH2aA90otBV8VlBBairvc5XQegHh7lZ6JxESjD7WBWt71mGGMAYLlyLNPymN0UlyP-nlz1quGd59GO9_aaUtSQWNbh_yg8_3CNSERdO3S8dKFBggco_yN)

# **Load data**

**Download data**

In [1]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data

--2020-06-26 10:18:29--  https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 124103 (121K) [application/x-httpd-php]
Saving to: ‘wdbc.data’


2020-06-26 10:18:30 (290 KB/s) - ‘wdbc.data’ saved [124103/124103]



In [11]:
import pandas as pd

# Column names for wdbc.data. The first two columns are the record id and the diagnosis
# label; the remaining 30 are ten measurements, each present in a `_mean`, `_se` and
# `_worst` variant. All names use underscores (previously "concave points_se" and
# "concave points_worst" contained a space, unlike "concave_points_mean").
column_names = [
    "id", "diagnosis",
    # *_mean
    "radius_mean", "texture_mean", "perimeter_mean", "area_mean", "smoothness_mean",
    "compactness_mean", "concavity_mean", "concave_points_mean", "symmetry_mean",
    "fractal_dimension_mean",
    # *_se
    "radius_se", "texture_se", "perimeter_se", "area_se", "smoothness_se",
    "compactness_se", "concavity_se", "concave_points_se", "symmetry_se",
    "fractal_dimension_se",
    # *_worst
    "radius_worst", "texture_worst", "perimeter_worst", "area_worst", "smoothness_worst",
    "compactness_worst", "concavity_worst", "concave_points_worst", "symmetry_worst",
    "fractal_dimension_worst",
]

# header=None: the first line of the file is read as data, and the names above are used
# as column labels. The "id" column (position 0) becomes the index, which keeps it out of
# the feature columns.
data = pd.read_csv("wdbc.data", names=column_names, header=None, index_col=[0])

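**Compute uniformity and homogeneity**

`uniformity` is `radius_worst - radius_mean`, and homogeneity is `symmetry_worst - symmetry_mean` (stored in the column named `homogenity`).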

In [12]:
# Derived features, computed as whole-column (vectorised) subtractions rather than a
# Python-level row-by-row `apply`; the resulting values are the same.
# uniformity: radius_worst minus radius_mean.
data['uniformity'] = data['radius_worst'] - data['radius_mean']
# homogenity (column name kept as spelled): symmetry_worst minus symmetry_mean.
data['homogenity'] = data['symmetry_worst'] - data['symmetry_mean']

In [13]:
# Preview five randomly chosen rows. The fixed random_state makes the same rows appear on
# every run, so the displayed table is reproducible after a kernel restart.
data.sample(n=5, random_state=2020)

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,uniformity,homogenity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
926682,M,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,0.7655,2.463,5.203,99.04,0.005769,0.02423,0.0395,0.01678,0.01898,0.002498,23.69,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637,3.56,0.082
889403,M,15.61,19.38,100.0,758.6,0.0784,0.05616,0.04209,0.02847,0.1547,0.05443,0.2298,0.9988,1.534,22.18,0.002826,0.009105,0.01311,0.005174,0.01013,0.001345,17.91,31.67,115.9,988.6,0.1084,0.1807,0.226,0.08568,0.2683,0.06829,2.3,0.1136
8711803,M,19.19,15.94,126.3,1157.0,0.08694,0.1185,0.1193,0.09667,0.1741,0.05176,1.0,0.6336,6.971,119.3,0.009406,0.03055,0.04344,0.02794,0.03156,0.003362,22.03,17.81,146.6,1495.0,0.1124,0.2016,0.2264,0.1777,0.2443,0.06251,2.84,0.0702
871641,B,11.08,14.71,70.21,372.7,0.1006,0.05743,0.02363,0.02583,0.1566,0.06669,0.2073,1.805,1.377,19.08,0.01496,0.02121,0.01453,0.01583,0.03082,0.004785,11.35,16.82,72.01,396.5,0.1216,0.0824,0.03938,0.04306,0.1902,0.07313,0.27,0.0336
894618,M,20.16,19.66,131.1,1274.0,0.0802,0.08564,0.1155,0.07726,0.1928,0.05096,0.5925,0.6863,3.868,74.85,0.004536,0.01376,0.02645,0.01247,0.02193,0.001589,23.06,23.03,150.2,1657.0,0.1054,0.1537,0.2606,0.1425,0.3055,0.05933,2.9,0.1127


In [14]:
# Encode the label as an integer: "B" -> 1, every other value -> 0 (the only other label
# visible in the data preview is "M").
# The precision/recall scores computed later use scikit-learn's default pos_label=1, so
# they describe the "B" class.
# An already-encoded 1 is accepted too, which makes re-running this cell a no-op; with a
# plain `x == "B"` test a second run would silently turn every label into 0.
data['diagnosis'] = [1 if x in ("B", 1) else 0 for x in data['diagnosis']]

In [15]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import fbeta_score, recall_score, precision_score, accuracy_score

In [16]:
# K-fold cross-validation of an AdaBoost classifier on the whole table.
# Per-fold scores are summed and divided by the number of folds to report averages.
split_number = 10
seed = 2020  # fixed seed so that repeated runs fit the same models and print the same scores
kf = KFold(n_splits=split_number)  # default settings: no shuffling, folds are consecutive row ranges

sum_f2 = 0
sum_precision = 0
sum_recall = 0
sum_accu = 0

# The feature/label split does not depend on the fold, so it is done once up front.
features = data.drop('diagnosis', axis=1)
labels = data['diagnosis']

for train_index, test_index in kf.split(data):
  #Train
  train_X = features.iloc[train_index]
  train_y = labels.iloc[train_index]
  # Alternative model (disabled):
  #model = MLPClassifier(hidden_layer_sizes=(128,),max_iter=155000,batch_size=120,random_state=2020,learning_rate_init=0.0001,n_iter_no_change=50)
  model = AdaBoostClassifier(random_state=seed)
  model.fit(train_X, train_y)

  #Test
  test_X = features.iloc[test_index]
  test_y = labels.iloc[test_index]

  predicted_y = model.predict(test_X)

  # Precision and recall use the default binary averaging (positive class = label 1),
  # whereas this F2 score is macro-averaged over both classes.
  # NOTE(review): sum_f2 is accumulated but never reported; the F2 printed below is
  # derived from the averaged precision and recall instead. Confirm which one is intended.
  f2 = fbeta_score(test_y, predicted_y, average='macro', beta=2.0)
  recall = recall_score(test_y, predicted_y)
  precision = precision_score(test_y, predicted_y)
  accuracy = accuracy_score(test_y, predicted_y)

  sum_f2 += f2
  sum_precision += precision
  sum_recall += recall
  sum_accu += accuracy

# Averages over the folds.
mean_accuracy = sum_accu / split_number
mean_recall = sum_recall / split_number
mean_precision = sum_precision / split_number

print("Accuracy: %f" % mean_accuracy)
print("Recall: %f" %  mean_recall)
print("Precision: %f" %  mean_precision)
print("---------------------------")
# F-beta with beta = 2: (1 + 2^2) * P * R / (2^2 * P + R), evaluated on the fold-averaged P and R.
print("F2.0: %f" % (5 * (mean_precision * mean_recall) / (4 * mean_precision + mean_recall)))


Accuracy: 0.975407
Recall: 0.988060
Precision: 0.964128
---------------------------
F2.0: 0.983179
