In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix , ConfusionMatrixDisplay , classification_report , accuracy_score ,precision_recall_curve , roc_curve ,roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

In [11]:
# Load the raw table from the CSV that sits next to the notebook.
df = pd.read_csv('./data.csv')

# Candidate predictors: every column from position 2 up to, but not including,
# the last one.
# NOTE(review): presumably this skips an id column, the label column and a
# trailing junk column -- confirm against the CSV header.
vars = df.iloc[:, 2:-1]

# Binary target taken from the column at position 1: 'M' is encoded as 1 and
# any other value as 0.
target = (df.iloc[:, 1] == 'M').astype('int64')

In [12]:
# Label column and the frame without it. Neither is used by the rest of this
# cell; feature selection below works on `vars` / `target`.
y = df['diagnosis']
X = df.drop(columns=['diagnosis'])

# Rank features with the ANOVA F-test (`f_classif`) and keep the k=15 best.
# NOTE(review): the selector is fitted on every row of `vars`, i.e. before the
# train/test split done further down -- consider fitting on training rows only
# to avoid selection leakage.
selector = SelectKBest(score_func=f_classif, k=15).fit(vars, target)

# Per-feature F statistic and its p-value, in the column order of `vars`.
scores, p_values = selector.scores_, selector.pvalues_

# One row per feature, highest F score first.
results = pd.DataFrame(
    {'Feature': vars.columns, 'Score': scores, 'p-value': p_values}
).sort_values(by='Score', ascending=False)

# Boolean mask of the kept columns -> their names, in original column order.
selected_mask = selector.get_support()
top_features = vars.columns[selected_mask]

# Show the selected feature names, then the full ranking.
print(top_features)
print(results)

Index(['radius_mean', 'perimeter_mean', 'area_mean', 'compactness_mean',
       'concavity_mean', 'concave points_mean', 'radius_se', 'perimeter_se',
       'area_se', 'radius_worst', 'perimeter_worst', 'area_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst'],
      dtype='object')
                    Feature       Score        p-value
27     concave points_worst  964.385393  1.969100e-124
22          perimeter_worst  897.944219  5.771397e-119
7       concave points_mean  861.676020  7.101150e-116
20             radius_worst  860.781707  8.482292e-116
2            perimeter_mean  697.235272  8.436251e-101
23               area_worst  661.600206   2.828848e-97
0               radius_mean  646.981021   8.465941e-96
3                 area_mean  573.060747   4.734564e-88
6            concavity_mean  533.793126   9.966556e-84
26          concavity_worst  436.691939   2.464664e-72
5          compactness_mean  313.233079   3.938263e-56
25        compactness_worst  

In [13]:
# Narrow the feature matrix to the columns picked by the selector, then show it.
vars = vars[top_features]
vars

Unnamed: 0,radius_mean,perimeter_mean,area_mean,compactness_mean,concavity_mean,concave points_mean,radius_se,perimeter_se,area_se,radius_worst,perimeter_worst,area_worst,compactness_worst,concavity_worst,concave points_worst
0,17.99,122.80,1001.0,0.27760,0.30010,0.14710,1.0950,8.589,153.40,25.380,184.60,2019.0,0.66560,0.7119,0.2654
1,20.57,132.90,1326.0,0.07864,0.08690,0.07017,0.5435,3.398,74.08,24.990,158.80,1956.0,0.18660,0.2416,0.1860
2,19.69,130.00,1203.0,0.15990,0.19740,0.12790,0.7456,4.585,94.03,23.570,152.50,1709.0,0.42450,0.4504,0.2430
3,11.42,77.58,386.1,0.28390,0.24140,0.10520,0.4956,3.445,27.23,14.910,98.87,567.7,0.86630,0.6869,0.2575
4,20.29,135.10,1297.0,0.13280,0.19800,0.10430,0.7572,5.438,94.44,22.540,152.20,1575.0,0.20500,0.4000,0.1625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,142.00,1479.0,0.11590,0.24390,0.13890,1.1760,7.673,158.70,25.450,166.10,2027.0,0.21130,0.4107,0.2216
565,20.13,131.20,1261.0,0.10340,0.14400,0.09791,0.7655,5.203,99.04,23.690,155.00,1731.0,0.19220,0.3215,0.1628
566,16.60,108.30,858.1,0.10230,0.09251,0.05302,0.4564,3.425,48.55,18.980,126.70,1124.0,0.30940,0.3403,0.1418
567,20.60,140.10,1265.0,0.27700,0.35140,0.15200,0.7260,5.772,86.22,25.740,184.60,1821.0,0.86810,0.9387,0.2650


In [14]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# `stratify=target` keeps the class proportions of `target` the same in both
# partitions. A fixed `random_state` seeds the shuffle so the same rows land in
# each partition on every Restart & Run All; without it the split (and all
# outputs derived from it) changes from run to run.
train_features, test_features, train_targets, test_targets = train_test_split(
    vars,
    target,
    train_size=0.8,
    test_size=0.2,
    stratify=target,
    random_state=42,
)

In [15]:
# Render the four pieces of the split in order: train/test features, then
# train/test targets.
display(train_features, test_features, train_targets, test_targets)

Unnamed: 0,radius_mean,perimeter_mean,area_mean,compactness_mean,concavity_mean,concave points_mean,radius_se,perimeter_se,area_se,radius_worst,perimeter_worst,area_worst,compactness_worst,concavity_worst,concave points_worst
476,14.20,92.41,618.4,0.11080,0.05063,0.030580,0.3478,2.749,31.01,16.45,112.10,828.5,0.34290,0.25120,0.13390
160,11.75,76.10,419.8,0.11410,0.06843,0.037380,0.5018,3.926,38.34,13.32,88.91,543.9,0.18920,0.19560,0.07909
255,13.96,91.43,602.4,0.12790,0.09789,0.052460,0.4250,2.563,35.74,16.39,108.10,826.0,0.32620,0.32090,0.13740
344,11.71,75.03,420.3,0.07281,0.04006,0.032500,0.3446,2.355,24.53,13.06,84.16,516.4,0.11150,0.10870,0.07864
200,12.23,78.54,461.0,0.08087,0.04187,0.041070,0.3534,2.308,27.24,14.44,92.15,638.4,0.20420,0.13770,0.10800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,10.03,63.19,307.3,0.03912,0.00247,0.005159,0.1851,1.184,11.60,11.11,69.92,376.3,0.07094,0.01235,0.02579
292,12.95,83.14,513.7,0.07943,0.06155,0.033700,0.2094,1.231,17.67,13.74,88.81,585.4,0.20680,0.22410,0.10560
38,14.99,95.54,698.8,0.05131,0.02398,0.028990,1.2140,8.077,106.00,14.99,95.54,698.8,0.05131,0.02398,0.02899
530,11.75,75.89,422.9,0.09713,0.05282,0.044400,0.4384,3.149,30.66,13.50,88.52,552.3,0.18540,0.13660,0.10100


Unnamed: 0,radius_mean,perimeter_mean,area_mean,compactness_mean,concavity_mean,concave points_mean,radius_se,perimeter_se,area_se,radius_worst,perimeter_worst,area_worst,compactness_worst,concavity_worst,concave points_worst
429,12.720,80.98,501.3,0.04522,0.014020,0.018350,0.2954,2.109,23.240,13.820,88.87,586.8,0.09605,0.03469,0.03612
490,12.250,78.18,466.5,0.05200,0.017140,0.012610,0.2239,1.577,18.040,14.170,92.74,622.9,0.18040,0.12300,0.06335
371,15.190,97.65,711.8,0.06934,0.033930,0.026570,0.1783,1.338,17.720,16.200,104.50,819.1,0.17370,0.13620,0.08178
181,21.090,142.70,1311.0,0.28320,0.248700,0.149600,0.6298,4.414,81.460,26.680,176.50,2089.0,0.75840,0.67800,0.29030
537,11.690,76.37,406.4,0.15520,0.045150,0.045310,0.2957,2.158,20.950,12.980,86.12,487.7,0.32510,0.13950,0.13080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308,13.500,85.69,566.2,0.03614,0.002758,0.004419,0.2244,1.509,20.390,14.970,95.48,698.7,0.05836,0.01379,0.02210
427,10.800,68.79,359.9,0.05743,0.036140,0.014040,0.3077,2.240,20.200,12.760,83.69,489.5,0.16960,0.19270,0.07485
46,8.196,51.71,201.9,0.05943,0.015880,0.005917,0.1563,1.094,8.205,8.964,57.26,242.2,0.13570,0.06880,0.02564
8,13.000,87.50,519.8,0.19320,0.185900,0.093530,0.3063,2.406,24.320,15.490,106.20,739.3,0.54010,0.53900,0.20600


476    0
160    0
255    1
344    0
200    0
      ..
425    0
292    0
38     1
530    0
361    0
Name: diagnosis, Length: 455, dtype: int64

429    0
490    0
371    0
181    1
537    0
      ..
308    0
427    0
46     0
8      1
23     1
Name: diagnosis, Length: 114, dtype: int64