In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix , ConfusionMatrixDisplay , classification_report , accuracy_score ,precision_recall_curve , roc_curve ,roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

In [43]:
df = pd.read_csv('./data.csv')
vars = df.iloc[:, 2:-1]
# vectorize the target data. M = 1 and B = 0 diagnosis
target = df.iloc[:, 1].apply(lambda x: 1 if x == 'M' else 0)

In [50]:
X = vars
y = target
# Use ANOVA F-value as the scoring function for feature selection 
# f_classif is specify we using ANOVA
# k is the number of features we want to get
selector = SelectKBest(score_func=f_classif, k=15)
# Fit the selector to the data
selector.fit(vars, target)
# Get the scores and p-values of each feature
scores = selector.scores_
p_values = selector.pvalues_

results = pd.DataFrame({'Feature': vars.columns, 'Score': scores, 'p-value': p_values})
results.sort_values(by='Score', ascending=False, inplace=True)
# Select the top K features based on the scores
top_features = vars.columns[selector.get_support()]
# Display the top features
print(top_features)
print(results)

Index(['radius_mean', 'perimeter_mean', 'area_mean', 'compactness_mean',
       'concavity_mean', 'concave points_mean', 'radius_se', 'perimeter_se',
       'area_se', 'radius_worst', 'perimeter_worst', 'area_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst'],
      dtype='object')
                    Feature       Score        p-value
27     concave points_worst  964.385393  1.969100e-124
22          perimeter_worst  897.944219  5.771397e-119
7       concave points_mean  861.676020  7.101150e-116
20             radius_worst  860.781707  8.482292e-116
2            perimeter_mean  697.235272  8.436251e-101
23               area_worst  661.600206   2.828848e-97
0               radius_mean  646.981021   8.465941e-96
3                 area_mean  573.060747   4.734564e-88
6            concavity_mean  533.793126   9.966556e-84
26          concavity_worst  436.691939   2.464664e-72
5          compactness_mean  313.233079   3.938263e-56
25        compactness_worst  

In [30]:
X = vars.loc[:,top_features]
vars

Unnamed: 0,radius_mean,perimeter_mean,area_mean,compactness_mean,concavity_mean,concave points_mean,radius_se,perimeter_se,area_se,radius_worst,perimeter_worst,area_worst,compactness_worst,concavity_worst,concave points_worst
0,17.99,122.80,1001.0,0.27760,0.30010,0.14710,1.0950,8.589,153.40,25.380,184.60,2019.0,0.66560,0.7119,0.2654
1,20.57,132.90,1326.0,0.07864,0.08690,0.07017,0.5435,3.398,74.08,24.990,158.80,1956.0,0.18660,0.2416,0.1860
2,19.69,130.00,1203.0,0.15990,0.19740,0.12790,0.7456,4.585,94.03,23.570,152.50,1709.0,0.42450,0.4504,0.2430
3,11.42,77.58,386.1,0.28390,0.24140,0.10520,0.4956,3.445,27.23,14.910,98.87,567.7,0.86630,0.6869,0.2575
4,20.29,135.10,1297.0,0.13280,0.19800,0.10430,0.7572,5.438,94.44,22.540,152.20,1575.0,0.20500,0.4000,0.1625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,142.00,1479.0,0.11590,0.24390,0.13890,1.1760,7.673,158.70,25.450,166.10,2027.0,0.21130,0.4107,0.2216
565,20.13,131.20,1261.0,0.10340,0.14400,0.09791,0.7655,5.203,99.04,23.690,155.00,1731.0,0.19220,0.3215,0.1628
566,16.60,108.30,858.1,0.10230,0.09251,0.05302,0.4564,3.425,48.55,18.980,126.70,1124.0,0.30940,0.3403,0.1418
567,20.60,140.10,1265.0,0.27700,0.35140,0.15200,0.7260,5.772,86.22,25.740,184.60,1821.0,0.86810,0.9387,0.2650


In [31]:
# Stratify: split the training and testing by 80/20
train_features, test_features, train_targets, test_targets = train_test_split(vars, target, train_size=0.8,test_size=0.2,stratify = target)

In [32]:
display(train_features)
display(test_features)
display(train_targets)
display(test_targets)

Unnamed: 0,radius_mean,perimeter_mean,area_mean,compactness_mean,concavity_mean,concave points_mean,radius_se,perimeter_se,area_se,radius_worst,perimeter_worst,area_worst,compactness_worst,concavity_worst,concave points_worst
177,16.460,109.30,832.9,0.15560,0.17930,0.08866,0.3037,2.482,31.59,17.79,123.50,981.2,0.46670,0.58620,0.20350
169,14.970,96.22,685.9,0.07885,0.02602,0.03781,0.2713,1.893,24.28,16.11,104.60,793.7,0.16370,0.06648,0.08485
4,20.290,135.10,1297.0,0.13280,0.19800,0.10430,0.7572,5.438,94.44,22.54,152.20,1575.0,0.20500,0.40000,0.16250
504,9.268,61.49,248.7,0.22390,0.09730,0.05252,0.4076,3.014,20.04,10.28,69.05,300.2,0.34410,0.20990,0.10250
90,14.620,94.57,662.7,0.08606,0.03102,0.02957,0.3721,2.279,33.76,16.11,102.90,803.7,0.17660,0.09189,0.06946
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386,12.210,78.78,462.0,0.07823,0.06839,0.02534,0.2666,2.097,19.96,13.13,87.65,529.9,0.24310,0.30760,0.09140
490,12.250,78.18,466.5,0.05200,0.01714,0.01261,0.2239,1.577,18.04,14.17,92.74,622.9,0.18040,0.12300,0.06335
391,8.734,55.27,234.3,0.07428,0.00000,0.00000,0.5169,3.167,28.85,10.17,64.01,317.0,0.13100,0.00000,0.00000
482,13.470,87.32,546.3,0.11550,0.05786,0.05266,0.1588,1.102,12.84,14.83,94.94,660.2,0.24990,0.18480,0.13350


Unnamed: 0,radius_mean,perimeter_mean,area_mean,compactness_mean,concavity_mean,concave points_mean,radius_se,perimeter_se,area_se,radius_worst,perimeter_worst,area_worst,compactness_worst,concavity_worst,concave points_worst
2,19.69,130.00,1203.0,0.15990,0.19740,0.127900,0.7456,4.585,94.03,23.57,152.50,1709.0,0.42450,0.45040,0.24300
456,11.63,74.87,415.1,0.08574,0.07160,0.020170,0.3135,2.150,23.13,13.12,86.04,527.8,0.20310,0.29230,0.06835
433,18.82,123.70,1110.0,0.13890,0.15940,0.087440,0.8191,4.493,103.90,22.66,145.30,1603.0,0.34630,0.39120,0.17080
143,12.90,83.74,512.2,0.09509,0.04894,0.030880,0.2143,1.689,16.64,14.48,97.17,643.8,0.25480,0.20900,0.10120
95,20.26,132.40,1264.0,0.13130,0.14650,0.086830,0.7576,4.554,87.87,24.22,156.10,1750.0,0.35390,0.40980,0.15730
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,12.30,78.83,463.7,0.07253,0.03844,0.016540,0.2382,1.687,18.32,13.35,86.65,546.7,0.16500,0.14230,0.04815
494,13.16,84.06,538.7,0.05275,0.01800,0.012560,0.3237,2.326,26.07,14.50,95.29,648.3,0.16460,0.07698,0.04195
442,13.78,88.37,585.9,0.06718,0.01055,0.009937,0.3563,2.235,29.34,15.27,97.90,706.6,0.10710,0.03517,0.03312
449,21.10,138.10,1384.0,0.11750,0.15720,0.115500,0.6643,4.542,81.89,25.68,168.20,2022.0,0.31010,0.43990,0.22800


177    1
169    0
4      1
504    0
90     0
      ..
386    0
490    0
391    0
482    0
319    0
Name: diagnosis, Length: 455, dtype: int64

2      1
456    0
433    1
143    0
95     1
      ..
189    0
494    0
442    0
449    1
278    0
Name: diagnosis, Length: 114, dtype: int64

In [63]:
hyperparameters = [
    {'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
     'C' : np.logspace(-4, 4, 20),
     'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
     'max_iter' : [100, 1000, 2500, 5000]
     }
]
log_model = LogisticRegression()
clf = GridSearchCV(log_model, param_grid = hyperparameters, cv = 3, n_jobs=-1)
display(np.isnan(X).sum())
clf.fit(X, y)



radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

KeyboardInterrupt: 