In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix , ConfusionMatrixDisplay , classification_report , accuracy_score ,precision_recall_curve , roc_curve ,roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.exceptions import FitFailedWarning
import warnings
from sklearn.neural_network import MLPClassifier
from sklearn.exceptions import ConvergenceWarning

# Silence the warning categories that the grid searches below trigger in bulk.
# Filters are registered in the same order as before.
for ignored_category in (FitFailedWarning, FutureWarning, ConvergenceWarning, UserWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

In [None]:
df = pd.read_csv('./data.csv')
# Features: every column from position 2 up to (but excluding) the last one.
# NOTE(review): presumably the last column is an empty trailing column — confirm.
vars = df.iloc[:, 2:-1]
# Binary target taken from column 1: 'M' -> 1, anything else (i.e. 'B') -> 0.
target = df.iloc[:, 1].eq('M').astype('int64')

In [None]:
# Standardize every feature column, keeping the original column names.
# NOTE(review): the scaler and the selector below are both fit on the full
# dataset, i.e. before the train/test split — consider fitting on the
# training rows only to avoid leaking test information.
X = pd.DataFrame(StandardScaler().fit_transform(vars), columns=vars.columns)
y = target

# Rank features with the ANOVA F-test (f_classif) and keep the 15 best.
# The selector is fit on the unscaled `vars`.
selector = SelectKBest(score_func=f_classif, k=15).fit(vars, target)
scores = selector.scores_
p_values = selector.pvalues_

# Per-feature F-score and p-value, highest score first.
results = pd.DataFrame(
    {'Feature': vars.columns, 'Score': scores, 'p-value': p_values}
).sort_values(by='Score', ascending=False)

# Names of the columns the selector kept.
top_features = vars.columns[selector.get_support()]
print(top_features)
print(results)

In [None]:
# Restrict the standardized feature matrix to the selected columns and show it.
X = X[top_features]
X

In [None]:
# Project the selected, standardized features onto the first two principal
# components so the two diagnosis classes can be inspected in a 2-D scatter.
draw = X.loc[:, top_features]
# Two components are required: the scatter plot below needs an x and a y axis.
pca = PCA(n_components=2)
pca.fit(draw)
T = pd.DataFrame(pca.transform(draw),
                 columns=['PCA component 1', 'PCA component 2'])

# y holds 1 for malignant ('M') and 0 for benign ('B'); colour points accordingly.
point_colors = ['red' if label == 1 else 'green' for label in y]

# plot the data
T.plot.scatter(x='PCA component 1', y='PCA component 2',
               marker='o', alpha=0.7, color=point_colors,
               title="red: malignant, green: benign")
plt.show()

In [None]:
# Stratified 80/20 train/test split: both parts keep the class balance of y.
# A fixed random_state makes the split, and every metric computed from it,
# reproducible across kernel restarts.
train_features, test_features, train_targets, test_targets = train_test_split(
    X, y, train_size=0.8, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Grid search over logistic-regression settings with 3-fold cross-validation.
#
# The grid is split into sub-grids so each solver is paired only with the
# penalties it supports. A single Cartesian grid produces many combinations
# that can never fit (e.g. 'lbfgs' with 'l1'), which merely show up as failed
# fits. 'elasticnet' is omitted: it needs an l1_ratio, which this search never
# supplies, so every such candidate would fail.
# Every sub-grid defines the same four keys, so clf.best_params_ always has
# 'penalty', 'C', 'solver' and 'max_iter'.
c_values = np.logspace(-4, 4, 20)
max_iters = [1000, 10000, 25000, 50000]
hyperparameters = [
    # L1 regularization is only implemented by liblinear and saga.
    {'penalty': ['l1'],
     'C': c_values,
     'solver': ['liblinear', 'saga'],
     'max_iter': max_iters},
    # L2 regularization is supported by every solver in the search.
    {'penalty': ['l2'],
     'C': c_values,
     'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
     'max_iter': max_iters},
    # Unpenalized model: C is ignored, so a single placeholder value is enough.
    # liblinear does not support an unpenalized fit.
    # NOTE(review): scikit-learn 1.2 deprecated the string 'none' in favour of
    # None and 1.4 removed it — switch to None once the installed version is
    # confirmed to be >= 1.2.
    {'penalty': ['none'],
     'C': [1.0],
     'solver': ['lbfgs', 'newton-cg', 'sag', 'saga'],
     'max_iter': max_iters},
]
# random_state fixes the shuffling done by the liblinear/sag/saga solvers;
# max_iter here is overridden by the values in the grid.
log_model = LogisticRegression(max_iter=10000, random_state=42)
clf = GridSearchCV(log_model, param_grid=hyperparameters, cv=3, n_jobs=-1)
clf.fit(train_features, train_targets)

In [None]:
# Winning configuration of the logistic-regression search: its parameters,
# the refitted estimator, its index in the candidate list and its mean CV score.
display(clf.best_params_, clf.best_estimator_, clf.best_index_, clf.best_score_)

In [None]:
# Rebuild a logistic regression from the four tuned settings and train it
# on the training split.
logreg_kwargs = {
    setting: clf.best_params_[setting]
    for setting in ('max_iter', 'C', 'penalty', 'solver')
}
model_1 = LogisticRegression(**logreg_kwargs)
model_1.fit(train_features, train_targets)

In [None]:
# Predict class labels (1 = malignant, 0 = benign) for the held-out test rows.
prediction = model_1.predict(test_features)

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(test_targets, prediction)

In [None]:
# Confusion matrix for the logistic-regression predictions.
# labels=[0, 1] pins the layout to 2x2 with benign (0) first, malignant (1) second.
CM = confusion_matrix(test_targets, prediction, labels=[0, 1])
# scikit-learn lays a binary confusion matrix out as [[tn, fp], [fn, tp]],
# so flattening it yields tn, fp, fn, tp — in that order.
tn, fp, fn, tp = np.ravel(CM)
# Precision, recall and F1 for the positive (malignant) class.
p = tp / (tp + fp)
r = tp / (tp + fn)
f1 = (2 * p * r) / (p + r)

In [None]:
# Draw the logistic-regression confusion matrix as a blue heat map.
cm_display = ConfusionMatrixDisplay(CM)
cm_display.plot(cmap='Blues')

In [None]:
# Grid search over single-hidden-layer MLP settings with 5-fold cross-validation.
hyperparameters = {
    'hidden_layer_sizes': [(25,), (50,), (100,), (150,), (200,)],
    'activation': ['logistic', 'relu'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001, 0.01],
    # learning_rate is only used by the 'sgd' solver; with 'adam' it has no
    # effect, so sweeping it would just double the number of fits. The key is
    # kept so clf2.best_params_['learning_rate'] remains available.
    'learning_rate': ['constant'],
    'max_iter': [100, 200, 300, 400, 500]
}
# random_state fixes weight initialization and batch shuffling so the search
# gives the same scores on every run.
nn_model = MLPClassifier(random_state=42)
# n_jobs=-1 runs the fits in parallel, matching the logistic-regression search.
clf2 = GridSearchCV(nn_model, param_grid=hyperparameters, cv=5, n_jobs=-1)
clf2.fit(train_features, train_targets)



In [None]:
# Hyperparameter combination with the best mean cross-validated score in the MLP search.
clf2.best_params_

In [None]:
# Rebuild an MLP from the six tuned settings and train it on the training split.
# Note: this rebinds model_1, which previously held the logistic regression.
mlp_kwargs = {
    setting: clf2.best_params_[setting]
    for setting in ('activation', 'alpha', 'hidden_layer_sizes',
                    'learning_rate', 'max_iter', 'solver')
}
model_1 = MLPClassifier(**mlp_kwargs)
model_1.fit(train_features, train_targets)

In [None]:
# Label the held-out rows with the tuned MLP, then report the share it got right.
prediction1 = model_1.predict(X=test_features)
accuracy_score(y_true=test_targets, y_pred=prediction1)

In [None]:
# Confusion matrix for the MLP predictions.
# labels=[0, 1] pins the layout to 2x2 with benign (0) first, malignant (1) second.
CM = confusion_matrix(test_targets, prediction1, labels=[0, 1])
# scikit-learn lays a binary confusion matrix out as [[tn, fp], [fn, tp]],
# so flattening it yields tn, fp, fn, tp — in that order.
tn, fp, fn, tp = np.ravel(CM)
# Precision, recall and F1 for the positive (malignant) class.
p = tp / (tp + fp)
r = tp / (tp + fn)
f1 = (2 * p * r) / (p + r)
display([tp, fp, fn, tn, p, r, f1])

In [None]:
# Draw the MLP confusion matrix as a blue heat map.
cm_display = ConfusionMatrixDisplay(CM)
cm_display.plot(cmap='Blues')