<a href="https://colab.research.google.com/github/Theophilus-Baidoo/Data--Driven-Survival-Modeling-for-Breast-Cancer/blob/main/Breast_ML_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import sys
import warnings

# Silence every warning, but only when no warning options were supplied to the
# interpreter (sys.warnoptions is empty), so explicit settings still win.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

######## Import all necessary functions for machine learning ########
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, mutual_info_classif, mutual_info_regression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTEN, SVMSMOTE, BorderlineSMOTE, KMeansSMOTE, ADASYN
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Perceptron, SGDClassifier, SGDRegressor
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, AdaBoostRegressor, GradientBoostingRegressor, StackingClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, classification_report, silhouette_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
!pip install dask[dataframe]

######### Import all necessary functions for Neural Network ##########
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, ReLU, LeakyReLU, PReLU, ELU, BatchNormalization, Dropout
from tensorflow.keras.activations import relu, sigmoid, softmax, swish
from tensorflow.keras.initializers import HeNormal, HeUniform, GlorotNormal, GlorotUniform
from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy, SparseCategoricalCrossentropy, MSE, MAE, Huber
from tensorflow.keras.optimizers import SGD, Adagrad, Adadelta, RMSprop, Adam, Nadam, Adamax
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import L1, L2, L1L2

In [None]:
from sklearn import metrics
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; rows are labelled as true classes and columns as
        predicted classes on the plot.
    classes : sequence
        Tick labels, one per class, used on both axes.
    normalize : bool, default False
        When True every row is divided by its row sum before plotting, so
        both the colours and the printed numbers show per-row fractions.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.

    Returns
    -------
    None
        The function only draws on the current matplotlib figure and prints
        which variant (normalized or not) was rendered.
    """
    cm = np.asarray(cm)

    # Normalize BEFORE drawing so the heat map and the cell annotations are
    # based on the same numbers.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows without any sample stay at 0 instead of turning into NaN.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is; fractional values (normalized or
    # averaged matrices) are rounded to two decimals to stay readable.
    cell_format = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
import random

# Seed both global generators. The stdlib seed alone does not make the
# notebook repeatable: scikit-learn falls back to NumPy's global RNG whenever
# an estimator, splitter or search is created with random_state=None.
random.seed(42)
np.random.seed(42)


In [None]:
# Location of the SEER breast-cancer CSV under /content/drive (the file name
# really contains a space before ".csv").
DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/SEER Breast Cancer Dataset .csv"

df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Give the columns short, whitespace-free names (two source headers carry a
# trailing space). All renames are applied in a single pass.
short_names = {
    'Race ': 'Race',
    'T Stage ': 'TStage',
    'N Stage': 'NStage',
    'Marital Status': 'Marital',
    '6th Stage': '6thStage',
    'A Stage': 'AStage',
    'Estrogen Status': 'EStatus',
    'Progesterone Status': 'PStatus',
}
df = df.rename(columns=short_names)

In [None]:
# Shorten the verbose category labels of three columns.
# The result is assigned back to the column on purpose: calling
# replace(..., inplace=True) on df[col] is chained assignment on a selected
# column, which pandas deprecates and which no longer modifies df once
# Copy-on-Write is active.
label_shortcuts = {
    "Race": {
        "Other (American Indian/AK Native, Asian/Pacific Islander)": "Other",
    },
    "Grade": {
        "Well differentiated; Grade I": "Grade I",
        "Moderately differentiated; Grade II": "Grade II",
        "Poorly differentiated; Grade III": "Grade III",
        "Undifferentiated; anaplastic; Grade IV": "Grade IV",
    },
    "Marital": {
        "Single (never married)": "Single",
        "Married (including common law)": "Married",
    },
}
for column, mapping in label_shortcuts.items():
    df[column] = df[column].replace(mapping)

In [None]:
# Map the remaining string categories to integer codes.
# Values missing from a mapping are left untouched, exactly as with the
# original list-based replace calls.
ordinal_codes = {
    'Status': {'Alive': 0, 'Dead': 1},
    'TStage': {'T1': 0, 'T2': 1, 'T3': 2, 'T4': 3},
    'NStage': {'N1': 0, 'N2': 1, 'N3': 2},
    'EStatus': {'Positive': 1, 'Negative': 0},
    'PStatus': {'Positive': 1, 'Negative': 0},
    'AStage': {'Regional': 0, 'Distant': 1},
    'Grade': {'Grade I': 0, 'Grade II': 1, 'Grade III': 2, 'Grade IV': 3},
}
for column, codes in ordinal_codes.items():
    # Assign back instead of using inplace=True on df[column] (chained
    # assignment stops updating df under pandas Copy-on-Write).
    # infer_objects() makes the object -> integer conversion explicit rather
    # than relying on the deprecated implicit downcast inside replace().
    df[column] = df[column].replace(codes).infer_objects()


In [None]:
# Convert categorical variables into numerical ones using one-hot encoding
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the nominal columns (the categories have no natural order).
encoder = OneHotEncoder(handle_unknown='ignore')
cat_cols = ['Race','Marital']
encoded_cols = encoder.fit_transform(df[cat_cols])
# fit_transform returns a sparse matrix here, hence toarray(). Reusing
# df.index keeps the encoded rows aligned with df when both frames are later
# concatenated column-wise, even if df does not carry a default RangeIndex.
encoded_cols_df = pd.DataFrame(
    encoded_cols.toarray(),
    columns=encoder.get_feature_names_out(cat_cols),
    index=df.index,
)

In [None]:
# Drop "6thStage" and "Survival Months" together with the raw categorical
# columns, then append the one-hot encoded columns side by side.
dropped_columns = ["6thStage", "Survival Months", *cat_cols]
df = pd.concat([df.drop(columns=dropped_columns), encoded_cols_df], axis=1)

In [None]:
# Separate target and features: pop() removes 'Status' from df in place and
# returns it, so df keeps only feature columns (its column names are reused
# later as feature labels) and X is the matching NumPy array.
y = df.pop('Status')
X = df.to_numpy()

In [None]:
!pip install imblearn
from imblearn.over_sampling import SMOTE
# Default-configured SMOTE instance; `smote` is re-created with a fixed
# random_state further down before it is actually used for resampling.
smote = SMOTE()

In [None]:
#pip install -U scikit-learn


In [None]:
# Import necessary libraries
from scipy.stats import randint
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from collections import Counter
from imblearn.over_sampling import SMOTE
import numpy as np

# Fixed seed shared by the train/test split and SMOTE so this cell is repeatable.
random_state = 20

# Stratified 80/20 hold-out split (stratify=y keeps the class ratio in both parts).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=random_state
)

print("Total number of patients:", X.shape[0])
print("Total number of patients in training set:", X_train.shape[0])
print("Total number of patients in test set:", X_test.shape[0])

# Resample the training part only; the test set is left untouched so the
# evaluation reflects the original class balance.
smote = SMOTE(random_state=random_state)
X_train_smote, y_train_smote = smote.fit_resample(X_train.astype('float'), y_train)

# Class counts before/after resampling and in the untouched test set.
print('Before SMOTE:', Counter(y_train))
print('After SMOTE:', Counter(y_train_smote))
# The value printed here is a class count, not a shape, so label it as such.
print('Test set class distribution:', Counter(y_test.values.ravel()))


In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


In [None]:
# Tuned RBF-SVM, repeated ten times (seed = repetition index); the hold-out
# metrics of every repetition are collected and summarised below.
# NOTE(review): the search runs on the SMOTE-resampled training set, so the CV
# folds contain synthetic samples and the CV scores are likely optimistic;
# an imblearn Pipeline (SMOTE + SVC) inside the search would avoid that.
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
auc_scores = []
confusion_matrices = []

for i in range(10):
    # SVM with a different random_state in each iteration
    svm = SVC(kernel='rbf', probability=True, random_state=i)
    param_grid = {'C': [0.1, 1, 10],
                  'gamma': [1, 0.1, 0.01, 0.001, 0.0001]}
    grid_search = GridSearchCV(estimator=svm,
                               param_grid=param_grid,
                               scoring='accuracy', n_jobs=-1, cv=5, verbose=2)
    grid_search.fit(X_train_smote, y_train_smote.values.ravel())

    # refit=True is the GridSearchCV default, so best_estimator_ has already
    # been fitted on the full training data with the best parameters; fitting
    # it again would only repeat the (expensive, probability=True) training.
    best_svc = grid_search.best_estimator_

    # Hard labels for the threshold metrics, column 1 of predict_proba for AUC
    ypred1 = best_svc.predict(X_test)
    y_pred_proba = best_svc.predict_proba(X_test)[:, 1]

    # Calculate evaluation metrics and append to the respective lists
    accuracy_scores.append(accuracy_score(y_test, ypred1))
    precision_scores.append(precision_score(y_test, ypred1))
    recall_scores.append(recall_score(y_test, ypred1))
    f1_scores.append(f1_score(y_test, ypred1))
    auc_scores.append(roc_auc_score(y_test, y_pred_proba))
    confusion_matrices.append(confusion_matrix(y_test, ypred1))

# Mean and (population, ddof=0) standard deviation of each metric
mean_accuracy = np.mean(accuracy_scores)
std_accuracy = np.std(accuracy_scores)
mean_precision = np.mean(precision_scores)
std_precision = np.std(precision_scores)
mean_recall = np.mean(recall_scores)
std_recall = np.std(recall_scores)
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
mean_auc = np.mean(auc_scores)
std_auc = np.std(auc_scores)

# Element-wise average of the ten confusion matrices
average_confusion_matrix = np.mean(confusion_matrices, axis=0)

# Last expression of the cell: displayed as the cell output
(mean_accuracy, std_accuracy, mean_precision, std_precision, mean_recall, std_recall, mean_f1, std_f1, mean_auc, std_auc, average_confusion_matrix)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve, auc

In [None]:
# Tuned random forest, repeated ten times (seed = repetition index); the
# hold-out metrics of every repetition are collected and summarised below.
accuracy_scores1 = []
precision_scores1 = []
recall_scores1 = []
f1_scores1 = []
auc_scores1= []
confusion_matrices1 = []

for i in range(10):
    rn_forest = RandomForestClassifier(n_jobs=-1, random_state=i)
    # Candidate values / distributions sampled by the randomized search
    param_grid = {
        "max_depth": range(1, 10),
        "min_samples_leaf": randint(25, 50),
        "min_samples_split": range(50, 100, 2),
        "n_estimators": range(1000, 8000, 500),
        "bootstrap": [True]
    }
    # Seed the search as well: without random_state the sampled candidates
    # (and therefore every reported metric) change from run to run.
    forest_grid = RandomizedSearchCV(rn_forest, param_grid, cv=5, random_state=i)
    forest_grid.fit(X_train_smote, y_train_smote.values.ravel())

    # refit=True is the default, so best_estimator_ is already trained on the
    # full training data; a second fit would only repeat that work.
    best_forest = forest_grid.best_estimator_

    # Hard labels for the threshold metrics, column 1 of predict_proba for AUC
    ypred3 = best_forest.predict(X_test)
    y_pred_proba1 = best_forest.predict_proba(X_test)[:, 1]
    accuracy_scores1.append(accuracy_score(y_test, ypred3))
    precision_scores1.append(precision_score(y_test, ypred3))
    recall_scores1.append(recall_score(y_test, ypred3))
    f1_scores1.append(f1_score(y_test, ypred3))
    auc_scores1.append(roc_auc_score(y_test, y_pred_proba1))
    confusion_matrices1.append(confusion_matrix(y_test, ypred3))

# Mean and (population, ddof=0) standard deviation of each metric
mean_accuracy1 = np.mean(accuracy_scores1)
std_accuracy1 = np.std(accuracy_scores1)
mean_precision1 = np.mean(precision_scores1)
std_precision1 = np.std(precision_scores1)
mean_recall1 = np.mean(recall_scores1)
std_recall1 = np.std(recall_scores1)
mean_f11 = np.mean(f1_scores1)
std_f11 = np.std(f1_scores1)
mean_auc1 = np.mean(auc_scores1)
std_auc1 = np.std(auc_scores1)
# Element-wise average of the ten confusion matrices
average_confusion_matrix1 = np.mean(confusion_matrices1, axis=0)

# Last expression of the cell: displayed as the cell output
(mean_accuracy1, std_accuracy1, mean_precision1, std_precision1, mean_recall1, std_recall1, mean_f11, std_f11, mean_auc1, std_auc1, average_confusion_matrix1)


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Tuned XGBoost classifier, repeated ten times (seed = repetition index); the
# hold-out metrics of every repetition are collected and summarised below.
accuracy_scores2 = []
precision_scores2 = []
recall_scores2 = []
f1_scores2 = []
auc_scores2 = []
confusion_matrices2 = []

for i in range(10):
    # XGBoost with a different random_state in each iteration
    xgb = XGBClassifier(n_jobs=-1, random_state=i)
    # Candidate values sampled by the randomized search
    param_grid = {
        'max_depth': range(4, 40),
        'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3, 0.6, 0.9, 0.95, 0.99],
        'n_estimators': range(100, 1100, 100),
        'min_child_weight': range(1, 11),
        'subsample': np.arange(0.1, 1.1, 0.1),
        'colsample_bytree': np.arange(0.1, 1.1, 0.1)
    }
    # Seed the search as well: without random_state the sampled candidates
    # (and therefore every reported metric) change from run to run.
    xgb_grid = RandomizedSearchCV(xgb, param_grid, cv=5, random_state=i)
    xgb_grid.fit(X_train_smote, y_train_smote.values.ravel())

    # refit=True is the default, so best_estimator_ is already trained on the
    # full training data; a second fit would only repeat that work.
    best_xgb = xgb_grid.best_estimator_

    # Hard labels for the threshold metrics, column 1 of predict_proba for AUC
    ypred5 = best_xgb.predict(X_test)
    y_pred_proba2 = best_xgb.predict_proba(X_test)[:, 1]

    # Calculate evaluation metrics and append to the respective lists
    accuracy_scores2.append(accuracy_score(y_test, ypred5))
    precision_scores2.append(precision_score(y_test, ypred5))
    recall_scores2.append(recall_score(y_test, ypred5))
    f1_scores2.append(f1_score(y_test, ypred5))
    auc_scores2.append(roc_auc_score(y_test, y_pred_proba2))
    confusion_matrices2.append(confusion_matrix(y_test, ypred5))

# Mean and (population, ddof=0) standard deviation of each metric
mean_accuracy2 = np.mean(accuracy_scores2)
std_accuracy2 = np.std(accuracy_scores2)
mean_precision2 = np.mean(precision_scores2)
std_precision2 = np.std(precision_scores2)
mean_recall2 = np.mean(recall_scores2)
std_recall2 = np.std(recall_scores2)
mean_f12 = np.mean(f1_scores2)
std_f12 = np.std(f1_scores2)
mean_auc2 = np.mean(auc_scores2)
std_auc2 = np.std(auc_scores2)

# Element-wise average of the ten confusion matrices
average_confusion_matrix2 = np.mean(confusion_matrices2, axis=0)

# Last expression of the cell: displayed as the cell output
(mean_accuracy2, std_accuracy2, mean_precision2, std_precision2, mean_recall2, std_recall2, mean_f12, std_f12, mean_auc2, std_auc2, average_confusion_matrix2)


In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Tuned LightGBM classifier, repeated ten times (seed = repetition index); the
# hold-out metrics of every repetition are collected and summarised below.
accuracy_scores3 = []
precision_scores3 = []
recall_scores3 = []
f1_scores3 = []
auc_scores3 = []
confusion_matrices3 = []

for i in range(10):
    # LGBM with a different random_state in each iteration.
    # subsample_freq=1 turns row bagging on: with LightGBM's default
    # subsample_freq=0 the 'subsample' values searched below are ignored.
    lgb = LGBMClassifier(objective='binary', n_jobs=-1, random_state=i,
                         subsample_freq=1)
    # Candidate values sampled by the randomized search
    param_grid = {
        'max_depth': range(4, 40),
        'num_leaves': range(20, 100),
        'learning_rate': np.arange(0.1, 1.0, 0.1),
        'n_estimators': range(100, 1100),
        'min_child_samples': range(1, 72),
        'subsample': np.arange(0.1, 1.1, 0.1),
        'colsample_bytree': np.arange(0.1, 1.1, 0.1)
    }
    # Seed the search as well: without random_state the sampled candidates
    # (and therefore every reported metric) change from run to run.
    lgb_grid = RandomizedSearchCV(lgb, param_grid, cv=5, random_state=i)
    lgb_grid.fit(X_train_smote, y_train_smote.values.ravel())

    # refit=True is the default, so best_estimator_ is already trained on the
    # full training data; a second fit would only repeat that work.
    best_lgb = lgb_grid.best_estimator_

    # Hard labels for the threshold metrics, column 1 of predict_proba for AUC
    ypred6 = best_lgb.predict(X_test)
    y_pred_proba3 = best_lgb.predict_proba(X_test)[:, 1]

    # Calculate evaluation metrics and append to the respective lists
    accuracy_scores3.append(accuracy_score(y_test, ypred6))
    precision_scores3.append(precision_score(y_test, ypred6))
    recall_scores3.append(recall_score(y_test, ypred6))
    f1_scores3.append(f1_score(y_test, ypred6))
    auc_scores3.append(roc_auc_score(y_test, y_pred_proba3))
    confusion_matrices3.append(confusion_matrix(y_test, ypred6))

# Mean and (population, ddof=0) standard deviation of each metric
mean_accuracy3 = np.mean(accuracy_scores3)
std_accuracy3 = np.std(accuracy_scores3)
mean_precision3 = np.mean(precision_scores3)
std_precision3 = np.std(precision_scores3)
mean_recall3 = np.mean(recall_scores3)
std_recall3 = np.std(recall_scores3)
mean_f13 = np.mean(f1_scores3)
std_f13 = np.std(f1_scores3)
mean_auc3 = np.mean(auc_scores3)
std_auc3 = np.std(auc_scores3)

# Element-wise average of the ten confusion matrices
average_confusion_matrix3 = np.mean(confusion_matrices3, axis=0)

# Last expression of the cell: displayed as the cell output
(mean_accuracy3, std_accuracy3, mean_precision3, std_precision3, mean_recall3, std_recall3, mean_f13, std_f13, mean_auc3, std_auc3, average_confusion_matrix3)


In [None]:
# ROC curves for SVM, Random Forest, XGBoost and LightGBM.
# Each curve is drawn from the model fitted in the LAST repetition of its
# experiment loop, while the legend shows the mean AUC over all repetitions.

def _roc_summary(model):
    """Return (column-1 predict_proba scores, fpr, tpr, AUC) of *model* on the test split."""
    scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, scores)
    return scores, fpr, tpr, roc_auc_score(y_test, scores)


y_pred_proba_svm, fpr_svm, tpr_svm, auc_score_svm = _roc_summary(best_svc)
y_pred_proba_rf, fpr_rf, tpr_rf, auc_score_rf = _roc_summary(best_forest)
y_pred_proba_xgb, fpr_xgb, tpr_xgb, auc_score_xgb = _roc_summary(best_xgb)
y_pred_proba_lgb, fpr_lgb, tpr_lgb, auc_score_lgb = _roc_summary(best_lgb)

# One entry per curve: x values, y values, legend text, line colour.
roc_curves = (
    (fpr_svm, tpr_svm, f'SVM (Mean AUC = {mean_auc:.2f})', 'blue'),
    (fpr_rf, tpr_rf, f'RF (Mean AUC = {mean_auc1:.2f})', 'green'),
    (fpr_xgb, tpr_xgb, f'XGB (Mean AUC = {mean_auc2:.2f})', 'red'),
    (fpr_lgb, tpr_lgb, f'LGB (Mean AUC = {mean_auc3:.2f})', 'yellow'),
)

plt.figure(figsize=(8, 6))
for fpr, tpr, legend_text, line_color in roc_curves:
    plt.plot(fpr, tpr, label=legend_text, color=line_color)
# Diagonal reference line (equal true- and false-positive rate).
plt.plot([0, 1], [0, 1], color='darkgrey', linestyle='--')

plt.title('ROC Curve for ML models')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()


In [None]:
!pip install shap
!pip install transformers
!pip install nlp

In [None]:
#best_forest

In [None]:
import shap

In [None]:
# Build a SHAP explainer around the random forest's predict function (class
# labels, not probabilities) and compute SHAP values for every test row.
# NOTE(review): presumably X_test, passed as the second argument, serves as the
# background/masker data for the explainer — confirm against the SHAP docs.
explainer = shap.Explainer(best_forest.predict,X_test)
shap_values = explainer(X_test)


In [None]:
#shap.plots.beeswarm(shap_values)

In [None]:
# Summary plot of the random-forest SHAP values. df holds only the feature
# columns at this point ('Status' was dropped earlier), so its column names
# are used as the feature labels.
shap.summary_plot(shap_values,feature_names=df.columns)

In [None]:

# Rebind shap_values to a new Explanation built from the raw value array plus
# the feature names only; no other field of the previous object is passed on.
shap_values = shap.Explanation(values=shap_values.values, feature_names=df.columns)


In [None]:
# Bar plot of the re-wrapped random-forest SHAP values.
# NOTE(review): presumably shows the mean absolute SHAP value per feature — confirm in the SHAP docs.
shap.plots.bar(shap_values)


In [None]:
# Same procedure for XGBoost: explain the model's predict function (class
# labels, not probabilities) and compute SHAP values for every test row.
# NOTE(review): presumably X_test, passed as the second argument, serves as the
# background/masker data for the explainer — confirm against the SHAP docs.
explainer1 = shap.Explainer(best_xgb.predict,X_test)
shap_values1 = explainer1(X_test)


In [None]:
# Summary plot of the XGBoost SHAP values, labelled with df's (feature-only)
# column names.
shap.summary_plot(shap_values1,feature_names=df.columns)