In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

# Load the dataset
bc_prep = pd.read_csv('bc_prep.csv')



In [3]:
import pandas as pd

# Load the dataset
try:
    bc_prep = pd.read_csv('bc_prep.csv')
except FileNotFoundError:
    print("The file 'bc_prep.csv' was not found. Please check the file path and try again.")

# Display the first few rows of the dataset
print(bc_prep.head())

  Patient.ID  Age.at.Diagnosis  Type.of.Breast.Surgery  Cellularity  \
0    MB-0002             43.19                       0            0   
1    MB-0005             48.87                       1            0   
2    MB-0006             47.68                       1            1   
3    MB-0008             76.97                       1            0   
4    MB-0010             78.77                       1            1   

   Chemotherapy  Cohort  ER.Status  Neoplasm.Histologic.Grade  HER2.Status  \
0             0       1          0                          3            0   
1             1       1          0                          2            0   
2             1       1          0                          2            0   
3             1       1          0                          3            0   
4             0       1          0                          3            0   

   Hormone.Therapy  Inferred.Menopausal.State  Primary.Tumor.Laterality  \
0                0           

In [3]:

# Drop the Patient.ID column
bc_data = bc_prep.drop(columns=['Patient.ID'])

# Display the first few rows of the modified dataset
print(bc_data.head())

   Age.at.Diagnosis  Type.of.Breast.Surgery  Cellularity  Chemotherapy  \
0             43.19                       0            0             0   
1             48.87                       1            0             1   
2             47.68                       1            1             1   
3             76.97                       1            0             1   
4             78.77                       1            1             0   

   Cohort  ER.Status  Neoplasm.Histologic.Grade  HER2.Status  Hormone.Therapy  \
0       1          0                          3            0                0   
1       1          0                          2            0                0   
2       1          0                          2            0                0   
3       1          0                          3            0                0   
4       1          0                          3            0                0   

   Inferred.Menopausal.State  Primary.Tumor.Laterality  \
0         

In [5]:
# Prepare the data
X = bc_data.drop(columns=['Overall.Survival.Status'])
y = bc_data['Overall.Survival.Status'].apply(lambda x: 1 if x == 'Deceased' else 0)

In [7]:

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

In [9]:
# Preprocessing pipeline for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

In [11]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
# Initialize models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42)
}

In [15]:
# Fit the preprocessor to the training data to ensure it is fitted before accessing feature names
preprocessor.fit(X)

In [17]:
print(X.head())

   Age.at.Diagnosis  Type.of.Breast.Surgery  Cellularity  Chemotherapy  \
0             43.19                       0            0             0   
1             48.87                       1            0             1   
2             47.68                       1            1             1   
3             76.97                       1            0             1   
4             78.77                       1            1             0   

   Cohort  ER.Status  Neoplasm.Histologic.Grade  HER2.Status  Hormone.Therapy  \
0       1          0                          3            0                0   
1       1          0                          2            0                0   
2       1          0                          2            0                0   
3       1          0                          3            0                0   
4       1          0                          3            0                0   

   Inferred.Menopausal.State  Primary.Tumor.Laterality  \
0         

In [19]:
# Train and evaluate models
for name, model in models.items():
    print(f"Training {name}...")
    # Create a pipeline that first preprocesses the data and then fits the model
    clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

Training Random Forest...
Training Decision Tree...
Training SVM...
Training Logistic Regression...


In [21]:
# Random Forest
print("Training Random Forest...")
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', RandomForestClassifier(random_state=42))])
rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)

# Metrics
print("Random Forest Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_rf):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_rf):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred_rf):.4f}")
print(classification_report(y_test, y_pred_rf))


Training Random Forest...
Random Forest Performance:
Accuracy: 0.6800
Recall: 0.6063
Precision: 0.6260
F1 Score: 0.6160
              precision    recall  f1-score   support

           0       0.72      0.73      0.73       173
           1       0.63      0.61      0.62       127

    accuracy                           0.68       300
   macro avg       0.67      0.67      0.67       300
weighted avg       0.68      0.68      0.68       300



In [25]:

# determine the mutual information for RF
mutual_info_rf = mutual_info_classif(X_train, y_train)
mutual_info_rf

array([0.05293665, 0.03484691, 0.        , 0.        , 0.05196546,
       0.00268197, 0.        , 0.        , 0.0104609 , 0.        ,
       0.01122722, 0.05650068, 0.01610718, 0.        , 0.03899484,
       0.02777913])

In [27]:
sel_five_cols_rf = SelectKBest(mutual_info_classif, k=5)
sel_five_cols_rf.fit(X_train, y_train)
X_train.columns[sel_five_cols_rf.get_support()]

Index(['Age.at.Diagnosis', 'Type.of.Breast.Surgery', 'Cellularity', 'Cohort',
       'ER.Status'],
      dtype='object')

In [29]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree
print("Training Decision Tree...")
dt_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', DecisionTreeClassifier(random_state=42))])
dt_pipeline.fit(X_train, y_train)
y_pred_dt = dt_pipeline.predict(X_test)

# Metrics
print("Decision Tree Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_dt):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_dt):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_dt):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred_dt):.4f}")
print(classification_report(y_test, y_pred_dt))



Training Decision Tree...
Decision Tree Performance:
Accuracy: 0.5967
Recall: 0.5906
Precision: 0.5208
F1 Score: 0.5535
              precision    recall  f1-score   support

           0       0.67      0.60      0.63       173
           1       0.52      0.59      0.55       127

    accuracy                           0.60       300
   macro avg       0.59      0.60      0.59       300
weighted avg       0.60      0.60      0.60       300



In [31]:

# determine the mutual information
mutual_info_dt = mutual_info_classif(X_train, y_train)
mutual_info_dt

array([0.05120242, 0.03137275, 0.01230359, 0.        , 0.02168435,
       0.00891689, 0.01182559, 0.02657381, 0.04037003, 0.        ,
       0.00668397, 0.04621815, 0.        , 0.        , 0.01789627,
       0.03816389])

In [33]:
sel_five_cols_dt = SelectKBest(mutual_info_classif, k=5)
sel_five_cols_dt.fit(X_train, y_train)
X_train.columns[sel_five_cols_dt.get_support()]

Index(['Age.at.Diagnosis', 'Chemotherapy', 'HER2.Status',
       'Lymph.nodes.examined.positive', 'Tumor.Size'],
      dtype='object')

In [35]:
from sklearn.svm import SVC

# SVM
print("Training SVM...")
svm_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', SVC(probability=True, random_state=42))])
svm_pipeline.fit(X_train, y_train)
y_pred_svm = svm_pipeline.predict(X_test)

# Metrics
print("SVM Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_svm):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_svm):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_svm):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred_svm):.4f}")
print(classification_report(y_test, y_pred_svm))

Training SVM...
SVM Performance:
Accuracy: 0.7300
Recall: 0.6457
Precision: 0.6949
F1 Score: 0.6694
              precision    recall  f1-score   support

           0       0.75      0.79      0.77       173
           1       0.69      0.65      0.67       127

    accuracy                           0.73       300
   macro avg       0.72      0.72      0.72       300
weighted avg       0.73      0.73      0.73       300



In [37]:
mutual_info_svm = mutual_info_classif(X_train, y_train)
mutual_info_svm

array([0.05399272, 0.01303223, 0.        , 0.03103861, 0.01439686,
       0.01083784, 0.00566751, 0.01428193, 0.        , 0.00911075,
       0.01759638, 0.05622383, 0.00645215, 0.0053454 , 0.03927915,
       0.        ])

In [39]:
sel_five_cols_svm = SelectKBest(mutual_info_classif, k=5)
sel_five_cols_svm.fit(X_train, y_train)
X_train.columns[sel_five_cols_svm.get_support()]

Index(['Age.at.Diagnosis', 'Type.of.Breast.Surgery',
       'Lymph.nodes.examined.positive', 'Tumor.Size', 'Tumor.Stage'],
      dtype='object')

In [41]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression
print("Training Logistic Regression...")
lr_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', LogisticRegression(random_state=42))])
lr_pipeline.fit(X_train, y_train)
y_pred_lr = lr_pipeline.predict(X_test)

# Metrics
print("Logistic Regression Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_lr):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_lr):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred_lr):.4f}")
print(classification_report(y_test, y_pred_lr))

Training Logistic Regression...
Logistic Regression Performance:
Accuracy: 0.6967
Recall: 0.6378
Precision: 0.6429
F1 Score: 0.6403
              precision    recall  f1-score   support

           0       0.74      0.74      0.74       173
           1       0.64      0.64      0.64       127

    accuracy                           0.70       300
   macro avg       0.69      0.69      0.69       300
weighted avg       0.70      0.70      0.70       300



In [43]:
mutual_info_LR = mutual_info_classif(X_train, y_train)
mutual_info_LR

array([0.05247628, 0.02033533, 0.        , 0.02587   , 0.        ,
       0.02974689, 0.00417986, 0.        , 0.01692325, 0.        ,
       0.        , 0.04253536, 0.02648403, 0.01262752, 0.03435437,
       0.01867622])

In [45]:
sel_five_cols_lr = SelectKBest(mutual_info_classif, k=5)
sel_five_cols_lr.fit(X_train, y_train)
X_train.columns[sel_five_cols_lr.get_support()]

Index(['Age.at.Diagnosis', 'Type.of.Breast.Surgery', 'Cohort',
       'Inferred.Menopausal.State', 'Tumor.Stage'],
      dtype='object')

In [52]:
use other selection models

In [47]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Create a logistic regression model
lr_model = LogisticRegression(max_iter=1000)

# Use RFE to select the top k features
rfe_selector = RFE(estimator=lr_model, n_features_to_select=5, step=1)
rfe_selector = rfe_selector.fit(X_train, y_train)

# Get the selected feature names
selected_features_rfe_lr = X_train.columns[rfe_selector.support_]

# Print the selected features
print("Selected features by RFE (Logistic Regression):")
print(selected_features_rfe_lr)


Selected features by RFE (Logistic Regression):
Index(['Type.of.Breast.Surgery', 'Cohort', 'Neoplasm.Histologic.Grade',
       'HER2.Status', 'Tumor.Stage'],
      dtype='object')


In [49]:
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier model
rf_model = RandomForestClassifier()

# Use RFE to select the top k features
rfe_selector_rf = RFE(estimator=rf_model, n_features_to_select=5, step=1)
rfe_selector_rf = rfe_selector_rf.fit(X_train, y_train)

# Get the selected feature names
selected_features_rfe_rf = X_train.columns[rfe_selector_rf.support_]

# Print the selected features
print("Selected features by RFE (Random Forest):")
print(selected_features_rfe_rf)


Selected features by RFE (Random Forest):
Index(['Age.at.Diagnosis', 'Cellularity', 'Cohort',
       'Lymph.nodes.examined.positive', 'Tumor.Size'],
      dtype='object')


In [51]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier model
rf_model = RandomForestClassifier()

# Fit the model to get feature importances
rf_model.fit(X_train, y_train)

# Use SelectFromModel to select features based on importance
model_selector = SelectFromModel(rf_model, prefit=True, threshold="mean")
selected_features_model_rf = X_train.columns[model_selector.get_support()]

# Print the selected features
print("Selected features by model-based selection (Random Forest):")
print(selected_features_model_rf)


Selected features by model-based selection (Random Forest):
Index(['Age.at.Diagnosis', 'Cohort', 'Lymph.nodes.examined.positive',
       'Tumor.Size'],
      dtype='object')


In [53]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Create a logistic regression model
lr_model = LogisticRegression(max_iter=1000)

# Evaluate model performance using cross-validation
scores = cross_val_score(lr_model, X_train[selected_features_rfe_lr], y_train, cv=5, scoring='accuracy')

# Print the cross-validation scores
print("Cross-validation scores (Logistic Regression):")
print(scores)
print("Mean accuracy:", scores.mean())


Cross-validation scores (Logistic Regression):
[0.64285714 0.63571429 0.62857143 0.63571429 0.66428571]
Mean accuracy: 0.6414285714285713


In [55]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Create an SVM model with a linear kernel
svm_model = SVC(kernel='linear')

# Use RFE to select the top k features
rfe_selector_svm = RFE(estimator=svm_model, n_features_to_select=5, step=1)
rfe_selector_svm = rfe_selector_svm.fit(X_train, y_train)

# Get the selected feature names
selected_features_rfe_svm = X_train.columns[rfe_selector_svm.support_]

# Print the selected features
print("Selected features by RFE (SVM):")
print(selected_features_rfe_svm)

# Evaluate model performance using cross-validation with selected features
scores_svm = cross_val_score(svm_model, X_train[selected_features_rfe_svm], y_train, cv=5, scoring='accuracy')

# Print the cross-validation scores
print("Cross-validation scores (SVM):")
print(scores_svm)
print("Mean accuracy:", scores_svm.mean())


Selected features by RFE (SVM):
Index(['Type.of.Breast.Surgery', 'Chemotherapy', 'HER2.Status',
       'Inferred.Menopausal.State', 'Primary.Tumor.Laterality'],
      dtype='object')
Cross-validation scores (SVM):
[0.61428571 0.6        0.62142857 0.6        0.62142857]
Mean accuracy: 0.6114285714285714
