# Experiment 1. Predicting risk of recurrence and risk of death

## Reading the dataset and removing columns

In [108]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

# Read dataset
duke = pd.read_excel(
    "../data/DUKE_edit.xlsx",
    skiprows = [0,2],
    usecols = "A:CT"
)

# Remove empty column and irrelevants id and position in a 3D image columns
duke = duke.drop(columns=['Unnamed: 67', 'Patient ID', 'Image Position of Patient'])

# Columns containing info from labels and correlated to them
label_cols = list(range(58,65))

# Columns containing info from images (MRI and US)
image_cols = list(range(17)) + list(range(46,51)) + list(range(66,72))

# Create DataFrames removing unnecessary columns
labels = duke.iloc[:, label_cols]
data = duke.drop(labels, axis=1)
#data = data_image.drop(data_image.columns[image_cols], axis=1)

# Print the shapes of each DataFrame
print(duke.shape, labels.shape, data.shape)
labels.head()

(922, 95) (922, 7) (922, 88)


Unnamed: 0,Recurrence event(s),Days to local recurrence,Days to distant recurrence,Days to death,Days to last local recurrence free assessment,Days to last distant recurrence free assessment,Days known alive / to death
0,0.0,NP,NP,NP,2940,2940,2940.0
1,0.0,NP,NP,NP,1649,1649,1649.0
2,0.0,NP,NP,NP,1697,1697,
3,0.0,NP,NP,NP,1990,1990,1990.0
4,0.0,NP,NP,NP,1845,1845,1845.0


In [105]:
# Print the columns' labels so that they can be easily used
print(data.columns)

Index(['Days to MRI', 'Manufacturer', 'Manufacturer Model Name',
       'Scan Options', 'Field Strength (Tesla)', 'Patient Position During MRI',
       'Contrast Agent', 'Contrast Bolus Volume (mL)', 'TR (Repetition Time)',
       'TE (Echo Time)', 'Acquisition Matrix', 'Slice Thickness ', 'Rows',
       'Columns', 'Reconstruction Diameter ', 'Flip Angle \n',
       'FOV Computed (Field of View) in cm ', 'Date of Birth (Days)',
       'Menopause (at diagnosis)', 'Race and Ethnicity',
       'Metastatic at Presentation (Outside of Lymph Nodes)', 'ER', 'PR',
       'HER2', 'Mol Subtype', 'Oncotype score', 'Staging(Tumor Size)# [T]',
       'Staging(Nodes)#(Nx replaced by -1)[N]',
       'Staging(Metastasis)#(Mx -replaced by -1)[M]', 'Tumor Grade (T)',
       'Tumor Grade (N)', 'Tumor Grade (M)', 'Nottingham grade',
       'Histologic type', 'Tumor Location', 'Position',
       'Bilateral breast cancer?', 'If Bilateral, Different Rec Status',
       'Bilateral side annotated', 'If bilater

## Preprocessing data

In [109]:
from sklearn.preprocessing import LabelEncoder

# Print types of values of a int column containing strings
print(data['If Bilateral, Different Rec Status'].value_counts(), end='\n\n')

# Replace all instances of 'NC' and 'NP' with NaN
data.replace('NC', np.nan, inplace=True)
data.replace('NP', np.nan, inplace=True)

# Print again
print(data['If Bilateral, Different Rec Status'].value_counts())

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Loop through columns that are of type 'object' (categorical columns)
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column])

If Bilateral, Different Rec Status
NP    623
NC    271
0      26
1       2
Name: count, dtype: int64

If Bilateral, Different Rec Status
0.0    26
1.0     2
Name: count, dtype: int64


In [110]:
# Recurrence and death will be the labels for our experiment
labels.loc[:,'Local recurrence'] = np.where(labels['Days to local recurrence'] != 'NP', 1, 0)
labels.loc[:,'Distant recurrence'] = np.where(labels['Days to distant recurrence'] != 'NP', 1, 0)
labels.loc[:,'Recurrence'] = np.logical_or(labels['Local recurrence'], labels['Distant recurrence']).astype(int)
labels.loc[:,'Dead'] = np.where(labels['Days to death'] != 'NP', 1, 0)

# Number how many of each label we have
print(labels['Local recurrence'].value_counts(), labels['Distant recurrence'].value_counts(),
      labels['Recurrence'].value_counts(), labels['Dead'].value_counts(), sep="\n\n")

Local recurrence
0    906
1     16
Name: count, dtype: int64

Distant recurrence
0    846
1     76
Name: count, dtype: int64

Recurrence
0    832
1     90
Name: count, dtype: int64

Dead
0    860
1     62
Name: count, dtype: int64


## Predicting risk of recurrence

In [111]:
from sklearn.model_selection import train_test_split

# Define target label
y = labels['Recurrence']

# Perform stratified 80/20 split
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2, stratify=y, random_state=42)

# Display the class distribution in the train and test sets
print("Training set class distribution:\n", y_train.value_counts())
print()
print("Test set class distribution:\n", y_test.value_counts())

Training set class distribution:
 Recurrence
0    665
1     72
Name: count, dtype: int64

Test set class distribution:
 Recurrence
0    167
1     18
Name: count, dtype: int64


In [112]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values