In [72]:
# Data Handleling
import numpy as np
import pandas as pd

from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_bool_dtype
from pandas.api.types import is_categorical_dtype

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Statistical Analysis
import scipy.stats as st
import statsmodels.stats.api as stats
import statsmodels.formula.api as smf
#from sksurv.nonparametric import kaplan_meier_estimator

# Warnings and Messages
from warnings import simplefilter
simplefilter('ignore', category=DeprecationWarning)
simplefilter('ignore', category=FutureWarning)
from logging import warning as warn

# Print Options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1200)
np.set_printoptions(precision = 3, edgeitems=20, linewidth=300)


In [73]:
def nan_compare(df, col1, col2, greater = False, equal = False):
    if greater and equal:
        ret = df[col1].ge(df[col2].values)
    elif greater and not equal:
        ret = df[col1].gt(df[col2].values)
    elif not greater and equal:
        ret = df[col1].le(df[col2].values)
    else:
        ret = df[col1].lt(df[col2].values)

    # impute nan
    ret = ret.astype('boolean')
    ret[df[col1].isna()] = pd.NA 
    ret[df[col2].isna()] = pd.NA 

    return ret

In [74]:
df = pd.read_parquet('../data/processed/patient_records.parquet')
print("shape:", df.shape)
display(df.head(n = 10))

shape: (11749, 28)


Unnamed: 0,ehr,is_alive,age,diag_dur,age_bin,pregnancy,birth,caesarean,abort,menarche_age,menopause_age,n_tumor,t_category,n_category,m_category,t_category_after_neoadj,n_category_after_neoadj,m_category_after_neoadj,stage_diagnosis,stage_after_neo,grade,ductal,lobular,neoadjuvant,er,her2,ki67,pr
0,10011773,True,64.0,105.0,Senior,,0.0,,0.0,,,1.0,IS,0.0,False,,,,0,,1.0,,,False,False,False,0.19,
1,10020495,True,70.0,73.0,Senior +,,,,,,,1.0,1,0.0,False,,,,IA,,2.0,True,False,False,True,False,0.09,False
2,10020495,True,70.0,73.0,Senior +,,,,,,,2.0,3,1.0,False,2.0,0.0,,IA,,2.0,True,False,True,True,False,0.09,False
3,10030299,True,57.0,54.0,Senior,3.0,3.0,0.0,0.0,,,1.0,1,0.0,False,,,,IA,,1.0,True,False,False,True,,0.18,True
4,10030824,True,70.0,64.0,Senior +,0.0,0.0,0.0,0.0,,44.0,1.0,2,1.0,False,2.0,2.0,False,IIIA,IIIA,3.0,False,True,True,True,False,,True
5,10041592,True,64.0,61.0,Senior,,,,,,,1.0,1,1.0,False,4.0,0.0,False,IB,IA,2.0,True,False,True,False,False,0.65,False
6,10053435,True,66.0,73.0,Senior +,2.0,1.0,0.0,1.0,,,1.0,0,0.0,False,,,,IV,,3.0,True,False,False,True,False,,True
7,10070718,True,62.0,58.0,Senior,,,,,,,1.0,0,1.0,False,2.0,3.0,False,IIA,IIIC,3.0,True,False,True,,True,0.3,False
8,10072646,True,61.0,72.0,Senior,,,,,,,1.0,2,1.0,False,1.0,1.0,False,IIB,IA,1.0,False,True,True,False,False,0.28,True
9,10085060,True,67.0,76.0,Senior +,,,,,,,1.0,0,1.0,False,1.0,0.0,True,IIA,IA,2.0,True,False,True,True,False,0.14,


Quitar:
* date_bin o age
* (t_category	n_category	m_category) o (stage_diagnosis)
* (t_category_after_neoadj	n_category_after_neoadj	m_category_after_neoadj) or (stage_after_neo)

Imputar con la media poblacional
* menopause_age, menarche_age
* birth	caesarean abort: imputar con 0 
* pregnancy = birth	+ caesarean + abort


In [75]:
df.head(n = 10)

Unnamed: 0,ehr,is_alive,age,diag_dur,age_bin,pregnancy,birth,caesarean,abort,menarche_age,menopause_age,n_tumor,t_category,n_category,m_category,t_category_after_neoadj,n_category_after_neoadj,m_category_after_neoadj,stage_diagnosis,stage_after_neo,grade,ductal,lobular,neoadjuvant,er,her2,ki67,pr
0,10011773,True,64.0,105.0,Senior,,0.0,,0.0,,,1.0,IS,0.0,False,,,,0,,1.0,,,False,False,False,0.19,
1,10020495,True,70.0,73.0,Senior +,,,,,,,1.0,1,0.0,False,,,,IA,,2.0,True,False,False,True,False,0.09,False
2,10020495,True,70.0,73.0,Senior +,,,,,,,2.0,3,1.0,False,2.0,0.0,,IA,,2.0,True,False,True,True,False,0.09,False
3,10030299,True,57.0,54.0,Senior,3.0,3.0,0.0,0.0,,,1.0,1,0.0,False,,,,IA,,1.0,True,False,False,True,,0.18,True
4,10030824,True,70.0,64.0,Senior +,0.0,0.0,0.0,0.0,,44.0,1.0,2,1.0,False,2.0,2.0,False,IIIA,IIIA,3.0,False,True,True,True,False,,True
5,10041592,True,64.0,61.0,Senior,,,,,,,1.0,1,1.0,False,4.0,0.0,False,IB,IA,2.0,True,False,True,False,False,0.65,False
6,10053435,True,66.0,73.0,Senior +,2.0,1.0,0.0,1.0,,,1.0,0,0.0,False,,,,IV,,3.0,True,False,False,True,False,,True
7,10070718,True,62.0,58.0,Senior,,,,,,,1.0,0,1.0,False,2.0,3.0,False,IIA,IIIC,3.0,True,False,True,,True,0.3,False
8,10072646,True,61.0,72.0,Senior,,,,,,,1.0,2,1.0,False,1.0,1.0,False,IIB,IA,1.0,False,True,True,False,False,0.28,True
9,10085060,True,67.0,76.0,Senior +,,,,,,,1.0,0,1.0,False,1.0,0.0,True,IIA,IA,2.0,True,False,True,True,False,0.14,


In [76]:
def preprocess(df, stage = False):
    """
    Quitar:
    * date_bin o age
    * (t_category	n_category	m_category) o (stage_diagnosis)
    * (t_category_after_neoadj	n_category_after_neoadj	m_category_after_neoadj) or (stage_after_neo)

    Imputar con la media poblacional
    * menopause_age, menarche_age
    * birth	caesarean abort: imputar con 0 
    * pregnancy = birth	+ caesarean + abort
    """

    df = df.copy()

    df['stage_worsen'] = nan_compare(df, 'stage_diagnosis', 'stage_after_neo', greater=False, equal=False)

    # Imputar
    # Demo
    df['diag_dur'] = df['diag_dur'].fillna(df['diag_dur'].median())
    # Gine
    df['birth'] = df['birth'].fillna(0)
    df['caesarean'] = df['caesarean'].fillna(0)
    df['abort'] = df['abort'].fillna(0)
    df['menopause_age'] = df['menopause_age'].fillna(51)
    df['menarche_age'] = df['menarche_age'].fillna(12)

    df['pregnancy'] = df['pregnancy'].fillna(df['birth'] + df['caesarean'] + df['abort'])

    # Tumor
    df['n_tumor'] = df['n_tumor'].fillna(1)
    df['ductal'] = df['ductal'].fillna(1)
    df['lobular'] = df['lobular'].fillna(0)


    # Nomenclatura TNM: eliminar redundancia
    if stage:
        df = df.drop(columns = ['t_category', 'n_category', 'm_category', 't_category_after_neoadj', 'n_category_after_neoadj', 'm_category_after_neoadj'], errors = 'raise')
        df['stage_diagnosis'] = df['stage_diagnosis'].cat.codes
        df['stage_after_neo'] = df['stage_after_neo'].cat.codes
    else:
        df = df.drop(columns = ['stage_diagnosis', 'stage_after_neo'], errors = 'raise')
        df['t_category'] = df['t_category'].cat.codes
        df['t_category_after_neoadj'] = df['t_category_after_neoadj'].cat.codes

    df = df.loc[df['neoadjuvant'].notna(), :]
    df = df.drop(columns = ['age_bin'])

    
    # histochemistry
    #df['er'] = df['er'].fillna(1)
    #df['her2'] = df['her2'].fillna(0)
    #df['pr'] = df['pr'].fillna(1)
    #df['ki67'] = df['ki67'].fillna(0.18) # Mediana
    
    return df

ddf = preprocess(df, stage = False)
display	(ddf.head(n = 10))
ddf.isna().sum(axis = 0)

Unnamed: 0,ehr,is_alive,age,diag_dur,pregnancy,birth,caesarean,abort,menarche_age,menopause_age,n_tumor,t_category,n_category,m_category,t_category_after_neoadj,n_category_after_neoadj,m_category_after_neoadj,grade,ductal,lobular,neoadjuvant,er,her2,ki67,pr,stage_worsen
0,10011773,True,64.0,105.0,0.0,0.0,0.0,0.0,12.0,51.0,1.0,0,0.0,False,-1,,,1.0,True,False,False,False,False,0.19,,
1,10020495,True,70.0,73.0,0.0,0.0,0.0,0.0,12.0,51.0,1.0,2,0.0,False,-1,,,2.0,True,False,False,True,False,0.09,False,
2,10020495,True,70.0,73.0,0.0,0.0,0.0,0.0,12.0,51.0,2.0,4,1.0,False,3,0.0,,2.0,True,False,True,True,False,0.09,False,
3,10030299,True,57.0,54.0,3.0,3.0,0.0,0.0,12.0,51.0,1.0,2,0.0,False,-1,,,1.0,True,False,False,True,,0.18,True,
4,10030824,True,70.0,64.0,0.0,0.0,0.0,0.0,12.0,44.0,1.0,3,1.0,False,3,2.0,False,3.0,False,True,True,True,False,,True,False
5,10041592,True,64.0,61.0,0.0,0.0,0.0,0.0,12.0,51.0,1.0,2,1.0,False,5,0.0,False,2.0,True,False,True,False,False,0.65,False,False
6,10053435,True,66.0,73.0,2.0,1.0,0.0,1.0,12.0,51.0,1.0,1,0.0,False,-1,,,3.0,True,False,False,True,False,,True,
7,10070718,True,62.0,58.0,0.0,0.0,0.0,0.0,12.0,51.0,1.0,1,1.0,False,3,3.0,False,3.0,True,False,True,,True,0.3,False,True
8,10072646,True,61.0,72.0,0.0,0.0,0.0,0.0,12.0,51.0,1.0,3,1.0,False,2,1.0,False,1.0,False,True,True,False,False,0.28,True,False
9,10085060,True,67.0,76.0,0.0,0.0,0.0,0.0,12.0,51.0,1.0,1,1.0,False,2,0.0,True,2.0,True,False,True,True,False,0.14,,False


ehr                           0
is_alive                      0
age                           0
diag_dur                      0
pregnancy                     0
birth                         0
caesarean                     0
abort                         0
menarche_age                  0
menopause_age                 0
n_tumor                       0
t_category                    0
n_category                    0
m_category                    0
t_category_after_neoadj       0
n_category_after_neoadj    5776
m_category_after_neoadj    5827
grade                         0
ductal                        0
lobular                       0
neoadjuvant                   0
er                         1125
her2                       1162
ki67                       1133
pr                         1156
stage_worsen               5371
dtype: int64

In [90]:
import imblearn
from imblearn.over_sampling import SMOTE

from sklearn.metrics import accuracy_score, recall_score
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV


# 2 modelo: 

In [101]:
# split data
neo_df = ddf.loc[ddf['neoadjuvant'].values, ]
neo_df = neo_df.drop(columns = ['ehr', 'stage_worsen', 'neoadjuvant'])
neo_df = neo_df.dropna()
print("shape:", neo_df.shape)
display(neo_df.head(n = 10))

X = neo_df.drop(columns=['is_alive'])
Y = neo_df['is_alive']

shape: (3928, 23)


Unnamed: 0,is_alive,age,diag_dur,pregnancy,birth,caesarean,abort,menarche_age,menopause_age,n_tumor,t_category,n_category,m_category,t_category_after_neoadj,n_category_after_neoadj,m_category_after_neoadj,grade,ductal,lobular,er,her2,ki67,pr
5,True,64.0,61.0,0.0,0.0,0.0,0.0,12.0,51.0,1.0,2,1.0,False,5,0.0,False,2.0,True,False,False,False,0.65,False
8,True,61.0,72.0,0.0,0.0,0.0,0.0,12.0,51.0,1.0,3,1.0,False,2,1.0,False,1.0,False,True,False,False,0.28,True
10,True,77.0,77.0,0.0,0.0,0.0,0.0,12.0,51.0,1.0,3,1.0,False,2,0.0,False,2.0,True,False,False,False,0.19,False
11,True,62.0,79.0,0.0,0.0,0.0,0.0,16.0,55.0,1.0,2,0.0,False,2,0.0,False,2.0,True,False,True,False,0.72,True
13,True,61.0,73.0,1.0,1.0,0.0,0.0,15.0,48.0,1.0,3,1.0,False,3,2.0,False,1.0,True,False,False,True,0.52,False
14,False,67.0,72.0,3.0,3.0,0.0,0.0,14.0,53.0,1.0,2,1.0,False,2,0.0,False,2.0,True,False,True,False,0.12,False
15,True,72.0,84.0,7.0,7.0,0.0,0.0,12.0,51.0,1.0,2,0.0,False,2,0.0,True,3.0,True,False,True,False,0.15,True
18,True,51.0,63.0,1.0,0.0,0.0,1.0,14.0,51.0,1.0,2,0.0,False,2,0.0,True,2.0,False,True,True,False,0.2,True
19,True,51.0,63.0,1.0,0.0,0.0,1.0,14.0,51.0,2.0,2,1.0,False,2,0.0,True,2.0,False,True,True,False,0.2,True
22,True,72.0,6.0,0.0,0.0,0.0,0.0,12.0,51.0,1.0,3,0.0,False,2,1.0,False,1.0,False,True,False,False,0.25,True


In [102]:
# upsample
smote = SMOTE(sampling_strategy = 0.5, random_state = 10101)
X = X.astype(float)
up_x, up_y = smote.fit_resample(X, Y)
up_x.head()

Unnamed: 0,age,diag_dur,pregnancy,birth,caesarean,abort,menarche_age,menopause_age,n_tumor,t_category,n_category,m_category,t_category_after_neoadj,n_category_after_neoadj,m_category_after_neoadj,grade,ductal,lobular,er,her2,ki67,pr
0,64.0,61.0,0.0,0.0,0.0,0.0,12.0,51.0,1.0,2.0,1.0,0.0,5.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.65,0.0
1,61.0,72.0,0.0,0.0,0.0,0.0,12.0,51.0,1.0,3.0,1.0,0.0,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.28,1.0
2,77.0,77.0,0.0,0.0,0.0,0.0,12.0,51.0,1.0,3.0,1.0,0.0,2.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.19,0.0
3,62.0,79.0,0.0,0.0,0.0,0.0,16.0,55.0,1.0,2.0,0.0,0.0,2.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0,0.72,1.0
4,61.0,73.0,1.0,1.0,0.0,0.0,15.0,48.0,1.0,3.0,1.0,0.0,3.0,2.0,0.0,1.0,1.0,0.0,0.0,1.0,0.52,0.0


In [81]:
x_train, x_test, y_train, y_test = train_test_split(up_x, up_y, train_size = 0.8, random_state = 10101)
print("train shape:", x_train.shape)
print("test  shape:", x_test.shape)

train shape: (4204, 22)
test  shape: (1052, 22)


In [89]:
log_model = LogisticRegressionCV(Cs = [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 1, 1.15, 1.20, 1.25, 10], max_iter = 10000, cv = 10, random_state = 10101)
log_model = log_model.fit(x_train, y_train)

print(log_model.C_)

y_pred = log_model.predict(x_test)
print(classification_report(y_test, y_pred))

[0.5]
              precision    recall  f1-score   support

         0.0       0.58      0.25      0.35       360
         1.0       0.70      0.90      0.79       692

    accuracy                           0.68      1052
   macro avg       0.64      0.58      0.57      1052
weighted avg       0.66      0.68      0.64      1052



In [107]:
svc_model = SVC()
params = {'C': np.power(10.0, np.arange(-5, 2))}
grid = GridSearchCV(svc_model, params, cv = 10, verbose = 2, n_jobs=3, refit = True)
grid = grid.fit(x_train, y_train)
print(grid.best_params_)

Fitting 10 folds for each of 7 candidates, totalling 70 fits
{'C': 10.0}


In [108]:
svc_model = grid.best_estimator_
y_pred = svc_model.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.64      0.17      0.27       360
         1.0       0.69      0.95      0.80       692

    accuracy                           0.68      1052
   macro avg       0.66      0.56      0.53      1052
weighted avg       0.67      0.68      0.62      1052



In [109]:
random_forest = RandomForestClassifier(random_state=10101)
params = {'n_estimators': np.arange(50, 501, 50), 'max_depth': [1, 5]}
grid = GridSearchCV(random_forest, params, cv = 10, verbose = 2, n_jobs=3, refit = True)
grid = grid.fit(x_train, y_train)
print(grid.best_params_)


Fitting 10 folds for each of 20 candidates, totalling 200 fits
{'max_depth': 5, 'n_estimators': 200}


In [110]:
random_forest = grid.best_estimator_
y_pred = random_forest.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.99      0.58      0.73       360
         1.0       0.82      1.00      0.90       692

    accuracy                           0.85      1052
   macro avg       0.90      0.79      0.81      1052
weighted avg       0.88      0.85      0.84      1052

