In [1]:
from preprocessing import readcounts_processing_pipeline, pheno_processing_pipeline
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the raw phenotype tables for both splits.
pheno_df_train = pd.read_csv('./train/pheno_training.csv')
pheno_df_test = pd.read_csv('./test/pheno_test.csv')

# Keep the summary as the LAST expression of the cell: Jupyter only renders
# the final expression, so a describe() call in the middle of the cell is
# computed and silently discarded.
# NOTE(review): the percentages quoted in the markdown that follows presumably
# come from the `mean` row of this table — confirm.
pheno_df_train.describe()


- 2.6% of the patients in the cohort experienced heart failure (HF) before baseline.
- 8.4% of the patients experienced HF after baseline.

Note that `Event` corresponds to HF occurring after baseline ("excluding those occur before baseline").

### Check that all patients with PrevalentHFAIL == 1 indeed have Event_time < 0

In [3]:
# Number of training patients whose PrevalentHFAIL flag equals 1.
pheno_df_train["PrevalentHFAIL"].eq(1).sum()

93

93 patients experienced HF before the baseline

In [4]:
# Number of training patients with a negative Event_time.
pheno_df_train["Event_time"].lt(0).sum()

93

In [5]:
# Rows satisfying both conditions at once; if this equals each individual
# count, the two masks select exactly the same patients.
negative_time = pheno_df_train["Event_time"].lt(0)
prevalent_hf = pheno_df_train["PrevalentHFAIL"].eq(1)
(negative_time & prevalent_hf).sum()

93

### Check that all patients with Event_time < 0 (that is, PrevalentHFAIL == 1) have Event == 0

In [6]:
# Boolean mask of rows that record an event (Event == 1) at a negative event time.
negative_time = pheno_df_train["Event_time"].lt(0)
has_event = pheno_df_train["Event"].eq(1)
artifacts = negative_time & has_event
artifacts.sum()

7

These 7 patients have `Event == 1` despite `Event_time < 0`. This is an artifact from the simulation (see discussion on the challenge page).

In [7]:
# Run the project's phenotype preprocessing on the raw training table and
# rebind the same name to the result; every later cell reads this processed frame.
# NOTE(review): because the input name is overwritten, re-running this cell
# feeds already-processed data back into the pipeline — re-run the loading
# cell first when re-executing.
pheno_df_train = pheno_processing_pipeline(pheno_df_train)
pheno_df_train

Unnamed: 0,Age,BodyMassIndex,Smoking=1,BPTreatment=1,PrevalentDiabetes=1,PrevalentCHD=1,Event,Event_time,SystolicBP,NonHDLcholesterol,Sex=1
Simulated_328,53.618,24.127,0.0,0.0,0.0,0.0,False,15.75,133.077,3.02,0.0
Simulated_1644,36.811,27.992,0.0,0.0,0.0,0.0,False,15.881,108.914,5.48,0.0
Simulated_1710,49.429,23.664,0.0,0.0,0.0,0.0,False,15.891,110.064,4.388,1.0
Simulated_1732,48.842,26.804,0.0,0.0,0.0,0.0,False,15.918,128.059,5.119,0.0
Simulated_1727,60.738,29.862,0.0,0.0,0.0,0.0,False,15.841,169.913,5.74,1.0
...,...,...,...,...,...,...,...,...,...,...,...
Simulated_1783,33.802,37.049,0.0,0.0,0.0,0.0,False,15.942,109.08,3.141,0.0
Simulated_3425,69.249,36.8,0.0,0.0,1.0,0.0,False,15.781,145.953,5.478,1.0
Simulated_1789,28.561,26.463,0.0,0.0,0.0,0.0,False,12.198,124.091,4.87,1.0
Simulated_1592,70.278,31.945,0.0,1.0,0.0,0.0,False,15.609,142.038,2.492,0.0


- BPTreatment = blood pressure treatment
- PrevalentCHD = prevalent coronary heart disease

### Check that training set and test set do not intersect

In [8]:
# Display the processed training frame again before comparing it with the test set.
pheno_df_train

Unnamed: 0,Age,BodyMassIndex,Smoking=1,BPTreatment=1,PrevalentDiabetes=1,PrevalentCHD=1,Event,Event_time,SystolicBP,NonHDLcholesterol,Sex=1
Simulated_328,53.618,24.127,0.0,0.0,0.0,0.0,False,15.75,133.077,3.02,0.0
Simulated_1644,36.811,27.992,0.0,0.0,0.0,0.0,False,15.881,108.914,5.48,0.0
Simulated_1710,49.429,23.664,0.0,0.0,0.0,0.0,False,15.891,110.064,4.388,1.0
Simulated_1732,48.842,26.804,0.0,0.0,0.0,0.0,False,15.918,128.059,5.119,0.0
Simulated_1727,60.738,29.862,0.0,0.0,0.0,0.0,False,15.841,169.913,5.74,1.0
...,...,...,...,...,...,...,...,...,...,...,...
Simulated_1783,33.802,37.049,0.0,0.0,0.0,0.0,False,15.942,109.08,3.141,0.0
Simulated_3425,69.249,36.8,0.0,0.0,1.0,0.0,False,15.781,145.953,5.478,1.0
Simulated_1789,28.561,26.463,0.0,0.0,0.0,0.0,False,12.198,124.091,4.87,1.0
Simulated_1592,70.278,31.945,0.0,1.0,0.0,0.0,False,15.609,142.038,2.492,0.0


In [9]:
# Overlap check 1: sample identifiers.
# The processed training frame shows the sample IDs as row labels, while the
# raw test frame keeps them in the unnamed first CSV column, which pandas
# loads as "Unnamed: 0".
# NOTE(review): assumes the training index holds the sample IDs after
# preprocessing — confirm against `pheno_processing_pipeline`.
shared_ids = pheno_df_train.index.intersection(pheno_df_test["Unnamed: 0"])
print(f"Sample IDs present in both train and test: {len(shared_ids)}")

# Overlap check 2: identical covariate values.
# With no `on=` argument, merge() joins on every column name the two frames
# share. Train is processed and test is raw, so only the columns that were
# not renamed take part in the join; the sample IDs are NOT compared here,
# which is why the identifier check above is needed as well.
int_df = pd.merge(pheno_df_train, pheno_df_test, how='inner')
print(int_df)

Empty DataFrame
Columns: [Age, BodyMassIndex, Smoking=1, BPTreatment=1, PrevalentDiabetes=1, PrevalentCHD=1, Event, Event_time, SystolicBP, NonHDLcholesterol, Sex=1, Unnamed: 0, Smoking, BPTreatment, PrevalentDiabetes, PrevalentCHD, PrevalentHFAIL, Sex]
Index: []


In [12]:
import scipy.stats as stats

# Contingency table of smoking status (rows) against the Event flag (columns).
crosstab = pd.crosstab(
    index=pheno_df_train['Smoking=1'],
    columns=pheno_df_train['Event'],
)
crosstab

Event,False,True
Smoking=1,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,2406,237
1.0,751,54


In [13]:
# Chi-square test of independence on the most recently built `crosstab`.
# Result order: (statistic, p-value, degrees of freedom, expected counts).
# SciPy applies Yates' continuity correction by default when dof == 1.
stats.chi2_contingency(observed=crosstab)

(3.787912668503087,
 0.051623988357987226,
 1,
 array([[2419.93938515,  223.06061485],
        [ 737.06061485,   67.93938515]]))

In [14]:
# Contingency table of blood-pressure treatment (rows) against the Event flag (columns).
crosstab = pd.crosstab(
    index=pheno_df_train['BPTreatment=1'],
    columns=pheno_df_train['Event'],
)
crosstab

Event,False,True
BPTreatment=1,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,2674,238
1.0,483,53


In [15]:
# Chi-square test of independence on the most recently built `crosstab`.
# Result order: (statistic, p-value, degrees of freedom, expected counts).
# SciPy applies Yates' continuity correction by default when dof == 1.
stats.chi2_contingency(observed=crosstab)

(1.508172147740657,
 0.21941821239553866,
 1,
 array([[2666.23665893,  245.76334107],
        [ 490.76334107,   45.23665893]]))

In [16]:
# Contingency table of prevalent diabetes (rows) against the Event flag (columns).
crosstab = pd.crosstab(
    index=pheno_df_train['PrevalentDiabetes=1'],
    columns=pheno_df_train['Event'],
)
crosstab

Event,False,True
PrevalentDiabetes=1,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,2964,271
1.0,193,20


In [17]:
# Chi-square test of independence on the most recently built `crosstab`.
# Result order: (statistic, p-value, degrees of freedom, expected counts).
# SciPy applies Yates' continuity correction by default when dof == 1.
stats.chi2_contingency(observed=crosstab)

(0.1503005602504273,
 0.6982482975121406,
 1,
 array([[2961.97650812,  273.02349188],
        [ 195.02349188,   17.97650812]]))

In [18]:
# Contingency table of the sex indicator (rows) against the Event flag (columns).
crosstab = pd.crosstab(
    index=pheno_df_train['Sex=1'],
    columns=pheno_df_train['Event'],
)
crosstab

Event,False,True
Sex=1,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1758,156
1.0,1399,135


In [19]:
# Chi-square test of independence on the most recently built `crosstab`.
# Result order: (statistic, p-value, degrees of freedom, expected counts).
# SciPy applies Yates' continuity correction by default when dof == 1.
stats.chi2_contingency(observed=crosstab)

(0.385329178859515,
 0.5347648592328675,
 1,
 array([[1752.46461717,  161.53538283],
        [1404.53538283,  129.46461717]]))

In [20]:
# Contingency table of prevalent coronary heart disease (rows) against the Event flag (columns).
crosstab = pd.crosstab(
    index=pheno_df_train['PrevalentCHD=1'],
    columns=pheno_df_train['Event'],
)
crosstab

Event,False,True
PrevalentCHD=1,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,3063,274
1.0,94,17


In [21]:
# Chi-square test of independence on the most recently built `crosstab`.
# Result order: (statistic, p-value, degrees of freedom, expected counts).
# SciPy applies Yates' continuity correction by default when dof == 1.
stats.chi2_contingency(observed=crosstab)

(6.127352710227614,
 0.013310603158111651,
 1,
 array([[3055.36803944,  281.63196056],
        [ 101.63196056,    9.36803944]]))

In [27]:
from sklearn.linear_model import LogisticRegression

# Univariate logistic regression of the Event flag on body-mass index.
# scikit-learn expects a 2-D feature matrix, so the single column is reshaped
# to (n_samples, 1).
bmi_feature = pheno_df_train['BodyMassIndex'].to_numpy().reshape(-1, 1)
event_label = pheno_df_train['Event']
lgc = LogisticRegression().fit(bmi_feature, event_label)
 