In [210]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# 5 Questions
- Predict if patient survives stroke or not.
- Predict if patient will have disability.
- What are the main factors contributing to death?
- How does our model's predicted probability of death at 14 days/6 months align with the doctors' predictions?
- What are positive factors for stroke survival?

In [211]:
# Load the raw trial export. low_memory=False makes pandas parse the file in one pass,
# which avoids the mixed-dtype inference that chunked parsing can produce.
stroke_trials_df = pd.read_csv(
    'Resources/V2_International_Stroke_Trials.csv',
    low_memory=False,
)
stroke_trials_df.head(5)

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RASP3,RHEP24,RSBP,RCONSC,...,DPLACE,DDEAD,DDEADC,FPLACE,OCCODE,FDEADC,CNTRYNUM,EXPDD,EXPD6,EXPD14
0,M,69,Y,,Y,Y,,,140,D,...,,N,0.0,E,2,,27,0.698,0.2344,0.1054
1,M,76,Y,,Y,N,,,150,F,...,,N,0.0,A,2,,27,0.5389,0.1555,0.0421
2,F,71,N,,Y,N,,,170,F,...,,N,0.0,A,2,,27,0.5275,0.1009,0.0323
3,M,81,N,,N,N,,,170,F,...,,N,0.0,A,4,,27,0.4021,0.1147,0.0244
4,M,78,N,,N,N,,,170,F,...,,N,0.0,E,2,,27,0.56,0.1709,0.0441


In [212]:
# List every column name in the raw frame so the available variables can be reviewed.
print(stroke_trials_df.columns)

Index(['SEX', 'AGE', 'RSLEEP', 'RATRIAL', 'RCT', 'RVISINF', 'RASP3', 'RHEP24',
       'RSBP', 'RCONSC', 'RDEF1', 'RDEF2', 'RDEF3', 'RDEF4', 'RDEF5', 'RDEF6',
       'RDEF7', 'RDEF8', 'STYPE', 'RXASP', 'RXHEP', 'DASP14', 'DASPLT',
       'DLH14', 'DMH14', 'DHH14', 'ONDRUG', 'DSCH', 'DIVH', 'DAP', 'DOAC',
       'DGORM', 'DSTER', 'DCAA', 'DHAEMD', 'DCAREND', 'DTHROMB', 'DMAJNCH',
       'DDIAGISC', 'DDIAGHA', 'DDIAGUN', 'DNOSTRK', 'DRSISC', 'DRSH', 'DRSUNK',
       'DPE', 'DALIVE', 'DPLACE', 'DDEAD', 'DDEADC', 'FPLACE', 'OCCODE',
       'FDEADC', 'CNTRYNUM', 'EXPDD', 'EXPD6', 'EXPD14'],
      dtype='object')


In [213]:
# Distribution of DCAREND codes in the raw data (NaN entries are not counted by value_counts).
stroke_trials_df['DCAREND'].value_counts()

DCAREND
N    18352
Y       50
U       28
Name: count, dtype: int64

In [214]:
# Normalise the case of every text (object-dtype) column so that codes such as 'y' and 'Y'
# are treated as the same value; non-text columns are passed through untouched.
def _upper_if_text(column):
    """Return the column upper-cased when it has object dtype, otherwise unchanged."""
    if column.dtype == "object":
        return column.str.upper()
    return column


stroke_trials_df = stroke_trials_df.apply(_upper_if_text)
stroke_trials_df.head(10)

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RASP3,RHEP24,RSBP,RCONSC,...,DPLACE,DDEAD,DDEADC,FPLACE,OCCODE,FDEADC,CNTRYNUM,EXPDD,EXPD6,EXPD14
0,M,69,Y,,Y,Y,,,140,D,...,,N,0.0,E,2,,27,0.698,0.2344,0.1054
1,M,76,Y,,Y,N,,,150,F,...,,N,0.0,A,2,,27,0.5389,0.1555,0.0421
2,F,71,N,,Y,N,,,170,F,...,,N,0.0,A,2,,27,0.5275,0.1009,0.0323
3,M,81,N,,N,N,,,170,F,...,,N,0.0,A,4,,27,0.4021,0.1147,0.0244
4,M,78,N,,N,N,,,170,F,...,,N,0.0,E,2,,27,0.56,0.1709,0.0441
5,M,54,N,,Y,N,,,135,F,...,,N,0.0,A,3,,27,0.3132,0.0471,0.0244
6,F,77,N,,N,N,,,140,F,...,,N,0.0,A,3,,27,0.7285,0.1873,0.053
7,M,23,N,,Y,N,,,120,F,...,,N,0.0,A,3,,27,0.15,0.0038,0.0054
8,M,47,N,,N,N,,,150,F,...,,N,0.0,A,3,,27,0.1334,0.0162,0.0102
9,M,81,Y,,N,N,,,170,F,...,,N,0.0,A,2,,27,0.4964,0.1476,0.0337


In [215]:
# Summary statistics for every column; include='all' adds the object (categorical) columns
# alongside the numeric ones.
stroke_trials_df.describe(include='all')

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RASP3,RHEP24,RSBP,RCONSC,...,DPLACE,DDEAD,DDEADC,FPLACE,OCCODE,FDEADC,CNTRYNUM,EXPDD,EXPD6,EXPD14
count,19435,19435.0,19435,18451,19435,19435,0.0,19091,19435.0,19435,...,9726,19416,2334.0,14933,19435.0,4364.0,19435.0,19435.0,19435.0,19435.0
unique,2,,2,2,2,2,,2,,3,...,6,3,,6,,,,,,
top,M,,N,N,Y,N,,N,,F,...,A,N,,A,,,,,,
freq,10407,,13750,15282,13024,13020,,18655,,14921,...,7062,17376,,11607,,,,,,
mean,,71.71541,,,,,,,160.159197,,...,,,2.404456,,2.328891,3.441797,20.965629,0.629215,0.229676,0.094167
std,,11.619714,,,,,,,27.610382,,...,,,2.172007,,1.068117,2.382524,8.43951,0.230564,0.194553,0.104473
min,,16.0,,,,,,,70.0,,...,,,0.0,,0.0,0.0,1.0,0.0891,0.0035,0.005
25%,,65.0,,,,,,,140.0,,...,,,1.0,,2.0,1.0,14.0,0.4391,0.0846,0.0285
50%,,73.0,,,,,,,160.0,,...,,,1.0,,2.0,4.0,25.0,0.6378,0.1601,0.0471
75%,,80.0,,,,,,,180.0,,...,,,4.0,,3.0,5.0,27.0,0.83675,0.3192,0.1131


In [216]:
# Report the missing-value count for each column that has at least one NaN.
# The counts are computed once for the whole frame, then filtered to the non-zero entries.
missing_per_column = stroke_trials_df.isna().sum()
for column_name, n_missing in missing_per_column[missing_per_column > 0].items():
    print(f'{column_name}: {n_missing}')

RATRIAL: 984
RASP3: 19435
RHEP24: 344
DASP14: 22
DASPLT: 147
DLH14: 22
DMH14: 1006
DHH14: 18451
ONDRUG: 1
DSCH: 305
DIVH: 305
DAP: 18
DOAC: 18
DGORM: 23
DSTER: 28
DCAA: 29
DHAEMD: 28
DCAREND: 1005
DTHROMB: 315
DMAJNCH: 14
DDIAGISC: 23
DDIAGHA: 26
DDIAGUN: 23
DNOSTRK: 26
DRSISC: 18
DRSH: 15
DRSUNK: 1007
DPE: 14
DALIVE: 28
DPLACE: 9709
DDEAD: 19
DDEADC: 17101
FPLACE: 4502
FDEADC: 15071


In [217]:
# Drop the RASP3 column: the missing-value report shows it is NaN in all 19435 rows,
# so it carries no information.
# The frame is rebound rather than mutated with inplace=True, matching how every other
# drop in this notebook is written and keeping the step explicit about what it produces.
stroke_trials_df = stroke_trials_df.drop(columns=['RASP3'])

In [218]:
# Replace missing values with the code 'C' in the columns listed in C_columns.
C_columns = ['RATRIAL', 'RHEP24']
stroke_trials_df[C_columns] = stroke_trials_df[C_columns].fillna('C')

In [219]:
# Replace missing values with the code 'U' in the columns listed in U_columns.
U_columns = [
    'DASP14', 'DASPLT', 'DLH14', 'DMH14', 'DHH14', 'DSCH', 'DIVH',
    'DAP', 'DOAC', 'DGORM', 'DSTER', 'DCAA', 'DHAEMD', 'DCAREND',
    'DTHROMB', 'DMAJNCH', 'DDIAGISC', 'DDIAGHA', 'DDIAGUN', 'DNOSTRK',
    'DRSISC', 'DRSH', 'DRSUNK', 'DPE', 'DALIVE', 'DPLACE', 'DDEAD',
    'FPLACE',
]
stroke_trials_df[U_columns] = stroke_trials_df[U_columns].fillna('U')

In [220]:
# Replace missing values with 0 in the numeric columns listed in numeric_columns.
numeric_columns = ['DDEADC', 'ONDRUG', 'FDEADC']
stroke_trials_df[numeric_columns] = stroke_trials_df[numeric_columns].fillna(0)

In [221]:
# Re-check for missing values after the fills; nothing is printed when no column has NaN left.
remaining_missing = stroke_trials_df.isna().sum()
for column_name, n_missing in remaining_missing[remaining_missing > 0].items():
    print(f'{column_name}: {n_missing}')

In [222]:
# Remove every row that still contains a missing value in any column, then summarise the result.
stroke_trials_dropna_df = stroke_trials_df.dropna(axis=0, how='any')
stroke_trials_dropna_df.describe(include='all')

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RHEP24,RSBP,RCONSC,RDEF1,...,DPLACE,DDEAD,DDEADC,FPLACE,OCCODE,FDEADC,CNTRYNUM,EXPDD,EXPD6,EXPD14
count,19435,19435.0,19435,19435,19435,19435,19435,19435.0,19435,19435,...,19435,19435,19435.0,19435,19435.0,19435.0,19435.0,19435.0,19435.0,19435.0
unique,2,,2,3,2,2,3,,3,3,...,6,3,,6,,,,,,
top,M,,N,N,Y,N,N,,F,Y,...,U,N,,A,,,,,,
freq,10407,,13750,15282,13024,13020,18655,,14921,14099,...,9729,17376,,11607,,,,,,
mean,,71.71541,,,,,,160.159197,,,...,,,0.288757,,2.328891,0.772833,20.965629,0.629215,0.229676,0.094167
std,,11.619714,,,,,,27.610382,,,...,,,1.085031,,1.068117,1.826785,8.43951,0.230564,0.194553,0.104473
min,,16.0,,,,,,70.0,,,...,,,0.0,,0.0,0.0,1.0,0.0891,0.0035,0.005
25%,,65.0,,,,,,140.0,,,...,,,0.0,,2.0,0.0,14.0,0.4391,0.0846,0.0285
50%,,73.0,,,,,,160.0,,,...,,,0.0,,2.0,0.0,25.0,0.6378,0.1601,0.0471
75%,,80.0,,,,,,180.0,,,...,,,0.0,,3.0,0.0,27.0,0.83675,0.3192,0.1131


In [223]:
# Filter out rows that have 'Y' in any of the heparin columns, then remove those columns.
# A row is kept only when none of the listed columns equals 'Y'.
# NOTE(review): confirm that every listed column really uses 'Y' to mark heparin use
# (RXHEP in particular); a column coded differently passes this filter without
# excluding any rows.
heparin_columns = ['RHEP24', 'RXHEP', 'DLH14', 'DMH14', 'DHH14', 'DSCH', 'DIVH']

# One vectorised mask and one drop replace the per-column filter/drop loop; the resulting
# rows, row order and remaining columns are the same.
no_heparin_rows = stroke_trials_dropna_df[heparin_columns].ne('Y').all(axis=1)
stroke_trials_dropna_df = stroke_trials_dropna_df.loc[no_heparin_rows].drop(columns=heparin_columns)

# Independent copy so later edits to the heparin-free frame do not touch stroke_trials_dropna_df.
stroke_trials_remove_hep_df = stroke_trials_dropna_df.copy()
stroke_trials_remove_hep_df.describe(include='all')

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RSBP,RCONSC,RDEF1,RDEF2,...,DPLACE,DDEAD,DDEADC,FPLACE,OCCODE,FDEADC,CNTRYNUM,EXPDD,EXPD6,EXPD14
count,9773,9773.0,9773,9773,9773,9773,9773.0,9773,9773,9773,...,9773,9773,9773.0,9773,9773.0,9773.0,9773.0,9773.0,9773.0,9773.0
unique,2,,2,3,2,2,,3,3,3,...,6,3,,6,,,,,,
top,M,,N,N,Y,N,,F,Y,Y,...,U,N,,A,,,,,,
freq,5205,,6904,7642,6286,6655,,7506,7042,8342,...,4848,8711,,5764,,,,,,
mean,,71.826358,,,,,160.236877,,,,...,,,0.283332,,2.323442,0.763328,21.285583,0.6312,0.231756,0.094968
std,,11.666301,,,,,27.92906,,,,...,,,1.061092,,1.071941,1.80404,8.340225,0.231622,0.195793,0.105152
min,,19.0,,,,,71.0,,,,...,,,0.0,,0.0,0.0,1.0,0.0891,0.004,0.0051
25%,,65.0,,,,,140.0,,,,...,,,0.0,,2.0,0.0,14.0,0.4391,0.0851,0.0284
50%,,74.0,,,,,160.0,,,,...,,,0.0,,2.0,0.0,25.0,0.6422,0.1619,0.0476
75%,,80.0,,,,,180.0,,,,...,,,0.0,,3.0,0.0,27.0,0.8401,0.3227,0.115


In [225]:
# Distribution of DCAREND codes after the heparin rows were removed.
stroke_trials_remove_hep_df['DCAREND'].value_counts()

DCAREND
N    9176
U     582
Y      15
Name: count, dtype: int64

In [226]:
# Distribution of DTHROMB codes after the heparin rows were removed.
stroke_trials_remove_hep_df['DTHROMB'].value_counts()

DTHROMB
N    9568
U     200
Y       5
Name: count, dtype: int64

In [227]:
# Remove DCAREND and DTHROMB; the value counts above show only 15 and 5 'Y' rows respectively.
stroke_trials_remove_hep_df = stroke_trials_remove_hep_df.drop(
    ['DCAREND', 'DTHROMB'],
    axis=1,
)

In [228]:
# Snapshot the candidate outcome columns into two independent frames:
# one for the D-prefixed outcome columns and one for the six-month (F-prefixed / OCCODE) columns.
target_14_days_df = stroke_trials_remove_hep_df.loc[
    :, ['DALIVE', 'DPLACE', 'DDEAD', 'DDEADC']
].copy()
target_6_months_df = stroke_trials_remove_hep_df.loc[
    :, ['FPLACE', 'OCCODE', 'FDEADC']
].copy()
target_14_days_df.head()

Unnamed: 0,DALIVE,DPLACE,DDEAD,DDEADC
0,N,U,N,0.0
2,Y,U,N,0.0
5,Y,U,N,0.0
6,N,U,N,0.0
9,Y,U,N,0.0


In [229]:
# Print the value distribution of each candidate 14-day outcome column.
for _, outcome_column in target_14_days_df.items():
    print(outcome_column.value_counts())

DALIVE
Y    5224
N    4524
U      25
Name: count, dtype: int64
DPLACE
U    4848
A    3573
E     801
D     271
C     141
B     139
Name: count, dtype: int64
DDEAD
N    8711
Y    1041
U      21
Name: count, dtype: int64
DDEADC
0.0    8733
1.0     521
4.0     194
2.0     113
5.0      77
6.0      45
7.0      44
8.0      29
3.0      17
Name: count, dtype: int64


- Drop rows with 'U' in the DALIVE or DDEAD column (only a small number of rows).
- Decide how to handle the DPLACE and DDEADC columns. Options: drop the DPLACE column, drop the DDEADC column, drop the rows with 'U' in DPLACE, drop the rows with '0' in DDEADC, or some combination of these. Both columns are dropped for now.

In [230]:
# For the rows whose DDEAD is 'U', count each (DPLACE, DALIVE, DDEADC) combination.
ddead_is_u = stroke_trials_remove_hep_df['DDEAD'].eq('U')
ddead = stroke_trials_remove_hep_df.loc[ddead_is_u]
ddead[['DPLACE', 'DALIVE', 'DDEADC']].value_counts()

DPLACE  DALIVE  DDEADC
U       U       0.0       19
        N       0.0        2
Name: count, dtype: int64

In [231]:
# Keep only the rows where neither DALIVE nor DDEAD is 'U', then summarise what remains.
has_u_outcome = (
    stroke_trials_remove_hep_df['DALIVE'].eq('U')
    | stroke_trials_remove_hep_df['DDEAD'].eq('U')
)
stroke_trials_remove_hep_df = stroke_trials_remove_hep_df[~has_u_outcome]
stroke_trials_remove_hep_df.describe(include='all')

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RSBP,RCONSC,RDEF1,RDEF2,...,DPLACE,DDEAD,DDEADC,FPLACE,OCCODE,FDEADC,CNTRYNUM,EXPDD,EXPD6,EXPD14
count,9746,9746.0,9746,9746,9746,9746,9746.0,9746,9746,9746,...,9746,9746,9746.0,9746,9746.0,9746.0,9746.0,9746.0,9746.0,9746.0
unique,2,,2,3,2,2,,3,3,3,...,6,2,,6,,,,,,
top,M,,N,N,Y,N,,F,Y,Y,...,U,N,,A,,,,,,
freq,5191,,6885,7621,6270,6638,,7489,7024,8317,...,4821,8711,,5757,,,,,,
mean,,71.818592,,,,,160.250564,,,,...,,,0.282577,,2.324236,0.760825,21.266981,0.631035,0.231545,0.094838
std,,11.668252,,,,,27.929285,,,,...,,,1.05953,,1.067329,1.801928,8.341747,0.231549,0.195604,0.104976
min,,19.0,,,,,71.0,,,,...,,,0.0,,0.0,0.0,1.0,0.0891,0.004,0.0051
25%,,65.0,,,,,140.0,,,,...,,,0.0,,2.0,0.0,14.0,0.439025,0.0851,0.0284
50%,,74.0,,,,,160.0,,,,...,,,0.0,,2.0,0.0,25.0,0.642,0.16165,0.0476
75%,,80.0,,,,,180.0,,,,...,,,0.0,,3.0,0.0,27.0,0.839575,0.3222,0.114875


In [232]:
# Remove the DPLACE and DDEADC columns from the working frame.
stroke_trials_remove_hep_df = stroke_trials_remove_hep_df.drop(
    ['DPLACE', 'DDEADC'],
    axis=1,
)

In [233]:
# Print the value distribution of each candidate six-month outcome column.
# Note: target_6_months_df was copied before the rows with 'U' in DALIVE/DDEAD were removed,
# so these counts still include those rows.
for _, outcome_column in target_6_months_df.items():
    print(outcome_column.value_counts())

FPLACE
A    5764
U    2316
D     654
B     448
C     352
E     239
Name: count, dtype: int64
OCCODE
2    3969
1    2148
3    1957
4    1620
0      49
9      30
Name: count, dtype: int64
FDEADC
0.0    7567
1.0     800
4.0     448
2.0     271
7.0     237
5.0     195
8.0     132
6.0      89
3.0      34
Name: count, dtype: int64


- Drop rows with '0' or '9' in the OCCODE column (only a small number of rows).
- Decide how to handle the FPLACE and FDEADC columns. Options: drop the FPLACE column, drop the FDEADC column, drop the rows with 'U' in FPLACE, drop the rows with '0' in FDEADC, or some combination of these. Both columns are dropped for now.

In [234]:
# For the rows whose FDEADC is 0, count each (FPLACE, OCCODE) combination.
fdeadc_is_zero = stroke_trials_remove_hep_df['FDEADC'].eq(0)
fdeadc = stroke_trials_remove_hep_df.loc[fdeadc_is_zero]
fdeadc[['FPLACE', 'OCCODE']].value_counts()

FPLACE  OCCODE
A       2         2472
        3         1788
        4         1497
D       2          630
B       2          293
C       2          263
E       2          219
B       3           96
        4           59
C       3           48
U       0           47
C       4           41
U       9           28
        2           15
        1           13
D       3           12
        4           11
E       3           10
        4           10
U       3            1
        4            1
Name: count, dtype: int64

In [235]:
# Drop the rows whose OCCODE is 0 or 9, then confirm how many distinct codes remain.
has_excluded_occode = stroke_trials_remove_hep_df['OCCODE'].isin([0, 9])
stroke_trials_remove_hep_df = stroke_trials_remove_hep_df[~has_excluded_occode]
stroke_trials_remove_hep_df['OCCODE'].nunique()

4

In [236]:
# Collapse OCCODE to two classes by recoding 3 and 4 as 2 (1 is left unchanged).
# The working frame was produced by boolean filtering, so assigning a column on it directly
# can raise pandas' SettingWithCopyWarning (hidden here by the global warnings filter).
# .assign builds a new frame instead, which sidesteps that ambiguity and keeps column order.
stroke_trials_remove_hep_df = stroke_trials_remove_hep_df.assign(
    OCCODE=stroke_trials_remove_hep_df['OCCODE'].replace({3: 2, 4: 2})
)
stroke_trials_remove_hep_df['OCCODE'].value_counts()

OCCODE
2    7536
1    2135
Name: count, dtype: int64

In [237]:
# Remove the FPLACE and FDEADC columns from the working frame.
stroke_trials_remove_hep_df = stroke_trials_remove_hep_df.drop(
    ['FPLACE', 'FDEADC'],
    axis=1,
)

In [238]:
# Show which columns remain after all the drops above.
stroke_trials_remove_hep_df.columns

Index(['SEX', 'AGE', 'RSLEEP', 'RATRIAL', 'RCT', 'RVISINF', 'RSBP', 'RCONSC',
       'RDEF1', 'RDEF2', 'RDEF3', 'RDEF4', 'RDEF5', 'RDEF6', 'RDEF7', 'RDEF8',
       'STYPE', 'RXASP', 'DASP14', 'DASPLT', 'ONDRUG', 'DAP', 'DOAC', 'DGORM',
       'DSTER', 'DCAA', 'DHAEMD', 'DMAJNCH', 'DDIAGISC', 'DDIAGHA', 'DDIAGUN',
       'DNOSTRK', 'DRSISC', 'DRSH', 'DRSUNK', 'DPE', 'DALIVE', 'DDEAD',
       'OCCODE', 'CNTRYNUM', 'EXPDD', 'EXPD6', 'EXPD14'],
      dtype='object')

Potential Later Options:
- Create fourteen_day_df with DDEAD, DALIVE, DDEADC, or DPLACE as the y
- Replace OCCODE with FPLACE or FDEADC as the y

In [239]:
# Select the columns used for the six-month model. Compared with the working frame this
# leaves out DDEAD, EXPDD, EXPD6 and EXPD14, and places OCCODE last.
six_month_df = stroke_trials_remove_hep_df.loc[:, [
    'SEX', 'AGE', 'RSLEEP', 'RATRIAL', 'RCT', 'RVISINF', 'RSBP', 'RCONSC',
    'RDEF1', 'RDEF2', 'RDEF3', 'RDEF4', 'RDEF5', 'RDEF6', 'RDEF7', 'RDEF8',
    'STYPE', 'RXASP', 'DASP14', 'DASPLT', 'ONDRUG', 'DAP', 'DOAC', 'DGORM',
    'DSTER', 'DCAA', 'DHAEMD', 'DMAJNCH', 'DDIAGISC', 'DDIAGHA', 'DDIAGUN',
    'DNOSTRK', 'DRSISC', 'DRSH', 'DRSUNK', 'DPE', 'DALIVE', 'CNTRYNUM',
    'OCCODE',
]]
six_month_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9671 entries, 0 to 19433
Data columns (total 39 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SEX       9671 non-null   object 
 1   AGE       9671 non-null   int64  
 2   RSLEEP    9671 non-null   object 
 3   RATRIAL   9671 non-null   object 
 4   RCT       9671 non-null   object 
 5   RVISINF   9671 non-null   object 
 6   RSBP      9671 non-null   int64  
 7   RCONSC    9671 non-null   object 
 8   RDEF1     9671 non-null   object 
 9   RDEF2     9671 non-null   object 
 10  RDEF3     9671 non-null   object 
 11  RDEF4     9671 non-null   object 
 12  RDEF5     9671 non-null   object 
 13  RDEF6     9671 non-null   object 
 14  RDEF7     9671 non-null   object 
 15  RDEF8     9671 non-null   object 
 16  STYPE     9671 non-null   object 
 17  RXASP     9671 non-null   object 
 18  DASP14    9671 non-null   object 
 19  DASPLT    9671 non-null   object 
 20  ONDRUG    9671 non-null   float64


In [240]:
# Preview the first rows of the six-month modelling frame before it is written to disk.
six_month_df.head()

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RSBP,RCONSC,RDEF1,RDEF2,...,DDIAGHA,DDIAGUN,DNOSTRK,DRSISC,DRSH,DRSUNK,DPE,DALIVE,CNTRYNUM,OCCODE
0,M,69,Y,C,Y,Y,140,D,N,N,...,N,N,N,N,N,U,N,N,27,2
2,F,71,N,C,Y,N,170,F,Y,Y,...,N,N,N,N,N,U,N,Y,27,2
5,M,54,N,C,Y,N,135,F,Y,Y,...,N,N,N,N,N,U,N,Y,27,2
6,F,77,N,C,N,N,140,F,Y,Y,...,N,N,Y,N,N,U,N,N,27,2
9,M,81,Y,C,N,N,170,F,N,N,...,N,N,N,N,N,U,N,Y,27,2


In [241]:
# Write the six-month modelling frame to CSV. index=False omits the row labels,
# so the file contains only the selected columns.
six_month_df.to_csv(
    path_or_buf='Resources/six_month_data.csv',
    index=False,
)

In [242]:
# Copy the EXPDD and EXPD6 columns into their own frame (the doctor's predicted probabilities).
doctor_pred_probs_df = stroke_trials_remove_hep_df.loc[:, ['EXPDD', 'EXPD6']].copy()
doctor_pred_probs_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9671 entries, 0 to 19433
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   EXPDD   9671 non-null   float64
 1   EXPD6   9671 non-null   float64
dtypes: float64(2)
memory usage: 226.7 KB


In [243]:
# Write the doctor's predicted probabilities to CSV. index=False omits the row labels,
# so rows correspond to the six-month data file by position only.
doctor_pred_probs_df.to_csv(
    path_or_buf='Resources/doctor_6month_pred.csv',
    index=False,
)