In [234]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# 5 Questions
- Predict if patient survives stroke or not.
- Predict if patient will have disability.
- What are the main factors contributing to death?
- How does our model's predicted probability of death at 14 days/6 months align with the doctors predictions?
- What are positive factors for stroke survival?

In [235]:
stroke_trials_df = pd.read_csv('Resources/V2_International_Stroke_Trials.csv', low_memory=False)
stroke_trials_df.head()

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RHEP24,RSBP,RCONSC,RDEF1,...,DPLACE,DDEAD,DDEADC,FPLACE,OCCODE,FDEADC,CNTRYNUM,EXPDD,EXPD6,EXPD14
0,M,69,Y,,Y,Y,,140,D,N,...,,N,0.0,E,2,,27,0.698,0.2344,0.1054
1,M,76,Y,,Y,N,,150,F,Y,...,,N,0.0,A,2,,27,0.5389,0.1555,0.0421
2,F,71,N,,Y,N,,170,F,Y,...,,N,0.0,A,2,,27,0.5275,0.1009,0.0323
3,M,81,N,,N,N,,170,F,N,...,,N,0.0,A,4,,27,0.4021,0.1147,0.0244
4,M,78,N,,N,N,,170,F,Y,...,,N,0.0,E,2,,27,0.56,0.1709,0.0441


In [236]:
print(stroke_trials_df.columns)

Index(['SEX', 'AGE', 'RSLEEP', 'RATRIAL', 'RCT', 'RVISINF', 'RHEP24', 'RSBP',
       'RCONSC', 'RDEF1', 'RDEF2', 'RDEF3', 'RDEF4', 'RDEF5', 'RDEF6', 'RDEF7',
       'RDEF8', 'STYPE', 'RDATE', 'RXASP', 'RXHEP', 'DASP14', 'DASPLT',
       'DLH14', 'DMH14', 'DHH14', 'ONDRUG', 'DSCH', 'DIVH', 'DAP', 'DOAC',
       'DGORM', 'DSTER', 'DCAA', 'DHAEMD', 'DCAREND', 'DTHROMB', 'DMAJNCH',
       'DDIAGISC', 'DDIAGHA', 'DDIAGUN', 'DNOSTRK', 'DRSISC', 'DRSH', 'DRSUNK',
       'DPE', 'DALIVE', 'DPLACE', 'DDEAD', 'DDEADC', 'FPLACE', 'OCCODE',
       'FDEADC', 'CNTRYNUM', 'EXPDD', 'EXPD6', 'EXPD14'],
      dtype='object')


In [237]:
stroke_trials_df.describe(include='all')

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RHEP24,RSBP,RCONSC,RDEF1,...,DPLACE,DDEAD,DDEADC,FPLACE,OCCODE,FDEADC,CNTRYNUM,EXPDD,EXPD6,EXPD14
count,19435,19435.0,19435,18451,19435,19435,19091,19435.0,19435,19435,...,9726,19416,2334.0,14933,19435.0,4364.0,19435.0,19435.0,19435.0,19435.0
unique,2,,2,2,2,2,2,,3,3,...,6,3,,6,,,,,,
top,M,,N,N,Y,N,N,,F,Y,...,A,N,,A,,,,,,
freq,10407,,13750,15282,13024,13020,18655,,14921,14099,...,7062,17376,,11607,,,,,,
mean,,71.71541,,,,,,160.159197,,,...,,,2.404456,,2.328891,3.441797,20.965629,0.629215,0.229676,0.094167
std,,11.619714,,,,,,27.610382,,,...,,,2.172007,,1.068117,2.382524,8.43951,0.230564,0.194553,0.104473
min,,16.0,,,,,,70.0,,,...,,,0.0,,0.0,0.0,1.0,0.0891,0.0035,0.005
25%,,65.0,,,,,,140.0,,,...,,,1.0,,2.0,1.0,14.0,0.4391,0.0846,0.0285
50%,,73.0,,,,,,160.0,,,...,,,1.0,,2.0,4.0,25.0,0.6378,0.1601,0.0471
75%,,80.0,,,,,,180.0,,,...,,,4.0,,3.0,5.0,27.0,0.83675,0.3192,0.1131


In [238]:
# Find the number of na values in each column
for col in stroke_trials_df.columns:
    if stroke_trials_df[col].isna().sum() > 0:
        print(f'{col}: {stroke_trials_df[col].isna().sum()}')

RATRIAL: 984
RHEP24: 344
DASP14: 22
DASPLT: 147
DLH14: 22
DMH14: 1006
DHH14: 18451
ONDRUG: 1
DSCH: 305
DIVH: 305
DAP: 18
DOAC: 18
DGORM: 23
DSTER: 28
DCAA: 29
DHAEMD: 28
DCAREND: 1005
DTHROMB: 315
DMAJNCH: 14
DDIAGISC: 23
DDIAGHA: 26
DDIAGUN: 23
DNOSTRK: 26
DRSISC: 18
DRSH: 15
DRSUNK: 1007
DPE: 14
DALIVE: 28
DPLACE: 9709
DDEAD: 19
DDEADC: 17101
FPLACE: 4502
FDEADC: 15071


In [239]:
# Fill NaN cells with '0' in columns where 0=Unknown, and fill NaN cells with 'U' in columns where U=Unknown.
stroke_trials_df['DDEADC'] = stroke_trials_df['DDEADC'].fillna(0)
stroke_trials_df['DASP14'] = stroke_trials_df['DASP14'].fillna('U')
stroke_trials_df['DASPLT'] = stroke_trials_df['DASPLT'].fillna('U')
stroke_trials_df['DRSUNK'] = stroke_trials_df['DRSUNK'].fillna('U')
stroke_trials_df['DPLACE'] = stroke_trials_df['DPLACE'].fillna('U')
stroke_trials_df['FPLACE'] = stroke_trials_df['FPLACE'].fillna('U')
stroke_trials_df['DMH14'] = stroke_trials_df['DMH14'].fillna('U')
stroke_trials_df['DHH14'] = stroke_trials_df['DHH14'].fillna('U')
stroke_trials_df['DLH14'] = stroke_trials_df['DLH14'].fillna('U')
stroke_trials_df['DSCH'] = stroke_trials_df['DSCH'].fillna('U')
stroke_trials_df['DIVH'] = stroke_trials_df['DIVH'].fillna('U')
stroke_trials_df['DCAREND'] = stroke_trials_df['DCAREND'].fillna('U')
stroke_trials_df['DTHROMB'] = stroke_trials_df['DTHROMB'].fillna('U')
stroke_trials_df['DALIVE'] = stroke_trials_df['DALIVE'].fillna('U')
stroke_trials_df['DAP'] = stroke_trials_df['DAP'].fillna('U')
stroke_trials_df['FDEADC'] = stroke_trials_df['FDEADC'].fillna(0)
stroke_trials_df.describe(include='all')

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RHEP24,RSBP,RCONSC,RDEF1,...,DPLACE,DDEAD,DDEADC,FPLACE,OCCODE,FDEADC,CNTRYNUM,EXPDD,EXPD6,EXPD14
count,19435,19435.0,19435,18451,19435,19435,19091,19435.0,19435,19435,...,19435,19416,19435.0,19435,19435.0,19435.0,19435.0,19435.0,19435.0,19435.0
unique,2,,2,2,2,2,2,,3,3,...,6,3,,6,,,,,,
top,M,,N,N,Y,N,N,,F,Y,...,U,N,,A,,,,,,
freq,10407,,13750,15282,13024,13020,18655,,14921,14099,...,9729,17376,,11607,,,,,,
mean,,71.71541,,,,,,160.159197,,,...,,,0.288757,,2.328891,0.772833,20.965629,0.629215,0.229676,0.094167
std,,11.619714,,,,,,27.610382,,,...,,,1.085031,,1.068117,1.826785,8.43951,0.230564,0.194553,0.104473
min,,16.0,,,,,,70.0,,,...,,,0.0,,0.0,0.0,1.0,0.0891,0.0035,0.005
25%,,65.0,,,,,,140.0,,,...,,,0.0,,2.0,0.0,14.0,0.4391,0.0846,0.0285
50%,,73.0,,,,,,160.0,,,...,,,0.0,,2.0,0.0,25.0,0.6378,0.1601,0.0471
75%,,80.0,,,,,,180.0,,,...,,,0.0,,3.0,0.0,27.0,0.83675,0.3192,0.1131


In [240]:
# Find the number of remaining na values in each column
for col in stroke_trials_df.columns:
    if stroke_trials_df[col].isna().sum() > 0:
        print(f'{col}: {stroke_trials_df[col].isna().sum()}')

RATRIAL: 984
RHEP24: 344
ONDRUG: 1
DOAC: 18
DGORM: 23
DSTER: 28
DCAA: 29
DHAEMD: 28
DMAJNCH: 14
DDIAGISC: 23
DDIAGHA: 26
DDIAGUN: 23
DNOSTRK: 26
DRSISC: 18
DRSH: 15
DPE: 14
DDEAD: 19


In [241]:
# View the rows that have NaN values in RHEP24 column and see how many have 'N', 'U', or NaN values in the other heparin columns (RXHEP, DLH14, DHH14, DMH14, DSCH, DIVH)
rhep24 = stroke_trials_df.loc[stroke_trials_df['RHEP24'].isnull()]
rhep24[['RXHEP','DLH14','DHH14','DMH14','DSCH','DIVH']].value_counts()

RXHEP  DLH14  DHH14  DMH14  DSCH  DIVH
N      N      N      U      U     U       147
L      Y      N      U      U     U        61
H      N      Y      U      U     U        58
N      N      N      U      N     N        23
L      Y      N      U      N     N        13
H      N      N      U      U     U        11
              Y      U      N     N        10
L      N      N      U      U     U         7
H      N      N      U      N     N         5
       Y      N      U      N     N         2
L      N      Y      U      N     N         1
       U      N      U      U     U         1
       N      N      U      N     U         1
                                  N         1
       Y      N      U      Y     N         1
H      Y      N      U      U     U         1
N      N      N      U      Y     Y         1
Name: count, dtype: int64

There are 173 rows that we can use if we save the NaN rows in the 'RHEP24' column (147 + 23 + 1 + 1 + 1)

In [242]:
# Fill na values in RHEP24 column with 'U'
stroke_trials_df['RHEP24'] = stroke_trials_df['RHEP24'].fillna('U')

In [243]:
# Drop all rows with na values
stroke_trials_dropna_df = stroke_trials_df.dropna()
stroke_trials_dropna_df.describe(include='all')

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RHEP24,RSBP,RCONSC,RDEF1,...,DPLACE,DDEAD,DDEADC,FPLACE,OCCODE,FDEADC,CNTRYNUM,EXPDD,EXPD6,EXPD14
count,18405,18405.0,18405,18405,18405,18405,18405,18405.0,18405,18405,...,18405,18405,18405.0,18405,18405.0,18405.0,18405.0,18405.0,18405.0,18405.0
unique,2,,2,2,2,2,2,,3,3,...,6,3,,6,,,,,,
top,M,,N,N,Y,N,N,,F,Y,...,U,N,,A,,,,,,
freq,9825,,13011,15243,12372,12382,17977,,14112,13363,...,9058,16481,,10956,,,,,,
mean,,71.802934,,,,,,160.213583,,,...,,,0.287313,,2.327574,0.775061,21.050801,0.631258,0.231408,0.095004
std,,11.609051,,,,,,27.724123,,,...,,,1.08272,,1.068661,1.830083,8.437599,0.230384,0.195796,0.105377
min,,16.0,,,,,,70.0,,,...,,,0.0,,0.0,0.0,1.0,0.0891,0.0035,0.005
25%,,65.0,,,,,,140.0,,,...,,,0.0,,2.0,0.0,14.0,0.4415,0.085,0.0286
50%,,73.0,,,,,,160.0,,,...,,,0.0,,2.0,0.0,25.0,0.641,0.1613,0.0473
75%,,80.0,,,,,,180.0,,,...,,,0.0,,3.0,0.0,27.0,0.8389,0.3222,0.1143


In [244]:
# Filter out rows that have 'Y' in the RHEP24 column
stroke_trials_rhep24_df = stroke_trials_dropna_df[stroke_trials_dropna_df['RHEP24'] != 'Y']
stroke_trials_rhep24_df = stroke_trials_rhep24_df.reset_index(drop=True)
stroke_trials_rhep24_df.describe(include='all')

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RHEP24,RSBP,RCONSC,RDEF1,...,DPLACE,DDEAD,DDEADC,FPLACE,OCCODE,FDEADC,CNTRYNUM,EXPDD,EXPD6,EXPD14
count,17977,17977.0,17977,17977,17977,17977,17977,17977.0,17977,17977,...,17977,17977,17977.0,17977,17977.0,17977.0,17977.0,17977.0,17977.0,17977.0
unique,2,,2,2,2,2,1,,3,3,...,6,3,,6,,,,,,
top,M,,N,N,Y,N,N,,F,Y,...,U,N,,A,,,,,,
freq,9627,,12687,14912,12039,12115,17977,,13784,13034,...,8858,16095,,10694,,,,,,
mean,,71.787006,,,,,,160.310174,,,...,,,0.287367,,2.328865,0.7736,21.154086,0.630842,0.231241,0.094947
std,,11.623209,,,,,,27.749877,,,...,,,1.082168,,1.070754,1.827973,8.415366,0.230713,0.195795,0.10534
min,,16.0,,,,,,70.0,,,...,,,0.0,,0.0,0.0,1.0,0.0891,0.0035,0.005
25%,,65.0,,,,,,140.0,,,...,,,0.0,,2.0,0.0,14.0,0.4402,0.0849,0.0285
50%,,73.0,,,,,,160.0,,,...,,,0.0,,2.0,0.0,25.0,0.6404,0.1613,0.0473
75%,,80.0,,,,,,180.0,,,...,,,0.0,,3.0,0.0,27.0,0.8386,0.3218,0.1142


In [245]:
# Filter out rows that have 'Y' in the RXHEP column
stroke_trials_rxhep_df = stroke_trials_rhep24_df[stroke_trials_rhep24_df['RXHEP'] != 'Y']
stroke_trials_rxhep_df = stroke_trials_rxhep_df.drop(columns=['RXHEP'])
stroke_trials_rxhep_df.describe(include='all')

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RHEP24,RSBP,RCONSC,RDEF1,...,DPLACE,DDEAD,DDEADC,FPLACE,OCCODE,FDEADC,CNTRYNUM,EXPDD,EXPD6,EXPD14
count,17977,17977.0,17977,17977,17977,17977,17977,17977.0,17977,17977,...,17977,17977,17977.0,17977,17977.0,17977.0,17977.0,17977.0,17977.0,17977.0
unique,2,,2,2,2,2,1,,3,3,...,6,3,,6,,,,,,
top,M,,N,N,Y,N,N,,F,Y,...,U,N,,A,,,,,,
freq,9627,,12687,14912,12039,12115,17977,,13784,13034,...,8858,16095,,10694,,,,,,
mean,,71.787006,,,,,,160.310174,,,...,,,0.287367,,2.328865,0.7736,21.154086,0.630842,0.231241,0.094947
std,,11.623209,,,,,,27.749877,,,...,,,1.082168,,1.070754,1.827973,8.415366,0.230713,0.195795,0.10534
min,,16.0,,,,,,70.0,,,...,,,0.0,,0.0,0.0,1.0,0.0891,0.0035,0.005
25%,,65.0,,,,,,140.0,,,...,,,0.0,,2.0,0.0,14.0,0.4402,0.0849,0.0285
50%,,73.0,,,,,,160.0,,,...,,,0.0,,2.0,0.0,25.0,0.6404,0.1613,0.0473
75%,,80.0,,,,,,180.0,,,...,,,0.0,,3.0,0.0,27.0,0.8386,0.3218,0.1142


In [246]:
# Filter out rows that have 'Y' in the DLH14 column
stroke_trials_dlh14_df = stroke_trials_rxhep_df[stroke_trials_rxhep_df['DLH14'] != 'Y']
stroke_trials_dlh14_df = stroke_trials_dlh14_df.drop(columns=['DLH14'])
stroke_trials_dlh14_df.describe(include='all')

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RHEP24,RSBP,RCONSC,RDEF1,...,DPLACE,DDEAD,DDEADC,FPLACE,OCCODE,FDEADC,CNTRYNUM,EXPDD,EXPD6,EXPD14
count,13768,13768.0,13768,13768,13768,13768,13768,13768.0,13768,13768,...,13768,13768,13768.0,13768,13768.0,13768.0,13768.0,13768.0,13768.0,13768.0
unique,2,,2,2,2,2,1,,3,3,...,6,3,,6,,,,,,
top,M,,N,N,Y,N,N,,F,Y,...,U,N,,A,,,,,,
freq,7360,,9718,11397,9151,9259,13768,,10552,9974,...,6799,12305,,8159,,,,,,
mean,,71.765035,,,,,,160.071252,,,...,,,0.291618,,2.324303,0.776801,21.202571,0.631075,0.231324,0.095039
std,,11.644272,,,,,,27.720367,,,...,,,1.088853,,1.068681,1.828388,8.396093,0.230859,0.195794,0.105339
min,,19.0,,,,,,71.0,,,...,,,0.0,,0.0,0.0,1.0,0.0891,0.004,0.0051
25%,,65.0,,,,,,140.0,,,...,,,0.0,,2.0,0.0,14.0,0.4411,0.085,0.0285
50%,,73.0,,,,,,160.0,,,...,,,0.0,,2.0,0.0,25.0,0.63975,0.16145,0.04735
75%,,80.0,,,,,,180.0,,,...,,,0.0,,3.0,0.0,27.0,0.8389,0.322025,0.1146


In [247]:
# Filer out rows that have 'Y' in the DMH14 column
stroke_trials_dmh14_df = stroke_trials_dlh14_df[stroke_trials_dlh14_df['DMH14'] != 'Y']
stroke_trials_dmh14_df = stroke_trials_dmh14_df.drop(columns=['DMH14'])
stroke_trials_dmh14_df.describe(include='all')

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RHEP24,RSBP,RCONSC,RDEF1,...,DPLACE,DDEAD,DDEADC,FPLACE,OCCODE,FDEADC,CNTRYNUM,EXPDD,EXPD6,EXPD14
count,9783,9783.0,9783,9783,9783,9783,9783,9783.0,9783,9783,...,9783,9783,9783.0,9783,9783.0,9783.0,9783.0,9783.0,9783.0,9783.0
unique,2,,2,2,2,2,1,,3,3,...,6,3,,6,,,,,,
top,M,,N,N,Y,N,N,,F,Y,...,U,N,,A,,,,,,
freq,5203,,6915,8094,6386,6658,9783,,7490,7076,...,4812,8745,,5751,,,,,,
mean,,71.780435,,,,,,160.170807,,,...,,,0.284984,,2.317387,0.768169,21.296228,0.633252,0.232819,0.095676
std,,11.709731,,,,,,28.000386,,,...,,,1.070862,,1.069145,1.814777,8.363065,0.231238,0.196266,0.10556
min,,19.0,,,,,,71.0,,,...,,,0.0,,0.0,0.0,1.0,0.0891,0.004,0.0051
25%,,65.0,,,,,,140.0,,,...,,,0.0,,2.0,0.0,14.0,0.44155,0.0855,0.0286
50%,,74.0,,,,,,160.0,,,...,,,0.0,,2.0,0.0,25.0,0.6452,0.1627,0.048
75%,,80.0,,,,,,180.0,,,...,,,0.0,,3.0,0.0,27.0,0.84155,0.3252,0.1161


In [248]:
# Filter out rows that have 'Y' in the DHH14 column
stroke_trials_dhh14_df = stroke_trials_dmh14_df[stroke_trials_dmh14_df['DHH14'] != 'Y']
stroke_trials_dhh14_df = stroke_trials_dhh14_df.drop(columns=['DHH14'])
stroke_trials_dhh14_df.describe(include='all')

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RHEP24,RSBP,RCONSC,RDEF1,...,DPLACE,DDEAD,DDEADC,FPLACE,OCCODE,FDEADC,CNTRYNUM,EXPDD,EXPD6,EXPD14
count,9783,9783.0,9783,9783,9783,9783,9783,9783.0,9783,9783,...,9783,9783,9783.0,9783,9783.0,9783.0,9783.0,9783.0,9783.0,9783.0
unique,2,,2,2,2,2,1,,3,3,...,6,3,,6,,,,,,
top,M,,N,N,Y,N,N,,F,Y,...,U,N,,A,,,,,,
freq,5203,,6915,8094,6386,6658,9783,,7490,7076,...,4812,8745,,5751,,,,,,
mean,,71.780435,,,,,,160.170807,,,...,,,0.284984,,2.317387,0.768169,21.296228,0.633252,0.232819,0.095676
std,,11.709731,,,,,,28.000386,,,...,,,1.070862,,1.069145,1.814777,8.363065,0.231238,0.196266,0.10556
min,,19.0,,,,,,71.0,,,...,,,0.0,,0.0,0.0,1.0,0.0891,0.004,0.0051
25%,,65.0,,,,,,140.0,,,...,,,0.0,,2.0,0.0,14.0,0.44155,0.0855,0.0286
50%,,74.0,,,,,,160.0,,,...,,,0.0,,2.0,0.0,25.0,0.6452,0.1627,0.048
75%,,80.0,,,,,,180.0,,,...,,,0.0,,3.0,0.0,27.0,0.84155,0.3252,0.1161


In [249]:
# Filter out rows that have 'Y' in the DSCH column
stroke_trials_dsch_df = stroke_trials_dhh14_df[stroke_trials_dhh14_df['DSCH'] != 'Y']
stroke_trials_dsch_df = stroke_trials_dsch_df.drop(columns=['DSCH'])
stroke_trials_dsch_df.describe(include='all')

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RHEP24,RSBP,RCONSC,RDEF1,...,DPLACE,DDEAD,DDEADC,FPLACE,OCCODE,FDEADC,CNTRYNUM,EXPDD,EXPD6,EXPD14
count,9446,9446.0,9446,9446,9446,9446,9446,9446.0,9446,9446,...,9446,9446,9446.0,9446,9446.0,9446.0,9446.0,9446.0,9446.0,9446.0
unique,2,,2,2,2,2,1,,3,3,...,6,3,,6,,,,,,
top,M,,N,N,Y,N,N,,F,Y,...,U,N,,A,,,,,,
freq,5038,,6669,7823,6139,6433,9446,,7260,6805,...,4608,8447,,5568,,,,,,
mean,,71.780224,,,,,,160.222316,,,...,,,0.279166,,2.322782,0.759793,21.362799,0.631587,0.231961,0.095162
std,,11.719854,,,,,,28.011635,,,...,,,1.054313,,1.070359,1.802121,8.355987,0.231559,0.196209,0.105461
min,,19.0,,,,,,71.0,,,...,,,0.0,,0.0,0.0,1.0,0.0891,0.004,0.0051
25%,,65.0,,,,,,140.0,,,...,,,0.0,,2.0,0.0,14.0,0.4398,0.085,0.0284
50%,,74.0,,,,,,160.0,,,...,,,0.0,,2.0,0.0,25.0,0.64225,0.1617,0.0476
75%,,80.0,,,,,,180.0,,,...,,,0.0,,3.0,0.0,27.0,0.840175,0.3228,0.114775


In [250]:
# Filter out rows that have 'Y' in the DIVH column
stroke_trials_divh_df = stroke_trials_dsch_df[stroke_trials_dsch_df['DIVH'] != 'Y']
stroke_trials_divh_df = stroke_trials_divh_df.drop(columns=['DIVH'])
stroke_trials_divh_df.describe(include='all')

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RHEP24,RSBP,RCONSC,RDEF1,...,DPLACE,DDEAD,DDEADC,FPLACE,OCCODE,FDEADC,CNTRYNUM,EXPDD,EXPD6,EXPD14
count,9198,9198.0,9198,9198,9198,9198,9198,9198.0,9198,9198,...,9198,9198,9198.0,9198,9198.0,9198.0,9198.0,9198.0,9198.0,9198.0
unique,2,,2,2,2,2,1,,3,3,...,6,3,,6,,,,,,
top,M,,N,N,Y,N,N,,F,Y,...,U,N,,A,,,,,,
freq,4893,,6486,7620,5938,6282,9198,,7057,6623,...,4466,8221,,5420,,,,,,
mean,,71.887041,,,,,,160.256143,,,...,,,0.279082,,2.323222,0.76234,21.373451,0.632587,0.233059,0.095631
std,,11.659757,,,,,,28.077996,,,...,,,1.052632,,1.071138,1.804109,8.329507,0.231599,0.196902,0.105938
min,,19.0,,,,,,71.0,,,...,,,0.0,,0.0,0.0,1.0,0.0891,0.004,0.0051
25%,,65.0,,,,,,140.0,,,...,,,0.0,,2.0,0.0,14.0,0.440625,0.0854,0.0284
50%,,74.0,,,,,,160.0,,,...,,,0.0,,2.0,0.0,25.0,0.6431,0.1622,0.0478
75%,,81.0,,,,,,180.0,,,...,,,0.0,,3.0,0.0,27.0,0.841475,0.325175,0.1156


In [251]:
# Filter out rows that have 'Y' in the DHAEMD column
stroke_trials_dhaemd_df = stroke_trials_divh_df[stroke_trials_divh_df['DHAEMD'] != 'Y']
stroke_trials_dhaemd_df = stroke_trials_dhaemd_df.drop(columns=['DHAEMD'])
stroke_trials_dhaemd_df.describe(include='all')

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RHEP24,RSBP,RCONSC,RDEF1,...,DPLACE,DDEAD,DDEADC,FPLACE,OCCODE,FDEADC,CNTRYNUM,EXPDD,EXPD6,EXPD14
count,8925,8925.0,8925,8925,8925,8925,8925,8925.0,8925,8925,...,8925,8925,8925.0,8925,8925.0,8925.0,8925.0,8925.0,8925.0,8925.0
unique,2,,2,2,2,2,1,,3,3,...,6,3,,6,,,,,,
top,M,,N,N,Y,N,N,,F,Y,...,U,N,,A,,,,,,
freq,4750,,6312,7392,5717,6097,8925,,6863,6401,...,4334,7981,,5250,,,,,,
mean,,71.967619,,,,,,160.238319,,,...,,,0.279216,,2.321008,0.75944,21.639328,0.632737,0.233161,0.095437
std,,11.64186,,,,,,28.090209,,,...,,,1.056038,,1.069802,1.799525,8.125983,0.231502,0.196667,0.105806
min,,19.0,,,,,,71.0,,,...,,,0.0,,0.0,0.0,1.0,0.0891,0.004,0.0051
25%,,65.0,,,,,,140.0,,,...,,,0.0,,2.0,0.0,15.0,0.4407,0.0856,0.0284
50%,,74.0,,,,,,160.0,,,...,,,0.0,,2.0,0.0,25.0,0.6443,0.1629,0.0478
75%,,81.0,,,,,,180.0,,,...,,,0.0,,3.0,0.0,27.0,0.8416,0.3248,0.1153


In [252]:
# Filter out rows that have 'Y' in the DCAREND column
stroke_trials_dcarend_df = stroke_trials_dhaemd_df[stroke_trials_dhaemd_df['DCAREND'] != 'Y']
stroke_trials_dcarend_df = stroke_trials_dcarend_df.drop(columns=['DCAREND'])
stroke_trials_dcarend_df.describe(include='all')

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RHEP24,RSBP,RCONSC,RDEF1,...,DPLACE,DDEAD,DDEADC,FPLACE,OCCODE,FDEADC,CNTRYNUM,EXPDD,EXPD6,EXPD14
count,8912,8912.0,8912,8912,8912,8912,8912,8912.0,8912,8912,...,8912,8912,8912.0,8912,8912.0,8912.0,8912.0,8912.0,8912.0,8912.0
unique,2,,2,2,2,2,1,,3,3,...,6,3,,6,,,,,,
top,M,,N,N,Y,N,N,,F,Y,...,U,N,,A,,,,,,
freq,4741,,6301,7379,5708,6088,8912,,6850,6394,...,4329,7968,,5240,,,,,,
mean,,71.971836,,,,,,160.224192,,,...,,,0.279623,,2.320467,0.760548,21.64531,0.633002,0.233344,0.095528
std,,11.640598,,,,,,28.102942,,,...,,,1.056754,,1.070133,1.800604,8.126827,0.231453,0.196736,0.105854
min,,19.0,,,,,,71.0,,,...,,,0.0,,0.0,0.0,1.0,0.0891,0.004,0.0051
25%,,65.0,,,,,,140.0,,,...,,,0.0,,2.0,0.0,15.0,0.441175,0.0857,0.0285
50%,,74.0,,,,,,160.0,,,...,,,0.0,,2.0,0.0,25.0,0.64455,0.163,0.0478
75%,,81.0,,,,,,180.0,,,...,,,0.0,,3.0,0.0,27.0,0.841725,0.325125,0.1154


In [253]:
# Filter out rows that have 'Y' in the DTHROMB column
stroke_trials_dthromb_df = stroke_trials_dcarend_df[stroke_trials_dcarend_df['DTHROMB'] != 'Y']
stroke_trials_remove_hep_df = stroke_trials_dthromb_df.drop(columns=['DTHROMB'])
stroke_trials_remove_hep_df.describe(include='all')

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RHEP24,RSBP,RCONSC,RDEF1,...,DPLACE,DDEAD,DDEADC,FPLACE,OCCODE,FDEADC,CNTRYNUM,EXPDD,EXPD6,EXPD14
count,8907,8907.0,8907,8907,8907,8907,8907,8907.0,8907,8907,...,8907,8907,8907.0,8907,8907.0,8907.0,8907.0,8907.0,8907.0,8907.0
unique,2,,2,2,2,2,1,,3,3,...,6,3,,6,,,,,,
top,M,,N,N,Y,N,N,,F,Y,...,U,N,,A,,,,,,
freq,4740,,6297,7377,5704,6084,8907,,6846,6390,...,4328,7963,,5235,,,,,,
mean,,71.970809,,,,,,160.227686,,,...,,,0.27978,,2.32031,0.760975,21.647693,0.632961,0.233323,0.09552
std,,11.64218,,,,,,28.108205,,,...,,,1.05703,,1.070245,1.801019,8.126888,0.231457,0.196755,0.10586
min,,19.0,,,,,,71.0,,,...,,,0.0,,0.0,0.0,1.0,0.0891,0.004,0.0051
25%,,65.0,,,,,,140.0,,,...,,,0.0,,2.0,0.0,15.0,0.44115,0.0857,0.0285
50%,,74.0,,,,,,160.0,,,...,,,0.0,,2.0,0.0,25.0,0.6444,0.1629,0.0478
75%,,81.0,,,,,,180.0,,,...,,,0.0,,3.0,0.0,27.0,0.84165,0.3251,0.1154


In [254]:
# Create dataFrames of target variables
target_14_days_df = stroke_trials_remove_hep_df[['DALIVE', 'DPLACE', 'DDEAD', 'DDEADC']].copy()
target_6_months_df = stroke_trials_remove_hep_df[['FPLACE', 'OCCODE', 'FDEADC']].copy()
target_14_days_df.head()

Unnamed: 0,DALIVE,DPLACE,DDEAD,DDEADC
1,N,U,Y,4.0
2,Y,A,N,0.0
4,N,U,N,0.0
5,N,U,Y,4.0
6,Y,A,N,0.0


In [255]:
# See the value counts of each column in the target_14_days_df
for col in target_14_days_df.columns:
    print(f"{target_14_days_df[col].value_counts()}")

DALIVE
Y    4791
N    4105
U      11
Name: count, dtype: int64
DPLACE
U    4328
A    3325
E     735
D     261
C     137
B     121
Name: count, dtype: int64
DDEAD
N    7963
Y     938
U       6
Name: count, dtype: int64
DDEADC
0.0    7969
1.0     472
4.0     167
2.0     106
5.0      71
6.0      41
7.0      39
8.0      28
3.0      14
Name: count, dtype: int64


- Drop rows with 'U' in DALIVE and DDEAD columns (only small number of rows)
- Decide what to do with DPLACE and DDEADC columns (drop DPLACE column, drop DDEADC column, drop 'U' rows in DPLACE, drop '0' rows in DDEADC, or a combination of those) - Going to drop both columns for now. 

In [256]:
# View the DPLACE, DALIVE, and DDEADC value counts for rows that have 'U' in the DDEAD column
ddead = stroke_trials_remove_hep_df[stroke_trials_remove_hep_df['DDEAD'] == 'U']
ddead[['DPLACE','DALIVE','DDEADC']].value_counts()

DPLACE  DALIVE  DDEADC
U       U       0.0       6
Name: count, dtype: int64

In [257]:
# Drop rows that have 'U' in the DALIVE and DDEAD columns
stroke_trials_remove_hep_df = stroke_trials_remove_hep_df[(stroke_trials_remove_hep_df['DALIVE'] != 'U') & (stroke_trials_remove_hep_df['DDEAD'] != 'U')]
stroke_trials_remove_hep_df.describe(include='all')

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RHEP24,RSBP,RCONSC,RDEF1,...,DPLACE,DDEAD,DDEADC,FPLACE,OCCODE,FDEADC,CNTRYNUM,EXPDD,EXPD6,EXPD14
count,8896,8896.0,8896,8896,8896,8896,8896,8896.0,8896,8896,...,8896,8896,8896.0,8896,8896.0,8896.0,8896.0,8896.0,8896.0,8896.0
unique,2,,2,2,2,2,1,,3,3,...,6,2,,6,,,,,,
top,M,,N,N,Y,N,N,,F,Y,...,U,N,,A,,,,,,
freq,4733,,6289,7368,5699,6076,8896,,6839,6384,...,4317,7963,,5232,,,,,,
mean,,71.971111,,,,,,160.224483,,,...,,,0.279227,,2.319469,0.761016,21.639501,0.632939,0.233244,0.095459
std,,11.64416,,,,,,28.103412,,,...,,,1.056812,,1.065693,1.801689,8.127821,0.231377,0.196667,0.105771
min,,19.0,,,,,,71.0,,,...,,,0.0,,0.0,0.0,1.0,0.0891,0.004,0.0051
25%,,65.0,,,,,,140.0,,,...,,,0.0,,2.0,0.0,15.0,0.441275,0.0857,0.0285
50%,,74.0,,,,,,160.0,,,...,,,0.0,,2.0,0.0,25.0,0.64435,0.1629,0.0478
75%,,81.0,,,,,,180.0,,,...,,,0.0,,3.0,0.0,27.0,0.8416,0.3248,0.1154


In [258]:
# Drop DPLACE and DDEADC
stroke_trials_remove_hep_df = stroke_trials_remove_hep_df.drop(columns=['DPLACE', 'DDEADC'])

In [259]:
# See the value counts of each column in the target_6_months_df
for col in target_6_months_df.columns:
    print(f"{target_6_months_df[col].value_counts()}")

FPLACE
A    5235
U    2109
D     618
B     402
C     337
E     206
Name: count, dtype: int64
OCCODE
2    3636
1    1954
3    1774
4    1469
0      47
9      27
Name: count, dtype: int64
FDEADC
0.0    6892
1.0     737
4.0     394
2.0     256
7.0     219
5.0     179
8.0     119
6.0      80
3.0      31
Name: count, dtype: int64


- Drop rows with '0' or '9' in OCCODE column (only small number of rows)
- Decide what to do with FPLACE and FDEADC columns (drop FPLACE column, drop FDEADC column, drop rows with 'U' in FPLACE, drop rows with '0' in FDEADC, or combination of those) - Going to drop both columns for now.

In [260]:
# View the FPLACE and OCCODE value counts for rows that have 0 in the FDEADC column
fdeadc = stroke_trials_remove_hep_df[stroke_trials_remove_hep_df['FDEADC'] == 0]
fdeadc[['FPLACE','OCCODE']].value_counts()

FPLACE  OCCODE
A       2         2250
        3         1627
        4         1355
D       2          601
B       2          262
C       2          250
E       2          192
B       3           85
        4           55
C       3           47
U       0           47
C       4           40
U       9           25
        2           13
D       4           10
E       4            8
D       3            7
E       3            6
U       1            5
        4            1
Name: count, dtype: int64

In [261]:
# Drop rows with '0' in the OCCODE column
stroke_trials_remove_hep_df = stroke_trials_remove_hep_df[(stroke_trials_remove_hep_df['OCCODE'] != 0) & (stroke_trials_remove_hep_df['OCCODE'] != 9)]
stroke_trials_remove_hep_df['OCCODE'].nunique()

4

In [262]:
# Drop FPLACE and FDEADC columns
stroke_trials_remove_hep_df = stroke_trials_remove_hep_df.drop(columns=['FPLACE', 'FDEADC'])

In [263]:
stroke_trials_remove_hep_df.columns

Index(['SEX', 'AGE', 'RSLEEP', 'RATRIAL', 'RCT', 'RVISINF', 'RHEP24', 'RSBP',
       'RCONSC', 'RDEF1', 'RDEF2', 'RDEF3', 'RDEF4', 'RDEF5', 'RDEF6', 'RDEF7',
       'RDEF8', 'STYPE', 'RDATE', 'RXASP', 'DASP14', 'DASPLT', 'ONDRUG', 'DAP',
       'DOAC', 'DGORM', 'DSTER', 'DCAA', 'DMAJNCH', 'DDIAGISC', 'DDIAGHA',
       'DDIAGUN', 'DNOSTRK', 'DRSISC', 'DRSH', 'DRSUNK', 'DPE', 'DALIVE',
       'DDEAD', 'OCCODE', 'CNTRYNUM', 'EXPDD', 'EXPD6', 'EXPD14'],
      dtype='object')

Potential Later Options:
- Add DALIVE to six_month_df
- Create fourteen_day_df with DDEAD, DALIVE, DDEADC, or DPLACE as the y
- Replace OCCODE with FPLACE or FDEADC as the y

In [264]:
# Get important columns for the model
six_month_df = stroke_trials_remove_hep_df[['SEX', 'AGE', 'RSLEEP', 'RATRIAL', 'RCT', 'RVISINF', 'RHEP24', 'RSBP',
       'RCONSC', 'RDEF1', 'RDEF2', 'RDEF3', 'RDEF4', 'RDEF5', 'RDEF6', 'RDEF7',
       'RDEF8', 'STYPE', 'RDATE', 'RXASP', 'DASP14', 'DASPLT', 'ONDRUG', 'DAP',
       'DOAC', 'DGORM', 'DSTER', 'DCAA', 'DMAJNCH', 'DDIAGISC', 'DDIAGHA',
       'DDIAGUN', 'DNOSTRK', 'DRSISC', 'DRSH', 'DRSUNK', 'DPE', 'CNTRYNUM', 'OCCODE']]
six_month_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8824 entries, 1 to 17975
Data columns (total 39 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SEX       8824 non-null   object 
 1   AGE       8824 non-null   int64  
 2   RSLEEP    8824 non-null   object 
 3   RATRIAL   8824 non-null   object 
 4   RCT       8824 non-null   object 
 5   RVISINF   8824 non-null   object 
 6   RHEP24    8824 non-null   object 
 7   RSBP      8824 non-null   int64  
 8   RCONSC    8824 non-null   object 
 9   RDEF1     8824 non-null   object 
 10  RDEF2     8824 non-null   object 
 11  RDEF3     8824 non-null   object 
 12  RDEF4     8824 non-null   object 
 13  RDEF5     8824 non-null   object 
 14  RDEF6     8824 non-null   object 
 15  RDEF7     8824 non-null   object 
 16  RDEF8     8824 non-null   object 
 17  STYPE     8824 non-null   object 
 18  RDATE     8824 non-null   object 
 19  RXASP     8824 non-null   object 
 20  DASP14    8824 non-null   object 


In [265]:
six_month_df.head()

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RHEP24,RSBP,RCONSC,RDEF1,...,DDIAGISC,DDIAGHA,DDIAGUN,DNOSTRK,DRSISC,DRSH,DRSUNK,DPE,CNTRYNUM,OCCODE
1,F,73,Y,N,Y,N,N,120,F,Y,...,Y,N,N,N,N,N,N,N,27,1
2,M,74,N,N,Y,Y,N,160,F,Y,...,Y,N,N,N,N,N,N,N,14,2
4,M,54,N,N,Y,N,N,160,F,Y,...,Y,N,N,N,N,N,N,N,27,2
5,F,79,N,N,N,N,N,175,D,Y,...,N,Y,N,N,N,N,N,N,27,1
6,M,80,N,Y,Y,Y,N,200,F,Y,...,Y,N,N,N,N,N,N,N,14,2


In [266]:
# Filter object columns
columns_to_encode = six_month_df.columns[six_month_df.dtypes == 'object']
columns_to_encode

Index(['SEX', 'RSLEEP', 'RATRIAL', 'RCT', 'RVISINF', 'RHEP24', 'RCONSC',
       'RDEF1', 'RDEF2', 'RDEF3', 'RDEF4', 'RDEF5', 'RDEF6', 'RDEF7', 'RDEF8',
       'STYPE', 'RDATE', 'RXASP', 'DASP14', 'DASPLT', 'DAP', 'DOAC', 'DGORM',
       'DSTER', 'DCAA', 'DMAJNCH', 'DDIAGISC', 'DDIAGHA', 'DDIAGUN', 'DNOSTRK',
       'DRSISC', 'DRSH', 'DRSUNK', 'DPE'],
      dtype='object')

In [267]:
# View sample data in columns to encode
six_month_df[['RDEF2', 'RDEF3', 'RDEF4', 'RDEF5', 'RDEF6', 'RDEF7', 'RDEF8',
       'STYPE', 'RDATE', 'RXASP', 'DASP14', 'DASPLT', 'DAP', 'DOAC', 'DGORM',
       'DSTER', 'DCAA']].head()

Unnamed: 0,RDEF2,RDEF3,RDEF4,RDEF5,RDEF6,RDEF7,RDEF8,STYPE,RDATE,RXASP,DASP14,DASPLT,DAP,DOAC,DGORM,DSTER,DCAA
1,Y,Y,N,N,N,N,N,LACS,1993-03,N,N,N,N,N,N,N,N
2,Y,N,Y,N,N,N,N,PACS,1993-03,N,Y,Y,Y,N,N,N,N
4,N,N,N,N,N,Y,N,POCS,1993-03,Y,Y,Y,N,N,N,N,Y
5,Y,Y,Y,Y,Y,N,N,TACS,1993-03,N,N,N,N,N,N,N,N
6,Y,Y,Y,N,N,N,N,PACS,1993-03,N,N,Y,N,N,Y,N,Y


In [268]:
#Transform the RDATE column to datetime
pd.options.mode.copy_on_write = True
six_month_df['RDATE'] = pd.to_datetime(six_month_df['RDATE'], format='%Y-%m', errors='coerce')
six_month_df['RDATE'].value_counts(dropna=False)

RDATE
1995-11-01    320
1995-10-01    311
1995-12-01    299
1996-02-01    276
1996-01-01    274
1996-03-01    263
1995-03-01    261
1994-12-01    260
1994-03-01    258
1995-05-01    256
1994-11-01    254
1995-04-01    253
1995-08-01    251
1995-09-01    251
1994-05-01    249
1994-10-01    246
1995-01-01    242
1994-06-01    241
1994-04-01    238
1994-09-01    238
1995-07-01    232
1995-02-01    231
1994-01-01    230
1994-07-01    228
1994-02-01    226
1994-08-01    226
1996-05-01    219
1996-04-01    218
1995-06-01    211
1993-11-01    206
1993-12-01    203
1993-10-01    182
1993-09-01    169
1993-08-01    160
1993-07-01    146
1993-06-01    144
1993-05-01    142
1993-04-01    123
1993-03-01     87
Name: count, dtype: int64

In [269]:
# Update columns_to_encode variable
#columns_to_encode = six_month_df.columns[six_month_df.dtypes == 'object']
columns_to_encode

Index(['SEX', 'RSLEEP', 'RATRIAL', 'RCT', 'RVISINF', 'RHEP24', 'RCONSC',
       'RDEF1', 'RDEF2', 'RDEF3', 'RDEF4', 'RDEF5', 'RDEF6', 'RDEF7', 'RDEF8',
       'STYPE', 'RDATE', 'RXASP', 'DASP14', 'DASPLT', 'DAP', 'DOAC', 'DGORM',
       'DSTER', 'DCAA', 'DMAJNCH', 'DDIAGISC', 'DDIAGHA', 'DDIAGUN', 'DNOSTRK',
       'DRSISC', 'DRSH', 'DRSUNK', 'DPE'],
      dtype='object')

In [270]:
# Create an instance of the label encoder
le = LabelEncoder()

# Fit and transform the label encoder for each column
for column in columns_to_encode:  
    six_month_df[column] = le.fit_transform(six_month_df[column])

six_month_df.head()

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RHEP24,RSBP,RCONSC,RDEF1,...,DDIAGISC,DDIAGHA,DDIAGUN,DNOSTRK,DRSISC,DRSH,DRSUNK,DPE,CNTRYNUM,OCCODE
1,0,73,1,0,1,0,0,120,1,2,...,1,0,0,0,0,0,0,0,27,1
2,1,74,0,0,1,1,0,160,1,2,...,1,0,0,0,0,0,0,0,14,2
4,1,54,0,0,1,0,0,160,1,2,...,1,0,0,0,0,0,0,0,27,2
5,0,79,0,0,0,0,0,175,0,2,...,0,1,0,0,0,0,0,0,27,1
6,1,80,0,1,1,1,0,200,1,2,...,1,0,0,0,0,0,0,0,14,2


In [271]:
# Check if there are any columns left to encode
six_month_df.columns[six_month_df.dtypes == 'object']

Index([], dtype='object')

In [272]:
six_month_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8824 entries, 1 to 17975
Data columns (total 39 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SEX       8824 non-null   int32  
 1   AGE       8824 non-null   int64  
 2   RSLEEP    8824 non-null   int32  
 3   RATRIAL   8824 non-null   int32  
 4   RCT       8824 non-null   int32  
 5   RVISINF   8824 non-null   int32  
 6   RHEP24    8824 non-null   int32  
 7   RSBP      8824 non-null   int64  
 8   RCONSC    8824 non-null   int32  
 9   RDEF1     8824 non-null   int32  
 10  RDEF2     8824 non-null   int32  
 11  RDEF3     8824 non-null   int32  
 12  RDEF4     8824 non-null   int32  
 13  RDEF5     8824 non-null   int32  
 14  RDEF6     8824 non-null   int32  
 15  RDEF7     8824 non-null   int32  
 16  RDEF8     8824 non-null   int32  
 17  STYPE     8824 non-null   int32  
 18  RDATE     8824 non-null   int64  
 19  RXASP     8824 non-null   int32  
 20  DASP14    8824 non-null   int32  


In [273]:
# Create csv files for the six month data
six_month_df.to_csv('Resources/six_month_data.csv', index=False)

In [274]:
# Create dataFrame of doctor's predicted probabilities
doctor_pred_probs_df = stroke_trials_remove_hep_df[['EXPDD','EXPD6']].copy()
doctor_pred_probs_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8824 entries, 1 to 17975
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   EXPDD   8824 non-null   float64
 1   EXPD6   8824 non-null   float64
dtypes: float64(2)
memory usage: 206.8 KB


In [275]:
# Create csv file for doctor's predicted probabilities
doctor_pred_probs_df.to_csv('Resources/doctor_6month_pred.csv', index=False)