In [1]:
import pandas as pd
import numpy as np

## Reading in Data

In [2]:
# Directory holding the training CSVs. Kept in one place so the notebook can be
# pointed at another checkout by editing a single line.
DATA_DIR = '~/git/GeorgiaTech/cse6250/903178639-vla6-hw1/data/train'

# event_id -> feature index lookup, raw patient events, and death records.
event_feature_map = pd.read_csv(DATA_DIR + '/event_feature_map.csv')
events = pd.read_csv(DATA_DIR + '/events.csv')
mortality = pd.read_csv(DATA_DIR + '/mortality_events.csv')

## Exploring Data

In [3]:
events.head()

Unnamed: 0,patient_id,event_id,event_description,timestamp,value
0,12,DIAG440649,Primary malignant neoplasm of head of pancreas,2011-12-06,1.0
1,12,DIAG201070,Cholelithiasis AND cholecystitis without obstr...,2011-12-06,1.0
2,12,DIAG321462,Cardiac complication,2011-12-06,1.0
3,12,DIAG321042,Cardiac arrest,2011-12-06,1.0
4,12,DIAG435141,Hemorrhage AND/OR hematoma complicating procedure,2011-12-06,1.0


In [4]:
events.shape

(740066, 5)

In [5]:
event_feature_map.head()

Unnamed: 0,idx,event_id
0,1,DIAG132397
1,2,DIAG132408
2,3,DIAG132446
3,4,DIAG132583
4,5,DIAG132643


In [6]:
mortality.head()

Unnamed: 0,patient_id,timestamp,label
0,19,2014-03-04,1
1,12,2011-12-19,1
2,41,2014-02-15,1
3,106,2015-08-11,1
4,112,2011-04-23,1


In [7]:
mortality.shape

(500, 3)

In [8]:
mortality.label.value_counts()

1    500
Name: label, dtype: int64

In [9]:
# Uniqueness check: the inner value_counts() counts how often each idx occurs,
# the outer one tallies those counts. A single row keyed by 1 means every idx
# appears exactly once.
event_feature_map.idx.value_counts().value_counts()

1    3188
Name: idx, dtype: int64

## Descriptive Stats

In [10]:
# Attach the mortality label to every event row. Patients with no row in
# `mortality` get NaN from the left join and are treated as alive.
df = (
    events
    .merge(mortality[['patient_id', 'label']], how='left', on='patient_id')
    # index=str is kept from the original cell: it turns the row labels into strings.
    .rename(index=str, columns={'label': 'is_deceased'})
)
# Fill and cast through an explicit column assignment. Calling
# `df.is_deceased.fillna(0, inplace=True)` acts on a temporary Series; with
# pandas Copy-on-Write it leaves `df` untouched, and the remaining NaNs would
# then become True under astype(bool).
df['is_deceased'] = df['is_deceased'].fillna(0).astype(bool)

In [11]:
df.head()

Unnamed: 0,patient_id,event_id,event_description,timestamp,value,is_deceased
0,12,DIAG440649,Primary malignant neoplasm of head of pancreas,2011-12-06,1.0,True
1,12,DIAG201070,Cholelithiasis AND cholecystitis without obstr...,2011-12-06,1.0,True
2,12,DIAG321462,Cardiac complication,2011-12-06,1.0,True
3,12,DIAG321042,Cardiac arrest,2011-12-06,1.0,True
4,12,DIAG435141,Hemorrhage AND/OR hematoma complicating procedure,2011-12-06,1.0,True


In [12]:
df.is_deceased.value_counts()

True     491007
False    249059
Name: is_deceased, dtype: int64

In [13]:
df.head()

Unnamed: 0,patient_id,event_id,event_description,timestamp,value,is_deceased
0,12,DIAG440649,Primary malignant neoplasm of head of pancreas,2011-12-06,1.0,True
1,12,DIAG201070,Cholelithiasis AND cholecystitis without obstr...,2011-12-06,1.0,True
2,12,DIAG321462,Cardiac complication,2011-12-06,1.0,True
3,12,DIAG321042,Cardiac arrest,2011-12-06,1.0,True
4,12,DIAG435141,Hemorrhage AND/OR hematoma complicating procedure,2011-12-06,1.0,True


### Event Counts by Deceased vs Alive Patients

In [14]:
# Number of event rows per patient. `is_deceased` is included as a grouping key
# so the flag is carried through to the per-patient table.
event_counts = (
    df.groupby(['patient_id', 'is_deceased'])
      .size()
      .rename('number_events')
      .reset_index()
)

In [15]:
event_counts.head()

Unnamed: 0,patient_id,is_deceased,number_events
0,12,True,868
1,19,True,177
2,41,True,1092
3,80,False,185
4,99,False,238


**Answer: Event Counts by Deceased Status**

In [16]:
# Mean / min / max number of events per patient, split by deceased status.
# Select `number_events` before aggregating so that `patient_id` is not
# summarised and then thrown away.
event_counts_results = (
    event_counts
    .groupby(['is_deceased'])['number_events']
    .agg(['mean', 'min', 'max'])
)

In [17]:
event_counts_results

Unnamed: 0_level_0,mean,min,max
is_deceased,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,498.118,1,12627
True,982.014,1,8635


In [18]:
# Unpack the summary statistics for each group. `.loc[<flag>]` returns the row
# as a single Series, so the integer min/max come back as floats next to the mean.
avg_dead_event_count, max_dead_event_count, min_dead_event_count = (
    event_counts_results.loc[True][stat] for stat in ('mean', 'max', 'min')
)
avg_alive_event_count, max_alive_event_count, min_alive_event_count = (
    event_counts_results.loc[False][stat] for stat in ('mean', 'max', 'min')
)

In [19]:
avg_dead_event_count, max_dead_event_count, min_dead_event_count, avg_alive_event_count, max_alive_event_count, min_alive_event_count

(982.014, 8635.0, 1.0, 498.118, 12627.0, 1.0)

### Encounters by Deceased vs Alive Patients

In [20]:
events.head()

Unnamed: 0,patient_id,event_id,event_description,timestamp,value
0,12,DIAG440649,Primary malignant neoplasm of head of pancreas,2011-12-06,1.0
1,12,DIAG201070,Cholelithiasis AND cholecystitis without obstr...,2011-12-06,1.0
2,12,DIAG321462,Cardiac complication,2011-12-06,1.0
3,12,DIAG321042,Cardiac arrest,2011-12-06,1.0
4,12,DIAG435141,Hemorrhage AND/OR hematoma complicating procedure,2011-12-06,1.0


In [21]:
# An encounter is one distinct (patient_id, timestamp) pair in the events table.
encounters = (
    events[['patient_id', 'timestamp']]
    .drop_duplicates()
    .sort_values(['patient_id', 'timestamp'])
)
# Number the encounters 0..n-1 in sorted order and use that number as the index.
encounters['encounter_id'] = range(len(encounters))
encounters = encounters.set_index('encounter_id')

In [22]:
encounters.head()

Unnamed: 0_level_0,patient_id,timestamp
encounter_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,12,2011-12-04
1,12,2011-12-06
2,12,2011-12-07
3,12,2011-12-08
4,12,2011-12-09


In [23]:
## Counting number of encounters and splitting by deceased status
encounter_counts = (
    encounters
    .groupby(['patient_id'])
    .size()
    .reset_index(name='number_encounters')
    .merge(mortality[['patient_id', 'label']], how='left', on='patient_id')
    # index=str is kept from the original cell: it turns the row labels into strings.
    .rename(index=str, columns={'label': 'is_deceased'})
)
# Patients absent from `mortality` have a NaN label after the left join; treat
# them as alive. Assign the column explicitly: an in-place fillna on
# `encounter_counts.is_deceased` acts on a temporary Series and does nothing
# under pandas Copy-on-Write, which would turn the NaNs into True.
encounter_counts['is_deceased'] = encounter_counts['is_deceased'].fillna(0).astype(bool)

**Answer: Encounter Counts by Deceased Status**

In [24]:
# Mean / max / min number of encounters per patient, split by deceased status.
# Select `number_encounters` before aggregating so that `patient_id` is not
# summarised and then thrown away.
encounter_counts_results = (
    encounter_counts
    .groupby(['is_deceased'])['number_encounters']
    .agg(['mean', 'max', 'min'])
)

In [25]:
encounter_counts_results

Unnamed: 0_level_0,mean,max,min
is_deceased,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,15.452,391,1
True,23.038,203,1


In [26]:
# Unpack the summary statistics for each group. `.loc[<flag>]` returns the row
# as a single Series, which the three statistics are then read from by name.
avg_dead_encounter_count, max_dead_encounter_count, min_dead_encounter_count = (
    encounter_counts_results.loc[True][stat] for stat in ('mean', 'max', 'min')
)
avg_alive_encounter_count, max_alive_encounter_count, min_alive_encounter_count = (
    encounter_counts_results.loc[False][stat] for stat in ('mean', 'max', 'min')
)

### Record Length by Deceased Status

In [27]:
events.head()

Unnamed: 0,patient_id,event_id,event_description,timestamp,value
0,12,DIAG440649,Primary malignant neoplasm of head of pancreas,2011-12-06,1.0
1,12,DIAG201070,Cholelithiasis AND cholecystitis without obstr...,2011-12-06,1.0
2,12,DIAG321462,Cardiac complication,2011-12-06,1.0
3,12,DIAG321042,Cardiac arrest,2011-12-06,1.0
4,12,DIAG435141,Hemorrhage AND/OR hematoma complicating procedure,2011-12-06,1.0


In [28]:
# First and last event timestamp per patient. The timestamps are still strings
# here, so min/max compare them as text.
record_length = (
    events
    .groupby('patient_id')['timestamp']
    .agg(['min', 'max'])
    .rename(columns={'min': 'min_timestamp', 'max': 'max_timestamp'})
)
# Record length = whole days between a patient's first and last event.
first_event = pd.to_datetime(record_length['min_timestamp'])
last_event = pd.to_datetime(record_length['max_timestamp'])
record_length['record_length'] = (last_event - first_event).dt.days
record_length = record_length.reset_index()

In [29]:
record_length.head()

Unnamed: 0,patient_id,min_timestamp,max_timestamp,record_length
0,12,2011-12-04,2011-12-19,15
1,19,2013-02-19,2013-02-23,4
2,41,2013-05-13,2013-06-25,43
3,80,2015-01-19,2015-01-27,8
4,99,2010-04-25,2013-10-13,1267


In [30]:
## Splitting by deceased status
# NOTE: this cell rebinds `record_length`, so it is meant to run once per kernel
# session; running it again merges the label a second time.
record_length = (
    record_length
    .merge(mortality[['patient_id', 'label']], how='left', on='patient_id')
    # index=str is kept from the original cell: it turns the row labels into strings.
    .rename(index=str, columns={'label': 'is_deceased'})
)
# Patients absent from `mortality` have a NaN label after the left join; treat
# them as alive. Assign the column explicitly: an in-place fillna on
# `record_length.is_deceased` acts on a temporary Series and does nothing under
# pandas Copy-on-Write, which would turn the NaNs into True.
record_length['is_deceased'] = record_length['is_deceased'].fillna(0).astype(bool)
record_length['record_length'] = record_length['record_length'].astype(int)

In [31]:
record_length.head()

Unnamed: 0,patient_id,min_timestamp,max_timestamp,record_length,is_deceased
0,12,2011-12-04,2011-12-19,15,True
1,19,2013-02-19,2013-02-23,4,True
2,41,2013-05-13,2013-06-25,43,True
3,80,2015-01-19,2015-01-27,8,False
4,99,2010-04-25,2013-10-13,1267,False


**Answer: Record Length by Deceased Status**

In [32]:
# Mean / max / min record length in days, split by deceased status.
# Select `record_length` before aggregating: the frame also holds the string
# columns min_timestamp / max_timestamp, and recent pandas versions raise on
# 'mean' over them instead of silently dropping them.
record_length_results = (
    record_length
    .groupby(['is_deceased'])['record_length']
    .agg(['mean', 'max', 'min'])
)

In [33]:
record_length_results

Unnamed: 0_level_0,mean,max,min
is_deceased,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,159.2,2914,0
True,127.532,1972,0


In [35]:
# Unpack the summary statistics for each group. `.loc[<flag>]` returns the row
# as a single Series, which the three statistics are then read from by name.
avg_dead_rec_len, max_dead_rec_len, min_dead_rec_len = (
    record_length_results.loc[True][stat] for stat in ('mean', 'max', 'min')
)
avg_alive_rec_len, max_alive_rec_len, min_alive_rec_len = (
    record_length_results.loc[False][stat] for stat in ('mean', 'max', 'min')
)

## Calculating Index Date

In [36]:
events.head()

Unnamed: 0,patient_id,event_id,event_description,timestamp,value
0,12,DIAG440649,Primary malignant neoplasm of head of pancreas,2011-12-06,1.0
1,12,DIAG201070,Cholelithiasis AND cholecystitis without obstr...,2011-12-06,1.0
2,12,DIAG321462,Cardiac complication,2011-12-06,1.0
3,12,DIAG321042,Cardiac arrest,2011-12-06,1.0
4,12,DIAG435141,Hemorrhage AND/OR hematoma complicating procedure,2011-12-06,1.0


In [67]:
# Latest event timestamp for each patient, one row per patient_id.
indx_date = (
    events
    .groupby('patient_id')['timestamp']
    .max()
    .rename('last_event_date')
    .reset_index()
)

In [68]:
# Days before the date of death at which a deceased patient's index date is set.
INDEX_DATE_OFFSET_DAYS = 30

# Index date per patient:
#   * patient has a row in `mortality` -> date of death minus INDEX_DATE_OFFSET_DAYS
#   * otherwise                         -> date of the patient's last event
# Only the date of death is joined in. The original cell also built an
# `is_deceased` flag that was dropped by the final column selection.
# NOTE: this cell rebinds `indx_date` to the two-column result, so it is meant
# to run once, straight after the cell that computes `last_event_date`.
indx_date = (
    indx_date
    .merge(
        mortality[['patient_id', 'timestamp']].rename(columns={'timestamp': 'date_of_death'}),
        how='left',
        on='patient_id',
    )
    # Kept from the original cell: turns the row labels into strings.
    .rename(index=str)
    .assign(
        # date_of_death is NaT for patients without a mortality row, so
        # combine_first falls back to their last event date.
        indx_date=lambda frame: (
            pd.to_datetime(frame['date_of_death'])
            - pd.to_timedelta(INDEX_DATE_OFFSET_DAYS, unit='d')
        ).combine_first(pd.to_datetime(frame['last_event_date']))
    )
    .loc[:, ['patient_id', 'indx_date']]
)

In [69]:
indx_date.head()

Unnamed: 0,patient_id,indx_date
0,12,2011-11-19
1,19,2014-02-02
2,41,2014-01-16
3,80,2015-01-27
4,99,2013-10-13


In [73]:
# Uniqueness check: the inner value_counts() counts rows per patient_id, the
# outer one tallies those counts. A single row keyed by 1 means every patient
# has exactly one index date.
indx_date.patient_id.value_counts().value_counts()

1    1000
Name: patient_id, dtype: int64