In [1]:
import numpy as np
import pandas as pd
from sklearn import model_selection, preprocessing
from sklearn import linear_model
from sklearn.metrics import f1_score

Load the patient and admission tables into pandas DataFrames.

In [2]:
# Read both raw tables from the mimic3 data directory (paths are relative to this notebook).
patients, admissions = (
    pd.read_csv(csv_path)
    for csv_path in ('../../mimic3/data/PATIENTS.csv',
                     '../../mimic3/data/ADMISSIONS.csv')
)

Discard unnecessary features from the admission data.

In [3]:
# Columns that the stay-length / gap analysis below never reads.
unused_admission_columns = [
    'ROW_ID', 'HADM_ID', 'DEATHTIME', 'ADMISSION_LOCATION',
    'DISCHARGE_LOCATION', 'INSURANCE', 'LANGUAGE', 'RELIGION',
    'MARITAL_STATUS', 'ETHNICITY', 'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS',
    'HOSPITAL_EXPIRE_FLAG', 'HAS_CHARTEVENTS_DATA',
]
admissions = admissions.drop(unused_admission_columns, axis='columns')

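Convert the timestamp strings in the admission data to datetimes.
Generate a new column 'STAY_DAYS' in the admission data: the number of calendar days from admission to discharge, counting both the admission day and the discharge day.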

In [4]:
# Parse the admission/discharge timestamps, which are stored as
# 'YYYY-MM-DD HH:MM:SS' strings. Bracket assignment is used for both columns
# so the column is always replaced (attribute assignment only works when the
# column already exists).
timestamp_format = '%Y-%m-%d %H:%M:%S'
admissions['DISCHTIME'] = pd.to_datetime(admissions['DISCHTIME'], format=timestamp_format)
admissions['ADMITTIME'] = pd.to_datetime(admissions['ADMITTIME'], format=timestamp_format)

# STAY_DAYS = number of calendar days touched by the stay, counting both the
# admission day and the discharge day (hence the +1).
# normalize() truncates each timestamp to midnight while keeping the
# datetime64 dtype, so the subtraction yields a timedelta64 Series and
# `.dt.days` is well defined (going through `.dt.date` would produce an
# object-dtype Series of Python dates instead).
admit_day = admissions['ADMITTIME'].dt.normalize()
discharge_day = admissions['DISCHTIME'].dt.normalize()
admissions['STAY_DAYS'] = (discharge_day - admit_day).dt.days + 1

Sort the admission data by patient and admission time, then reset its index.

In [5]:
# Order rows by patient, then chronologically within each patient, and
# renumber the index 0..n-1 so positional and label access agree.
admissions = (
    admissions
    .sort_values(by=['SUBJECT_ID', 'ADMITTIME'], ascending=True)
    .reset_index(drop=True)
)

In [6]:
# Preview the first five rows of the cleaned, sorted admission table.
admissions.head(n=5)

Unnamed: 0,SUBJECT_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,STAY_DAYS
0,2,2138-07-17 19:04:00,2138-07-21 15:48:00,NEWBORN,5
1,3,2101-10-20 19:08:00,2101-10-31 13:58:00,EMERGENCY,12
2,4,2191-03-16 00:28:00,2191-03-23 18:41:00,EMERGENCY,8
3,5,2103-02-02 04:31:00,2103-02-04 12:15:00,NEWBORN,3
4,6,2175-05-30 07:15:00,2175-06-15 16:00:00,ELECTIVE,17


Generate the gap days between two successive admissions for the same patient.

In [7]:
# Work on an independent copy: columns added to patient_journey further down
# (e.g. 'gap_first') must not silently appear on `admissions` as well, which
# is what happens when both names refer to the same DataFrame object.
patient_journey = admissions.copy()

Generate the gap days between the first admission and others for the same patient.

In [8]:
# For every admission, the number of whole days elapsed since the same
# patient's first admission (0 for the first admission itself).
#
# Relies on patient_journey being ordered by SUBJECT_ID and then ADMITTIME, so
# that the first row of each patient is their earliest admission.
# Vectorized with groupby/transform instead of a row-by-row `.loc` loop; it
# also avoids the unconditional `.loc[0]` lookup that the loop needed.
first_admit_time = patient_journey.groupby('SUBJECT_ID')['ADMITTIME'].transform('first')

# Kept as a plain list of ints, one entry per row of patient_journey.
gaps = (patient_journey['ADMITTIME'] - first_admit_time).dt.days.tolist()

In [9]:
# Attach the per-admission gap (days since the patient's first admission) as a new column.
# NOTE(review): the column is added in place; if patient_journey is just another name for
# the `admissions` DataFrame rather than a copy, `admissions` gains this column too.
patient_journey['gap_first'] = gaps

Get frequent patients: those with at least one readmission within two years (730 days) of their first admission.

In [10]:
# Readmissions (gap_first > 0) that start inside the two-year window.
# The upper bound is strict so that it agrees with the later `gap_first < 730`
# filter and with the 730-column day encoding (columns 0..729). With `<= 730`
# a patient whose only readmission falls exactly on day 730 would be selected
# here and then end up with no readmission row at all.
in_two_year_window = (patient_journey['gap_first'] > 0) & (patient_journey['gap_first'] < 730)
freq_pats = patient_journey[in_two_year_window]

# IDs of the patients that have at least one such readmission.
unique_pats = freq_pats['SUBJECT_ID'].unique()

In [12]:
# Keep every admission (first and later ones) of the frequent patients.
is_frequent_patient = patient_journey['SUBJECT_ID'].isin(unique_pats)
pj_2years = patient_journey.loc[is_frequent_patient]

In [13]:
# Preview the first five admissions of the frequent patients (original row labels are kept).
pj_2years.head(n=5)

Unnamed: 0,SUBJECT_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,STAY_DAYS,gap_first
13,17,2134-12-27 07:15:00,2134-12-31 16:05:00,ELECTIVE,5,0
14,17,2135-05-09 14:11:00,2135-05-13 14:40:00,EMERGENCY,5,133
18,21,2134-09-11 12:17:00,2134-09-24 16:15:00,EMERGENCY,14,0
19,21,2135-01-30 20:50:00,2135-02-08 02:08:00,EMERGENCY,10,141
35,36,2131-04-30 07:15:00,2131-05-08 14:00:00,EMERGENCY,9,0


In [14]:
# Next step: time series encoding.
# Keep only admissions that start within the first 730 days after the
# patient's first admission, restore patient/chronological order and
# renumber the index from 0.
pj_2years = (
    pj_2years
    .loc[pj_2years['gap_first'] < 730]
    .sort_values(by=['SUBJECT_ID', 'ADMITTIME'], ascending=True)
    .reset_index(drop=True)
)

In [16]:
# Preview the first five rows after windowing, sorting and re-indexing.
pj_2years.head(n=5)

Unnamed: 0,SUBJECT_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,STAY_DAYS,gap_first
0,17,2134-12-27 07:15:00,2134-12-31 16:05:00,ELECTIVE,5,0
1,17,2135-05-09 14:11:00,2135-05-13 14:40:00,EMERGENCY,5,133
2,21,2134-09-11 12:17:00,2134-09-24 16:15:00,EMERGENCY,14,0
3,21,2135-01-30 20:50:00,2135-02-08 02:08:00,EMERGENCY,10,141
4,36,2131-04-30 07:15:00,2131-05-08 14:00:00,EMERGENCY,9,0


In [17]:
# Distinct patient IDs in order of first appearance, and how many there are.
pats_2years = pj_2years['SUBJECT_ID'].unique()
pat_num = pats_2years.shape[0]

In [18]:
# Binary day-by-day encoding of each patient's hospital stays:
# matrix[i, d] == 1 when patient pats_2years[i] is in hospital on day d,
# where day 0 is the day of that patient's first admission.
n_days = 730  # width of the encoding window (two years)

# Group by a scalar key so group labels are plain SUBJECT_ID values (grouping
# by a one-element list and then calling get_group with a scalar is deprecated
# in recent pandas).
grouped = pj_2years.groupby('SUBJECT_ID')

# Row of the matrix assigned to each patient, following the order of pats_2years.
row_of_patient = {pat: indx for indx, pat in enumerate(pats_2years)}

matrix = np.zeros((pat_num, n_days))
for pat, pat_acts in grouped:
    indx = row_of_patient[pat]
    # Only the two needed columns are iterated (no per-row Series as with
    # iterrows); the order of admissions does not matter because every stay
    # just sets its own days to 1.
    for start, length in zip(pat_acts['gap_first'], pat_acts['STAY_DAYS']):
        # Clamp non-positive lengths: a negative slice stop would wrap around
        # and mark almost the whole row. Stays running past the window are
        # truncated by the slice itself.
        stop = start + max(length, 0)
        matrix[indx, start:stop] = 1

In [20]:
# Show the encoding's dimensions: (number of patients, number of days).
np.shape(matrix)

(5784, 730)

In [22]:
# Next step: clustering based on the time series encoding.
from sklearn.cluster import KMeans

n_clusters = 3
# Both random_state and n_init are pinned: the default for n_init changed
# between scikit-learn releases (10 -> 'auto'), so leaving it implicit can
# produce different clusters depending on the installed version.
kmeanModel = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(matrix)

In [23]:
from sklearn.decomposition import PCA

# Project the day-by-day encoding onto its first two principal components for
# plotting. random_state is fixed (same seed as the clustering) because PCA
# may pick its randomized SVD solver for a matrix of this size, which would
# otherwise make the projection vary from run to run.
pca = PCA(n_components=2, random_state=42)
Y_sklearn = pca.fit_transform(matrix)

In [24]:
# 2-D PCA coordinates of every patient together with the k-means cluster label.
pca_data = (
    pd.DataFrame(Y_sklearn, columns=['x', 'y'])
    .assign(label=kmeanModel.labels_)
)

In [25]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Explicit module alias instead of `from plotly.graph_objs import *`, so it is
# clear where every plotting name comes from.
import plotly.graph_objs as go
init_notebook_mode(connected=True)

# One scatter trace per k-means cluster, drawn in the PCA plane.
traces = []
for cluster in range(n_clusters):
    # Separate name for the per-cluster rows (previously `data` was reused for
    # both this DataFrame and the final list of traces).
    cluster_points = pca_data[pca_data.label == cluster]
    trace = go.Scatter(
        x=cluster_points.x,
        y=cluster_points.y,
        mode='markers',
        name=str(cluster),  # legend entries are strings
        # Plain dicts replace the deprecated Marker/Line wrapper classes.
        marker=dict(
            size=12,
            line=dict(
                color='rgba(217, 217, 217, 0.14)',
                width=0.5),
            opacity=0.8))
    traces.append(trace)

# Plain dicts replace the deprecated XAxis/YAxis classes; the list of traces
# is passed directly instead of being wrapped in the deprecated Data class.
layout = go.Layout(xaxis=dict(title='PC1', showline=False),
                   yaxis=dict(title='PC2', showline=False))
fig = go.Figure(data=traces, layout=layout)
iplot(fig)

In [27]:
# The online `plotly.plotly` module is not used by this cell and no longer
# exists in plotly >= 4 (it moved to the separate chart_studio package), so
# its absence must not stop the notebook.
try:
    import plotly.plotly as py
except ImportError:
    py = None
import plotly.graph_objs as go
from plotly.offline import iplot

# Row of `matrix` (i.e. which patient) to visualise.
patient_row = 110

# x axis covers every day column of the encoding instead of a hard-coded 730.
data = [go.Scatter(x=list(range(matrix.shape[1])), y=matrix[patient_row])]
series_layout = go.Layout(
    xaxis=dict(title='Days since first admission'),
    yaxis=dict(title='In hospital (1) / not in hospital (0)'))
iplot(go.Figure(data=data, layout=series_layout), filename='pandas-time-series')