In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Read the clinical events CSV into a DataFrame and display it for a first look.
# NOTE(review): the 'Unnamed: 0' column looks like a row index saved by an earlier
# to_csv call; it is not used as a feature below - confirm whether it can be dropped.
df = pd.read_csv('clinical_events.csv')
df

Unnamed: 0.1,Unnamed: 0,clinical_event_id,clinical_event_member_id,clinical_event_name,clinical_event_time,clinical_event_member_age,member_male
0,1,1.0,1,ANNUAL VISIT,41,81,1
1,2,2.0,1,STROKE HOSPITALIZATION,641,81,1
2,3,3.0,1,HYPERTENSION MANAGEMENT VISIT,116,81,1
3,4,4.0,1,HYPERTENSION MANAGEMENT VISIT,293,81,1
4,5,5.0,2,ANNUAL VISIT,14,76,1
...,...,...,...,...,...,...,...
511841,511842,511842.0,199998,HEART FAILURE HOSPITALIZATION,29,78,1
511842,511843,511843.0,199998,HEART FAILURE HOSPITALIZATION,275,78,1
511843,511844,511844.0,199998,HEART FAILURE HOSPITALIZATION,278,78,1
511844,511845,511845.0,199999,ANNUAL VISIT,134,68,1


In [3]:
# Show basic information about the data: column dtypes and whether there are any missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 511846 entries, 0 to 511845
Data columns (total 7 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Unnamed: 0                 511846 non-null  int64  
 1   clinical_event_id          511846 non-null  float64
 2   clinical_event_member_id   511846 non-null  int64  
 3   clinical_event_name        511846 non-null  object 
 4   clinical_event_time        511846 non-null  int64  
 5   clinical_event_member_age  511846 non-null  int64  
 6   member_male                511846 non-null  int64  
dtypes: float64(1), int64(5), object(1)
memory usage: 27.3+ MB


In [4]:
# Summary statistics of the numeric columns
df.describe()
# member_male has mean 1.0 and std 0.0, i.e. all members are male,
# so this constant column cannot influence the analysis result

Unnamed: 0.1,Unnamed: 0,clinical_event_id,clinical_event_member_id,clinical_event_time,clinical_event_member_age,member_male
count,511846.0,511846.0,511846.0,511846.0,511846.0,511846.0
mean,255923.5,255923.5,100045.459302,188.899161,73.197186,1.0
std,147757.357279,147757.357279,57797.624855,160.707576,7.211483,0.0
min,1.0,1.0,1.0,0.0,60.0,1.0
25%,127962.25,127962.25,49939.25,64.0,67.0,1.0
50%,255923.5,255923.5,100122.0,129.0,74.0,1.0
75%,383884.75,383884.75,150001.0,296.0,80.0,1.0
max,511846.0,511846.0,200000.0,730.0,85.0,1.0


In [5]:
# Build the binary target column "label":
# 1 for rows whose event is 'STROKE HOSPITALIZATION' (the event we want to predict), 0 for every other event.
is_stroke = df['clinical_event_name'] == 'STROKE HOSPITALIZATION'
df['label'] = is_stroke.astype('int64')
# Show how many rows fall into each class
df[['label']].value_counts()

label
0        487025
1         24821
dtype: int64

In [6]:
# Hold out 50,000 rows as the test set; the remaining rows are used for training.
# Features: event time and member age. Target: the binary stroke label.
train_data, test_data, train_label, test_label = train_test_split(
    df[['clinical_event_time', 'clinical_event_member_age']],
    df['label'],
    test_size=50000,
    random_state=42,
)

In [7]:
# Method 1: use a decision tree to classify the data.
# random_state is fixed so that a Restart & Run All rebuilds exactly the same tree:
# scikit-learn randomly permutes the candidate features at every split, so an
# unseeded tree is not guaranteed to be identical between runs.
clf1 = tree.DecisionTreeClassifier(random_state=42)
# fit() returns the fitted estimator itself, so re-binding clf1 keeps the same object
clf1 = clf1.fit(train_data, train_label)

In [8]:
# Predict the labels of the held-out test set with the fitted decision tree
predict1 = clf1.predict(test_data)

In [9]:
# Confusion matrix of the decision tree predictions (rows: true label, columns: predicted label).
# Both off-diagonal entries are 0, so the accuracy on the test set is 100%.
confusion_matrix(test_label, predict1)

array([[47630,     0],
       [    0,  2370]])

In [10]:
# Method 2: use an MLP to classify the data.
# First scale every feature by its maximum absolute value.
max_abs_scaler = preprocessing.MaxAbsScaler()
# Learn the scaling factors from the training data only ...
X_train_maxabs = max_abs_scaler.fit_transform(train_data)
# ... and apply those same factors to the test data. Calling fit_transform here
# would re-fit the scaler on the test set, so train and test could be scaled
# differently and test-set statistics would leak into preprocessing.
X_test_maxabs = max_abs_scaler.transform(test_data)

In [11]:
# Small MLP with two hidden layers (5 and 2 units), trained with the L-BFGS solver.
# random_state pins the weight initialisation so the run is repeatable.
clf2 = MLPClassifier(
    hidden_layer_sizes=(5, 2),
    solver='lbfgs',
    alpha=1e-5,
    random_state=1,
)
# Train on the scaled training features; the fitted estimator is displayed as the cell output
clf2.fit(X_train_maxabs, train_label)

MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=1,
              solver='lbfgs')

In [12]:
# Predict the labels of the scaled test features with the fitted MLP
predict2 = clf2.predict(X_test_maxabs)

In [13]:
# Confusion matrix of the MLP predictions (rows: true label, columns: predicted label).
# Both off-diagonal entries are 0, so the accuracy on the test set is 100%.
confusion_matrix(test_label, predict2)

array([[47630,     0],
       [    0,  2370]])

Let's see which data attributes predominantly influence stroke risk.
For a given clinical member, the age is the same across all of their events, but the member can have multiple different clinical_event_name values.
So my assumption is that clinical_event_time is the most important attribute.
Let's use only clinical_event_time and run the prediction again.

In [14]:
# Experiment: predict the label from clinical_event_time alone.
# The accuracy is still 100%, which supports the assumption that
# clinical_event_time is the most important attribute.
train_data, test_data, train_label, test_label = train_test_split(df[['clinical_event_time']], df['label'], test_size=50000, random_state=42)
# Fit a fresh classifier for this experiment instead of re-fitting clf1, so the
# two-feature model from Method 1 stays intact. random_state keeps the tree reproducible.
clf = tree.DecisionTreeClassifier(random_state=42)
clf.fit(train_data, train_label)
predict = clf.predict(test_data)
# Rows: true label, columns: predicted label
confusion_matrix(test_label, predict)

array([[47630,     0],
       [    0,  2370]])

In [15]:
# Experiment: predict the label from clinical_event_member_age alone.
# The second column of the confusion matrix is all zeros: the model predicts
# label 0 (no stroke) for every sample and never detects a stroke.
# So clinical_event_member_age on its own is not a useful attribute for this classification.
train_data, test_data, train_label, test_label = train_test_split(df[['clinical_event_member_age']], df['label'], test_size=50000, random_state=42)
# Fit a fresh classifier for this experiment instead of re-fitting clf1, so the
# model from Method 1 is not silently overwritten. random_state keeps the tree reproducible.
clf = tree.DecisionTreeClassifier(random_state=42)
clf.fit(train_data, train_label)
predict = clf.predict(test_data)
# Rows: true label, columns: predicted label
confusion_matrix(test_label, predict)

array([[47630,     0],
       [ 2370,     0]])