In [50]:
from pyspark.sql.functions import col,udf,collect_list

from pyspark.ml.feature import IDF, CountVectorizer,StringIndexer
from pyspark.ml.clustering import LDA
from pyspark.ml.linalg import SparseVector, VectorUDT,DenseVector

import numpy as np

## 1.Load data to DataFrame

In [176]:
# Read the two MIMIC-III CSV tables. The first row of each file supplies the
# column names, and rows the CSV parser cannot handle are dropped.
dias_df = (
    spark.read
    .options(header=True, mode="DROPMALFORMED")
    .csv("../../mimic3/data/DIAGNOSES_ICD.csv")
)
dic_ICD_df = (
    spark.read
    .options(header=True, mode="DROPMALFORMED")
    .csv("../../mimic3/data/D_ICD_DIAGNOSES.csv")
)

## 2.Data Pre-processing 

In [3]:
# Register the diagnoses table as the temporary SQL view "diagnosis".
dias_df.createOrReplaceTempView("diagnosis")

# Filter out rows with an empty ICD9_CODE and four hard-coded codes that the
# author treats as unspecified diagnoses; keep only patient id and code.
filteredDiags = spark.sql("SELECT SUBJECT_ID,ICD9_CODE FROM diagnosis WHERE ICD9_CODE not in ('', '4019','7793','2724','2449')")

### 2.1 Extract frequent diagnosis codes

In [4]:
# Count occurrences of every diagnosis code, rank the codes from most to
# least frequent, and keep the code column of the first 500.
topDiags = (
    filteredDiags
    .groupBy("ICD9_CODE")
    .count()
    .orderBy(col("count").desc())
    .select("ICD9_CODE")
    .limit(500)
)

### 2.2 Extract patients who had the frequent diagnosis codes

In [5]:
# Inner join to keep only the diagnosis rows whose code is among the top 500.
# topDiags is derived from filteredDiags, so a condition such as
# `filteredDiags.ICD9_CODE == topDiags.ICD9_CODE` refers to the same underlying
# column on both sides (an ambiguous self-join reference). Joining on the
# column *name* avoids that and yields a single ICD9_CODE column, so no drop
# is needed. The final select keeps the original column order.
top_freq_pats = (
    filteredDiags
    .join(topDiags, on="ICD9_CODE", how="inner")
    .select("SUBJECT_ID", "ICD9_CODE")
)

### 2.3 Aggregate each patient's diagnosis codes into a list (group by "SUBJECT_ID")

In [6]:
# Gather all (top-500) diagnosis codes of each patient into one list column.
# The alias is applied directly to the aggregate so the code does not rely on
# Spark's auto-generated column name "collect_list(ICD9_CODE)".
pats_dias = (
    top_freq_pats
    .groupBy("SUBJECT_ID")
    .agg(collect_list("ICD9_CODE").alias("codes"))
)

### 2.4 Encode patients and diagnosis codes

In [22]:
# Map every SUBJECT_ID string to a numeric index stored in a new "label" column.
indexer = StringIndexer(inputCol="SUBJECT_ID", outputCol="label")
indexer_model = indexer.fit(pats_dias)
indexed = indexer_model.transform(pats_dias)

# Learn the diagnosis-code vocabulary from the per-patient code lists; the
# fitted model writes its term-count vectors to "tf_features".
vector = CountVectorizer(inputCol="codes", outputCol="tf_features")
countVect = vector.fit(indexed)

# Vocabulary (list of diagnosis codes) held by the fitted CountVectorizer model.
vocabs = countVect.vocabulary

### 2.5 Frequency of diagnosis codes

In [23]:
# Apply the fitted CountVectorizer: adds the per-patient term-count vector
# column "tf_features" to the indexed patients.
freqVect = countVect.transform(indexed)

In [24]:
# Inspect the columns available after count vectorization.
freqVect.printSchema()

root
 |-- SUBJECT_ID: string (nullable = true)
 |-- codes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- label: double (nullable = true)
 |-- tf_features: vector (nullable = true)



### 2.6 TF-IDF of diagnosis codes

In [25]:
# Fit inverse-document-frequency weights on the term-count vectors, then add
# the re-weighted vectors as a new "tfidf_features" column.
idf = IDF(inputCol="tf_features", outputCol="tfidf_features")
idfModel = idf.fit(freqVect)
tf_idfVect = idfModel.transform(freqVect)

In [26]:
# Inspect the columns available after the TF-IDF step.
tf_idfVect.printSchema()

root
 |-- SUBJECT_ID: string (nullable = true)
 |-- codes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- label: double (nullable = true)
 |-- tf_features: vector (nullable = true)
 |-- tfidf_features: vector (nullable = true)



### 2.7 OneHot of diagnosis codes

In [31]:
def to_sparse(c):
    """Return a column expression that binarizes the vector column ``c``.

    Every input vector is mapped to a SparseVector of the same size that
    holds 1 at each position where the input entry is greater than zero
    and 0 everywhere else.
    """
    def _binarize(vec):
        # Positions of the strictly positive entries of the input vector.
        positive_idx = np.flatnonzero(vec.toArray() > 0)
        ones = np.ones(len(positive_idx), dtype=int)
        return SparseVector(vec.size, positive_idx, ones)

    binarize_udf = udf(_binarize, VectorUDT())
    return binarize_udf(c)

# Add the binarized (presence/absence) version of the term counts.
oneHot_freqVect = tf_idfVect.withColumn("oneHot_features", to_sparse(tf_idfVect["tf_features"]))

### 2.8 AutoEncoders of diagnosis codes

In [35]:
from keras.layers import Input, Dense
from keras.models import Model

# Size of the encoded (bottleneck) representation.
encoding_dim = 32

# Width of the input and output layers. It has to match the length of the
# one-hot vectors, which equals the CountVectorizer vocabulary size, so it is
# derived from the fitted vocabulary instead of being hard-coded to 500.
input_dim = len(vocabs)

# Input placeholder: one row per patient.
input_patient = Input(shape=(input_dim,))
# "encoded" is the compressed representation of the input.
encoded = Dense(encoding_dim, activation='relu')(input_patient)
# "decoded" is the lossy reconstruction of the input.
decoded = Dense(input_dim, activation='sigmoid')(encoded)

# This model maps an input to its reconstruction.
autoencoder = Model(input_patient, decoded)

Using TensorFlow backend.


In [152]:
# Configure training: Adadelta optimizer with binary cross-entropy as the
# reconstruction loss.
autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')

#### Converting Spark ML Vector to Numpy Array

In [98]:
# Convert the one-hot column to a pandas DataFrame; toPandas() collects the
# selected data onto the driver.
array_features = oneHot_freqVect.select('oneHot_features').toPandas()

In [154]:
# Turn each Spark ML vector into a dense 1-D NumPy array and stack the rows
# into a single 2-D matrix of shape (n_patients, vector_size).
# `Series.as_matrix()` was removed in pandas 1.0, and the former
# reshape + apply_along_axis flatten is exactly what np.stack does directly.
dense_rows = [vec.toArray() for vec in array_features['oneHot_features']]
features = np.stack(dense_rows)

In [157]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the patients for validation; the fixed random_state makes
# the split repeatable.
X_train, X_test = train_test_split(features, test_size=0.20, random_state=42)

In [158]:
# Train the autoencoder to reconstruct its own input (targets equal inputs),
# reporting the reconstruction loss on the held-out split after each epoch.
autoencoder.fit(X_train, X_train,
                epochs=50,
                batch_size=256,
                shuffle=True,
                validation_data=(X_test, X_test))

Train on 36904 samples, validate on 9227 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x2b369333f850>

In [161]:
# Encoder: maps an input patient vector to its encoded representation. It is
# built from the same input/encoded tensors as the autoencoder.
encoder = Model(input_patient, encoded)

# Placeholder for an already-encoded input of length encoding_dim.
encoded_input = Input(shape=(encoding_dim,))
# Retrieve the last layer of the autoencoder model (the reconstruction layer).
decoder_layer = autoencoder.layers[-1]
# Decoder: maps an encoded vector back to a reconstruction using that layer.
decoder = Model(encoded_input, decoder_layer(encoded_input))

In [164]:
# Encode the patient vectors.
# Note that they are taken from the *test* set.
encoded_patients = encoder.predict(X_test)
# Decoding is currently disabled:
# decoded_patients = decoder.predict(encoded_patients)

In [167]:
# Shape of the encoded test set: (number of test patients, encoding_dim).
# The variable defined by encoder.predict is `encoded_patients`; the former
# `encoded_patient` was never assigned and raises NameError on a fresh kernel.
encoded_patients.shape

(9227, 32)

In [168]:
# Inspect the columns available after adding the one-hot feature column.
oneHot_freqVect.printSchema()

root
 |-- SUBJECT_ID: string (nullable = true)
 |-- codes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- label: double (nullable = true)
 |-- tf_features: vector (nullable = true)
 |-- tfidf_features: vector (nullable = true)
 |-- oneHot_features: vector (nullable = true)



### 2.9 Topic Model of diagnosis codes

In [169]:
# LDA reads its input from a column named "features" by default, so expose
# the term-count vectors under that name alongside the patient label.
dataset = oneHot_freqVect.select(col("label"), col("tf_features").alias("features"))

# Train an LDA topic model with 10 topics and at most 10 iterations.
# The seed is fixed explicitly so the topics are reproducible across runs.
lda_mimic = LDA(k=10, maxIter=10, seed=42)
model_mimic = lda_mimic.fit(dataset)

In [170]:
# Describe each topic by its 10 highest-weighted terms.
# NOTE(review): the returned term indices presumably index into `vocabs` - confirm.
tf_topics = model_mimic.describeTopics(10)

In [172]:
# Get the per-patient topic mixture: transform() adds a "topicDistribution"
# vector column next to "label" and "features".
docTopics = model_mimic.transform(dataset)

In [175]:
# Print the first rows without truncating the (very wide) vector columns.
docTopics.show(truncate=False)

+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label  |features                                                                                                                                                                                                                                                 |topicDistribution                                                                                                                                                                                                  |
+-------+-----------------------------------------------