In [None]:
import pymysql
import pandas as pd
import getpass
from textblob import TextBlob

import re

In [None]:
import scipy

In [None]:
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

In [None]:
from IPython.display import display, HTML

In [None]:
from sqlalchemy import create_engine

In [None]:
# Build the SQLAlchemy engine for the 'mimic2' MySQL database (user 'jovyan',
# host 'mysql', port 3306). The password is prompted for interactively so it is
# never stored in the notebook.
from urllib.parse import quote

# Percent-encode the password before embedding it in the URL: characters such as
# '@', ':', '/' or '%' would otherwise be read as URL delimiters, so the URL
# would be parsed incorrectly and the connection would fail. safe='' makes
# quote() escape '/' as well; SQLAlchemy decodes the password again when it
# parses the URL.
engine = create_engine('mysql+pymysql://{0}:{1}@{2}:{3}/{4}'
                      .format('jovyan',
                             quote(getpass.getpass('Enter MySQL password for jovyan'), safe=''),
                             'mysql',
                             '3306',
                             'mimic2'))

In [None]:
# Sanity check: show a one-row count for each of the patient, admission and
# note tables, in that order.
row_count_queries = (
    'SELECT count(*) as PatientCount from d_patients',
    'SELECT count(*) as AdmissionCount from admissions',
    'SELECT count(*) as NoteCount from noteevents',
)
for row_count_query in row_count_queries:
    display(pd.read_sql_query(row_count_query, con=engine))

In [None]:
# Preview the first five rows of the admissions table to see its columns.
admissions_preview = pd.read_sql_query('SELECT * from admissions LIMIT 5', con=engine)
display(admissions_preview)

In [None]:
# Preview the first five rows of the icd9 table to see its columns.
icd9_preview = pd.read_sql_query('SELECT * from icd9 LIMIT 5', con=engine)
display(icd9_preview)

In [None]:
# Preview the first five rows of the noteevents table to see its columns.
noteevents_preview = pd.read_sql_query('SELECT * from noteevents LIMIT 5', con=engine)
display(noteevents_preview)

In [None]:
# How many chest X-ray radiology reports does each admission have?
# Counting them per hadm_id before pulling any note text shows how the reports
# are spread across admissions; rows come back with the largest counts first.
visit_rad_report_count_query = """
SELECT
n.hadm_id
,count(*) as rad_note_count
FROM d_patients p
INNER JOIN noteevents n
    ON n.subject_id = p.subject_id
WHERE 
    Category = 'RADIOLOGY_REPORT' 
    AND (text like '%%CHEST (PORTABLE AP)%%' OR text like '%%CHEST (PA & LAT)%%')
    AND n.hadm_id IS NOT NULL
GROUP BY n.hadm_id
ORDER BY count(*) DESC
"""
visit_rad_report_count_df = pd.read_sql_query(
    sql=visit_rad_report_count_query,
    con=engine,
)

# Show only the ten admissions with the most matching reports.
busiest_admissions = visit_rad_report_count_df.head(10)
display(busiest_admissions)

In [None]:
# Summary statistics (number of observations, min/max, mean, variance, skewness,
# kurtosis) of the per-admission chest X-ray report counts.
#
# A bare `import scipy` does not guarantee that the `stats` subpackage is
# loaded: older SciPy releases raise AttributeError on `scipy.stats` unless it
# has been imported explicitly. Importing it here keeps this cell working on a
# fresh kernel regardless of the installed SciPy version.
import scipy.stats

rad_note_counts = visit_rad_report_count_df['rad_note_count'].values
scipy.stats.describe(rad_note_counts)

In [None]:
# Tally how many admissions have exactly k chest X-ray notes, for each observed k.
rad_note_count_grouping = visit_rad_report_count_df.groupby('rad_note_count').size()

# Bar positions are the distinct note counts; bar heights are the number of
# admissions that have that many notes.
note_count_bins = rad_note_count_grouping.index.values
note_frequencies = rad_note_count_grouping.values

# Explicit figure/axes pair instead of the implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_xlabel('Total Radiology Chest X-Ray Notes per visit')
ax.set_ylabel('Total Visits')

# Kept as the cell's final expression so the cell output is unchanged.
ax.bar(note_count_bins, note_frequencies)

In [None]:
# Pull the chest X-ray radiology reports (portable AP or PA & lateral) that are
# tied to both a patient and an admission, with leading/trailing spaces trimmed
# from the note text.
# NOTE(review): no ordering or "first note" selection is applied here; every
# matching note is returned. Add that step if only the first report per
# admission is wanted.
chest_xray_note_query = """
SELECT
subject_id
,hadm_id
,LTRIM(RTRIM(text)) as text
FROM noteevents
WHERE category = 'RADIOLOGY_REPORT'
    AND (text like '%%CHEST (PORTABLE AP)%%' OR text like '%%CHEST (PA & LAT)%%')
    AND subject_id is not NULL
    AND hadm_id is not NULL
"""
chest_xray_note_df = pd.read_sql_query(
    sql=chest_xray_note_query,
    con=engine,
)
display(chest_xray_note_df)

# NOTE : It may be useful to use these chest x-ray radiology reports to get an idea of some of the language in these reports
## For example, it could be useful to do a word count across this dataset
## It could also be interesting to do a count of most frequent n-grams from this set

# This may be worth coming back to later as we work on our group projects