### Sometimes the modeling table fails to record a parameter. This notebook re-derives a column that was not recorded correctly (in this case, the `smoker` column).

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as datetime
from impala.util import as_pandas
import cPickle as pickle
%matplotlib notebook
plt.style.use('ggplot')

In [2]:
# Connect to Impala and switch to the poc_sandbox database. `cur` is the
# module-level cursor that fix_smoker_to_table() uses to run its query.
# NOTE(review): host and port are hard-coded here; consider reading them from
# configuration so the notebook is portable.
from impala.dbapi import connect
conn = connect(host="socbddn01.sharp.com", port=21050)
cur = conn.cursor()
cur.execute("use poc_sandbox")

In [3]:
# Event code selected by the query in fix_smoker_to_table; its result_val is
# compared against the status strings below.
_SMOKING_STATUS_EVENT_CD = '75144985'

# result_val strings that are written as smoker = 1; any other value
# (or no matching event at all) is written as smoker = 0.
_CURRENT_SMOKER_STATUSES = frozenset([
    'Heavy tobacco smoker',
    'Light tobacco smoker',
    'Current every day smoker',
    'Current some day smoker',
    'Smoker, current status unknown',
])


def fix_smoker_to_table(masterdf, i):
    '''
    Re-derive the `smoker` flag for a single row of the modeling table.

    Queries clinical_event (through the module-level cursor `cur`) for the
    row's encounter, restricted to events that ended before the row's
    `timeend`, and writes 1 into `smoker` when the first returned result_val
    is one of the current-smoker statuses, otherwise 0.

    input:
        masterdf: df with data; must include the `encntr_id` and `timeend`
                  columns. The `smoker` column is created if it is missing.
        i: integer *position* of the row to fix (0-based, as for .iloc).
    output: the same masterdf object, modified in place, with the `smoker`
            value of row i fixed.
    '''

    enc = masterdf.encntr_id.iloc[i]
    timeend = masterdf.timeend.iloc[i]

    query = '''
    SELECT ce.encntr_id, ce.event_cd 
    ,ce.event_end_dt_tm AS unix_event_end_tm 
    , ce.result_val, ce.result_units_cd
    FROM clinical_event ce 
    JOIN encounter enc ON enc.encntr_id = ce.encntr_id 
    WHERE ce.encntr_id = '{0}' 
    AND ce.event_end_dt_tm < {1} 
    AND ce.result_status_cd NOT IN ('31', '36')
    AND ce.event_class_cd NOT IN ('654645')
    AND ce.valid_until_dt_tm > 4e12
    AND ce.event_cd IN ('{2}') 
    ORDER BY ce.encntr_id, ce.performed_dt_tm;
    '''.format(enc, timeend, _SMOKING_STATUS_EVENT_CD)

    cur.execute(query)
    df = as_pandas(cur)

    # Filter once instead of re-filtering the frame for every comparison.
    statuses = df.loc[df.event_cd == _SMOKING_STATUS_EVENT_CD, 'result_val']

    # Only the first returned row is inspected; with the ORDER BY above that
    # is the row with the smallest performed_dt_tm.
    # NOTE(review): confirm the earliest (not the most recent) record is the
    # one intended.
    is_smoker = (not statuses.empty) and (statuses.iloc[0] in _CURRENT_SMOKER_STATUSES)

    # The row was *read* by position (.iloc[i]), so it must be *written* by
    # position as well. The previous `.ix[i, 'smoker']` is label-based on an
    # integer index, which targets a different row (or appends a new one)
    # whenever the index is not 0..n-1; `.ix` is also removed in current pandas.
    if 'smoker' not in masterdf.columns:
        masterdf['smoker'] = float('nan')
    masterdf.iloc[i, masterdf.columns.get_loc('smoker')] = int(is_smoker)

    return masterdf

In [3]:
# Load the raw modeling table. `filename` is reused further down to derive the
# name of the corrected output pickle.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
filename = 'RRT_modeling_table_13hr_raw.p'
# Use a context manager so the file handle is closed once the table is loaded.
with open(filename, 'rb') as pickle_file:
    masterdf_rrt = pickle.load(pickle_file)

In [5]:
# Summary statistics of the loaded table, transposed so each column is a row.
# In the saved output, `smoker` has mean/min/max of 0.0 for every row, i.e. the
# column was not recorded; that is the column this notebook repairs.
masterdf_rrt.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
event_end_dt_tm,2063.0,1445053000000.0,14785120000.0,1410347000000.0,1431889000000.0,1445573000000.0,1457776000000.0,1471205000000.0
timestart,2063.0,1445006000000.0,14785120000.0,1410300000000.0,1431842000000.0,1445526000000.0,1457729000000.0,1471158000000.0
timeend,2063.0,1445050000000.0,14785120000.0,1410343000000.0,1431885000000.0,1445570000000.0,1457772000000.0,1471201000000.0
age,2063.0,67.41154,16.60477,18.0,58.0,69.0,80.0,101.0
obese,1521.0,0.2971729,0.4571636,0.0,0.0,0.0,1.0,1.0
smoker,2063.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
prev_rrt,2063.0,0.1536597,0.3607097,0.0,0.0,0.0,0.0,1.0
on_iv,2063.0,0.7280659,0.445064,0.0,0.0,1.0,1.0,1.0
bu-nal,2063.0,0.03005332,0.1707755,0.0,0.0,0.0,0.0,1.0
DBP_mean,1950.0,70.63889,11.65375,37.91667,62.0,70.0,78.66667,122.3125


In [None]:
# Recompute the smoker flag row by row (fix_smoker_to_table runs one query per
# row), reporting progress as it goes, then pickle the corrected table.
count = 0
for i in range(len(masterdf_rrt)):
    count = i + 1
    print("count {0} of {1}".format(count, len(masterdf_rrt)))
    masterdf_rrt = fix_smoker_to_table(masterdf_rrt, i)
# filename[:-2] drops the trailing '.p' before the '_rev1.p' suffix is appended.
masterdf_rrt.to_pickle(filename[:-2] + '_rev1.p')

count 1 of 2063
count 2 of 2063
count 3 of 2063
count 4 of 2063


In [20]:
# Preview the first rows of the table; in the saved output the `smoker` column
# now contains 1.0 values, unlike the all-zero summary shown earlier.
masterdf_rrt.head()

Unnamed: 0,rrt_ce_id,encntr_id,event_end_dt_tm,timestart,timeend,age,sex,obese,smoker,prev_rrt,...,GCS_mean,GCS_recent,anticoagulants,narcotics,narc-ans,antipsychotics,chemo,dialysis,race,rrt_reason
0,6758153566,100009623,1441447620000,1441400820000,1441444020000,67,M,,0.0,0.0,...,,,0.0,0.0,0.0,0.0,0.0,0.0,White,Arrythmias
1,6802898656,100022347,1442595780000,1442548980000,1442592180000,68,F,0.0,1.0,0.0,...,,,1.0,1.0,0.0,1.0,0.0,0.0,White,"Staff concerned/Worried about Patient, Tachyca..."
2,6820399231,100022347,1443022200000,1442975400000,1443018600000,68,F,0.0,1.0,1.0,...,,,1.0,1.0,1.0,1.0,0.0,0.0,White,"Hypotension, Tachycardia"
3,6764808869,100031243,1441621320000,1441574520000,1441617720000,81,M,0.0,0.0,0.0,...,15.0,15.0,0.0,1.0,0.0,0.0,0.0,0.0,White,Suspected Stroke/Acute
4,6761972438,100031586,1441548900000,1441502100000,1441545300000,68,F,1.0,0.0,0.0,...,12.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,Other Race,Changes in LOC
