In [39]:
# Libraries (standard library first, then third-party)
import json

import pandas as pd
import snowflake.connector
from IPython.core.interactiveshell import InteractiveShell

# Display settings: show up to 40 columns and use a 2000-character display width
pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 2000)

# Echo the value of every bare expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

# Turn off pandas' chained-assignment check (no warning is raised or printed)
pd.options.mode.chained_assignment = None

In [None]:
# Establish the connection to Snowflake using the keyword arguments stored in
# the shared JSON defaults file. The file is opened in a `with` block so its
# handle is closed as soon as the settings have been parsed.
with open('/opt/ich/python-snowflake-defaults.json') as config_file:
    ctx = snowflake.connector.connect(**json.load(config_file))

# Verify and test that the connection is working by printing the server
# version, the active role and the active warehouse.
# The cursor is created *before* the `try` so that, if `ctx.cursor()` itself
# fails, `finally` does not raise a NameError on an unbound `cs` and hide the
# real error.
cs = ctx.cursor()
try:
    cs.execute('SELECT current_version(), current_role(), current_warehouse()')
    print(cs.fetchone())
finally:
    cs.close()

In [13]:
# Import the full PROBLEM_2023 table from Snowflake into a DataFrame.
query = '''
SELECT * from ICHT_PROD.ICHT_COVID.PROBLEM_2023
'''
cur = ctx.cursor()
try:
    cur.execute(query)
    # Column names are the first field of each entry in the cursor description
    problem = pd.DataFrame.from_records(
        iter(cur), columns=[column[0] for column in cur.description]
    )
finally:
    # Release the cursor once every row has been materialised (or on failure),
    # matching the connection-test cell, which also closes its cursor.
    cur.close()

In [2]:
# Load the pre-computed input tables from the switch_data directory,
# one DataFrame per CSV file.
icare_df_preprocessed = pd.read_csv(r'switch_data/chronic_switch_icare_df_preprocessed_2023.csv')
episodes = pd.read_csv(r'switch_data/chronic_switch_episodes_2023.csv')
disease = pd.read_csv(r'switch_data/chronic_switch_disease_2023.csv')
demographics = pd.read_csv(r'switch_data/chronic_switch_demographics_2023.csv')
embedding = pd.read_csv(r'switch_data/snomed_embedding_128d-copy.csv')

# NOTE(review): the '?' in this filename looks like a typo; confirm the actual
# file name on disk before relying on this load.
path = r'switch_data/chronic_switch_problem_dummies_2023?.csv'
problem_dummies = pd.read_csv(path)

In [26]:
# Get lists
# problem_list: names of every column after the first three, with the literal
# 'PROBLEM_' prefix removed.
# NOTE(review): assumes the first three columns are identifier columns and the
# rest are one-hot problem columns; confirm against the CSV.
#
# str.strip('PROBLEM_') is NOT a prefix removal: it strips any of the
# characters P, R, O, B, L, E, M, _ from BOTH ends of the name, so e.g.
# 'PROBLEM_MELANOMA' would become 'ANOMA'. Slice the prefix off explicitly.
problem_prefix = 'PROBLEM_'
problem_list = [
    name[len(problem_prefix):] if name.startswith(problem_prefix) else name
    for name in problem_dummies.columns[3:]
]

# subject_list: the distinct SUBJECT values present in the reference table
subject_list = problem_dummies['SUBJECT'].unique().tolist()

In [54]:
# Filter: keep rows whose SUBJECT is in subject_list and whose PROBLEM is in
# problem_list, and narrow the frame to the three columns used downstream.
# NOTE(review): problem_list holds strings parsed from column names; if PROBLEM
# is stored as a numeric type, isin() will match nothing. Confirm the dtypes.
known_subject = problem['SUBJECT'].isin(subject_list)
known_problem = problem['PROBLEM'].isin(problem_list)
filtered_problem = problem.loc[
    known_subject & known_problem,
    ['SUBJECT', 'PROBLEM', 'PROBLEM_DT_TM'],
]

In [77]:
# Create the new_subject column.
#
# For every SUBJECT and every distinct PROBLEM_DT_TM of that subject, emit a
# cumulative "snapshot": all of the subject's rows dated on or before that
# timestamp, tagged with a fresh sequential new_subject id (starting at 1 and
# increasing across subjects).

# Parse PROBLEM_DT_TM as datetimes, then order rows by subject and time
filtered_problem['PROBLEM_DT_TM'] = pd.to_datetime(filtered_problem['PROBLEM_DT_TM'])
filtered_problem = filtered_problem.sort_values(by=['SUBJECT', 'PROBLEM_DT_TM'])

next_id = 1      # id handed to the next snapshot
snapshots = []   # one DataFrame per (subject, cutoff timestamp)

for _, subject_rows in filtered_problem.groupby('SUBJECT'):
    timestamps = subject_rows['PROBLEM_DT_TM']
    # unique() yields each timestamp once, so every cutoff gets its own id
    for cutoff in timestamps.unique():
        # assign() returns a new frame, leaving the group itself untouched
        snapshots.append(subject_rows[timestamps <= cutoff].assign(new_subject=next_id))
        next_id += 1

# Stack all snapshots into one frame with a fresh 0..n-1 index
new_filtered_problem = pd.concat(snapshots).reset_index(drop=True)

In [85]:
# Display the column index of the snapshot frame
new_filtered_problem.columns

Index(['SUBJECT', 'PROBLEM', 'PROBLEM_DT_TM', 'new_subject'], dtype='object')

In [92]:
# Write a header-only CSV (no data rows) containing the three identifier columns
identifier_columns = ['SUBJECT', 'PROBLEM_DT_TM', 'new_subject']
final_dummies = pd.DataFrame(columns=identifier_columns)
final_dummies.to_csv('working_problem_dummies_2023.csv', index=False)

In [None]:
# Transform the snapshot frame into a one-hot style table: one row per
# new_subject, with a PROBLEM_<value> column holding how many times that
# problem occurs in the snapshot.
#
# This is computed in one vectorised pass instead of a Python loop over every
# new_subject id. The loop rescanned the whole frame per id, grew DataFrames
# with repeated pd.concat, and re-read / re-wrote an ever-growing checkpoint
# CSV every 100 ids. That was quadratic work, it duplicated rows when the cell
# was re-run without re-seeding the checkpoint, and the CSV round-trip left
# PROBLEM_DT_TM as a mix of strings and Timestamps.
#
# Differences from the loop's output, by design:
#   * problems absent from a snapshot are 0 rather than NaN;
#   * PROBLEM_* columns appear in sorted order rather than first-seen order;
#   * the intermediate working_problem_dummies_2023.csv checkpoint is no
#     longer read or written here.

# Identifier values are taken from the last row of each snapshot
snapshot_ids = (
    new_filtered_problem[['SUBJECT', 'PROBLEM_DT_TM', 'new_subject']]
    .drop_duplicates(subset='new_subject', keep='last')
)

# Occurrence count of every PROBLEM within every snapshot; pairs that never
# occur are filled with 0. Column names follow the 'PROBLEM_<value>' pattern
# that pd.get_dummies(..., columns=['PROBLEM']) produces.
problem_counts = (
    new_filtered_problem
    .groupby(['new_subject', 'PROBLEM'])
    .size()
    .unstack(fill_value=0)
    .add_prefix('PROBLEM_')
    .rename_axis(columns=None)
)

# Attach the counts to the identifier columns, one row per new_subject
final_working_problem_dummies = (
    snapshot_ids
    .join(problem_counts, on='new_subject')
    .reset_index(drop=True)
)

# Save
final_working_problem_dummies.to_csv('chronic_switch_problem_dummies_2023.csv', index=False)

In [None]:
# Replace missing values with 0, then conform the table to the column layout
# of problem_dummies: columns that exist only in problem_dummies are created
# and filled with 0, columns are put in problem_dummies' order, and columns
# that problem_dummies does not have are dropped.
final_working_problem_dummies = (
    final_working_problem_dummies
    .fillna(0)
    .reindex(columns=problem_dummies.columns, fill_value=0)
)

In [105]:
# Save (currently disabled): uncomment to overwrite the CSV with the zero-filled, column-aligned frame
#final_working_problem_dummies.to_csv('chronic_switch_problem_dummies_2023.csv', index=False)