# Import Packages/Codes

In [None]:
%load_ext google.cloud.bigquery
from datetime import date
import numpy as np
import pandas as pd
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import dateutil
from IPython.display import display, HTML
import os
import subprocess

In [None]:
# BigQuery dataset name, read from the WORKSPACE_CDR environment variable
# (None when the variable is not set). Used as the table prefix in the queries below.
dataset = os.getenv("WORKSPACE_CDR")
dataset

In [None]:
# Current working directory of the notebook process, displayed for reference
cwd = os.getcwd()
cwd

# Build Acidosis Cohort

## Obtain data

In [None]:
# ICD diagnosis-code prefixes that identify acidosis, keyed by ICD revision
# ('9' is matched against ICD9CM, '10' against ICD10CM).
acidosis = {
    '9': ['276.2'],
    '10': ['E87.2'],
}

In [None]:
# Obtain demographics data

In [None]:
# This snippet assumes you run setup first

# This code copies a file from your Google Bucket and loads it into a dataframe

# Name of the file to download from the bucket's "data" folder
name_of_file_in_bucket = 'demographic_all.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# Copy the csv file from the bucket to the current working directory.
# check=True raises CalledProcessError when gsutil fails, so the success
# message below is only printed after a copy that actually worked; the
# argument list avoids building a shell command string.
subprocess.run(
    ["gsutil", "cp", f"{my_bucket}/data/{name_of_file_in_bucket}", "."],
    check=True,
)

print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space')
# load the downloaded csv file into a dataframe
demo_patients = pd.read_csv(name_of_file_in_bucket)
demo_patients

In [None]:
# Obtain medication data

In [None]:
# This snippet assumes you run setup first

# This code copies a file from your Google Bucket and loads it into a dataframe

# Name of the file to download from the bucket's "data" folder
name_of_file_in_bucket = 'Metformin_Medication_AG_10122023.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# Copy the csv file from the bucket to the current working directory.
# check=True raises CalledProcessError when gsutil fails, so the success
# message below is only printed after a copy that actually worked; the
# argument list avoids building a shell command string.
subprocess.run(
    ["gsutil", "cp", f"{my_bucket}/data/{name_of_file_in_bucket}", "."],
    check=True,
)

print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space')
# load the downloaded csv file into a dataframe
df_med = pd.read_csv(name_of_file_in_bucket)
df_med

In [None]:
# Number of distinct patients in the medication table
df_med['person_id'].nunique()

## Construct Where Statement (SQL, matching ICD codes)

In [None]:
# Build the WHERE sub-condition for the codes of a single ICD revision
def where_sub_sql(codes_dict, num):
    """Return the SQL condition matching the ICD codes of one revision.

    codes_dict maps an ICD revision ('9', '10') to a list of code prefixes
    and num selects which revision to use. Each prefix becomes a
    "concept_code LIKE '<prefix>%'" test; the tests are OR-ed together and
    combined with a check that vocabulary_id equals 'ICD<num>CM'. The whole
    condition is wrapped in parentheses.
    """
    # Every fragment keeps its trailing space, so joining on "OR " produces
    # "... LIKE 'a%' OR concept_code LIKE 'b%' ".
    like_tests = [f"concept_code LIKE '{prefix}%' " for prefix in codes_dict[num]]
    any_code = "OR ".join(like_tests)
    return f"(({any_code}) AND vocabulary_id = 'ICD{num}CM')"

In [None]:
# Build the WHERE condition covering both ICD revisions of codes_dict
def where_sql(codes_dict):
    """Return the combined ICD-9 / ICD-10 condition for codes_dict.

    The per-revision conditions produced by where_sub_sql for '9' and '10'
    are joined with OR.

    NOTE: the returned string ends with one extra ")" that has no opening
    partner in this function. query_by_icd splices the condition into a
    parenthesised sub-select and relies on that character to close it.
    """
    icd9_condition = where_sub_sql(codes_dict, '9')
    icd10_condition = where_sub_sql(codes_dict, '10')
    return f"{icd9_condition} OR {icd10_condition})"

In [None]:
# Build the WHERE condition for the acidosis codes and print it for inspection
where_condition = where_sql(acidosis)
print(where_condition)

## 2.3. Function to Query Patients by ICD 9&10 Major Type

In [None]:
# Query person ids and condition start dates by specific ICD 9&10 diagnosis codes.
def query_by_icd(where_statement):
    """Run the diagnosis query in BigQuery and return the rows as a dataframe.

    The query inner-joins condition_occurrence with the concept rows selected
    by where_statement (on condition_source_concept_id = concept_id) and
    returns the distinct person_id, date (condition_start_date),
    concept_code and vocabulary_id values. Tables are read from the
    module-level `dataset`.

    where_statement is spliced in right after "where" inside the
    parenthesised concept sub-select, so it must itself end with the ")"
    that closes that sub-select (where_sql produces it in this form).
    """
    sql = f"""
                SELECT distinct person_id, condition_start_date as date, concept_code, vocabulary_id  
                FROM 
                    (SELECT DISTINCT person_id, condition_source_concept_id, condition_source_value, condition_start_date
                        FROM `{dataset}.condition_occurrence`) AS cond 
                     INNER JOIN 
                        (SELECT DISTINCT concept_id, concept_name, concept_code, vocabulary_id 
                            FROM `{dataset}.concept` 
                            where {where_statement} as concept
                            on concept.concept_id = cond.condition_source_concept_id
    """
    return pd.read_gbq(sql, dialect="standard")

In [None]:
# Fetch every condition record whose source concept matches the acidosis codes
df_diagnostic_condition = query_by_icd(where_condition)
df_diagnostic_condition

In [None]:
# Show only the records whose vocabulary_id is ICD9CM
df_diagnostic_condition[df_diagnostic_condition['vocabulary_id'] == 'ICD9CM']

In [None]:
# Show only the records whose vocabulary_id is ICD10CM
df_diagnostic_condition[df_diagnostic_condition['vocabulary_id'] == 'ICD10CM']

## 2.4. Function to Sort Unique Patients by the Earliest Date

In [None]:
# Sort unique person ids by their earliest condition date.
def sort_unique_by_min_date(df):
    """Return one row per person_id: the row with the earliest date.

    df must contain "person_id" and "date" columns. The result is ordered by
    person_id, has a fresh 0..n-1 index and lists "person_id" as its first
    column. Rows with a missing person_id are left out.

    The complete earliest row is kept. groupby(...).first() is deliberately
    not used: it takes the first non-null value of each column separately,
    so a null in the earliest row would be filled with a value from a later
    record of the same person.
    """
    ordered = df.sort_values(["person_id", "date"])
    earliest = ordered.dropna(subset=["person_id"]).drop_duplicates(
        subset="person_id", keep="first"
    )
    # Same column layout as the previous groupby-based result: key first.
    columns = ["person_id"] + [col for col in earliest.columns if col != "person_id"]
    min_dates_diags = earliest[columns].reset_index(drop=True)
    return min_dates_diags

In [None]:
# Reduce the diagnosis records to one row per patient
min_dates_diags = sort_unique_by_min_date(df_diagnostic_condition)
min_dates_diags

# Construct Final dataset

In [None]:
# Attach the demographic columns. pd.merge defaults to an inner join, so
# patients without a row in demo_patients are dropped here.
min_dates_diags = pd.merge(min_dates_diags, demo_patients, on ="person_id")
min_dates_diags

In [None]:
# Keep only patients who also appear in the metformin medication table
# (inner join on person_id). NOTE(review): this can yield several rows per
# patient if df_med holds more than one row per person_id.
min_dates_diags = pd.merge(min_dates_diags, df_med, on ="person_id", how='inner')
min_dates_diags

In [None]:
# Number of distinct patients left after both merges
min_dates_diags['person_id'].nunique()

In [None]:
# Keep a single row per patient (the first one in the merged table)
metformin_acidosis_AG_10132023 = min_dates_diags.drop_duplicates(subset='person_id',keep='first')
metformin_acidosis_AG_10132023

In [None]:
# Renumber the rows 0..n-1 after dropping duplicates, discarding the old index
metformin_acidosis_AG_10132023 = metformin_acidosis_AG_10132023.reset_index(drop=True)
metformin_acidosis_AG_10132023

In [None]:
# Upload dataset to Google Bucket

In [None]:
# This snippet assumes you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Dataframe to upload
my_dataframe = metformin_acidosis_AG_10132023   

# Name of the csv file to store in the bucket
destination_filename = 'metformin_acidosis_AG_10132023.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# copy csv file to the bucket
args = ["gsutil", "cp", f"./{destination_filename}", f"{my_bucket}/data/"]
output = subprocess.run(args, capture_output=True)

# Stop here when the upload failed instead of silently carrying on; the
# captured gsutil messages are included in the error.
if output.returncode != 0:
    raise RuntimeError(
        f"gsutil cp failed with exit code {output.returncode}: "
        f"{output.stderr.decode('utf-8', errors='replace')}"
    )

# show the messages gsutil wrote to stderr
output.stderr

In [None]:
# This snippet assumes that you run setup first

# This code lists objects in your Google Bucket

# Get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# List objects in the bucket. The command is passed as an argument list
# (no shell=True), so the bucket name is never interpreted by a shell.
print(subprocess.check_output(["gsutil", "ls", "-r", my_bucket]).decode('utf-8'))

# Visualizations

## Clean data

In [None]:
# Keep only the demographic columns used by the plots below
cleaned_demo = metformin_acidosis_AG_10132023[['person_id','year_of_birth','gender_source_value','sex_at_birth_source_value','race_source_value', 'ethnicity_source_value']]

# Friendlier column names. person_id becomes "Count" because the
# groupby(...).count() calls further down count the entries of this column.
cleaned_demo = cleaned_demo.rename(columns={'person_id':"Count",
                                            'year_of_birth':"Age",
                                            'gender_source_value': "Gender",
                                            'sex_at_birth_source_value':"Sex at Birth",
                                            'race_source_value':"Race",
                                            'ethnicity_source_value':"Hispanic"})

# Collapse the survey answer codes: skipped answers become "Skip", all other
# non-specific answers become "Unspecified". One vectorised replace covers
# every column instead of visiting each cell individually.
answer_labels = {"PMI_Skip": "Skip"}
answer_labels.update(dict.fromkeys(["PMI_PreferNotToAnswer",
                                    "SexAtBirth_Intersex",
                                    "SexAtBirth_SexAtBirthNoneOfThese",
                                    "No matching concept",
                                    "GenderIdentity_AdditionalOptions",
                                    "GenderIdentity_GeneralizedDiffGender",
                                    "GenderIdentity_NonBinary",
                                    "GenderIdentity_Transgender"],
                                   "Unspecified"))
cleaned_demo = cleaned_demo.replace(answer_labels)

# The "Age" column still holds the year of birth at this point; turn it into
# an age in years relative to the current calendar year. Done column-wise so
# it does not depend on the index being 0..n-1.
cleaned_demo['Age'] = date.today().year - cleaned_demo['Age']

# With right=False the intervals are [0,30), [30,50), [50,70), [70,90) and
# [90,1000), which is exactly what the labels say (age 29 -> '0-29',
# age 30 -> '30-49', age 89 -> '70-89', age 90 -> '90+').
bins = [0,30,50,70,90,1000]
labels = ['0-29','30-49','50-69','70-89','90+']
cleaned_demo['Age Group'] = pd.cut(cleaned_demo['Age'], bins=bins, labels=labels, right=False)

In [None]:
# Preview the cleaned demographics table
cleaned_demo

## Sex at Birth & Gender Identity

In [None]:
# Patient counts and share of the cohort per sex-at-birth category
count_sex_gr = cleaned_demo[['Count','Sex at Birth']].groupby(['Sex at Birth'], as_index=False).count()
count_sex_gr['%'] = 100 * count_sex_gr['Count'] / len(cleaned_demo)
display(count_sex_gr)

# value_counts() orders categories by frequency, so the labels are taken from
# its index: every wedge is labelled with its own category and the chart
# works for any number of categories.
sex_counts = cleaned_demo['Sex at Birth'].value_counts()

fig1, ax1 = plt.subplots()
ax1.pie(sex_counts, labels=sex_counts.index,
        autopct='%1.1f%%', startangle=90, pctdistance=0.5)
ax1.axis('equal')
# White disc over the centre turns the pie into a donut chart.
# (`fig` keeps its previous meaning: it holds the Circle artist, not a Figure.)
fig = ax1.add_artist(plt.Circle((0,0),0.70,fc='white'))
plt.tight_layout()
plt.show()

In [None]:
# Patient counts and share of the cohort per gender category
count_gender_gr = cleaned_demo[['Count','Gender']].groupby(['Gender'], as_index=False).count()
count_gender_gr['%'] = 100 * count_gender_gr['Count'] / len(cleaned_demo)
display(count_gender_gr)

# value_counts() orders categories by frequency, so the labels are taken from
# its index: every wedge is labelled with its own category and the chart
# works for any number of categories.
gender_counts = cleaned_demo['Gender'].value_counts()

fig1, ax1 = plt.subplots()
ax1.pie(gender_counts, labels=gender_counts.index,
        autopct='%1.1f%%', startangle=90, pctdistance=0.5)
ax1.axis('equal')
# White disc over the centre turns the pie into a donut chart.
# (`fig` keeps its previous meaning: it holds the Circle artist, not a Figure.)
fig = ax1.add_artist(plt.Circle((0,0),0.70,fc='white'))
plt.tight_layout()
plt.show()

## Race and Ancestry

In [None]:
# Patient counts and share of the cohort per race category
count_race_gr = cleaned_demo[['Count','Race']].groupby(['Race'], as_index=False).count()
count_race_gr['%'] = 100 * count_race_gr['Count'] / len(cleaned_demo)
display(count_race_gr)

# Horizontal bar chart of the counts; the table computed above is reused
# instead of running the same groupby a second time.
sns.barplot(x='Count', y='Race', data=count_race_gr);

## Ethnicity

In [None]:
# Patient counts and share of the cohort per ethnicity category
count_eth_gr = cleaned_demo[['Count','Hispanic']].groupby(['Hispanic'], as_index=False).count()
count_eth_gr['%'] = 100 * count_eth_gr['Count'] / len(cleaned_demo)
display(count_eth_gr)

# Label every wedge with its own category (taken from the value_counts()
# index) so the chart can be read without the table above.
eth_counts = cleaned_demo['Hispanic'].value_counts()

fig1, ax1 = plt.subplots()
ax1.pie(eth_counts, labels=eth_counts.index,
        autopct='%1.1f%%', startangle=0, pctdistance=0.5)
ax1.axis('equal')
# White disc over the centre turns the pie into a donut chart.
# (`fig` keeps its previous meaning: it holds the Circle artist, not a Figure.)
fig = ax1.add_artist(plt.Circle((0,0),0.70,fc='white'))
plt.tight_layout()
plt.show();

## Age Distribution

In [None]:
# Patient counts and share of the cohort per age group
count_age_gr = cleaned_demo[['Count','Age Group']].groupby(['Age Group'], as_index=False).count()
count_age_gr['%'] = 100 * count_age_gr['Count'] / len(cleaned_demo)
display(count_age_gr)


# Histogram of the patients' ages
#rcParams['figure.figsize'] = (20,10)
sns.histplot(cleaned_demo['Age']);

# Minimum, quartiles and maximum of the age within each age group
summary = cleaned_demo.groupby('Age Group')['Age'].describe()[['min', '25%', '50%', '75%', 'max']]
summary