# Packages and Tables

In [None]:
import os
import numpy as np
import pandas as pd
from google.cloud import bigquery
import seaborn as sns
import matplotlib.pyplot as plt
import subprocess



# Widen the column display limit to 800 characters so long cell contents are shown.
pd.set_option('display.max_colwidth', 800)

In [None]:
# Workspace bucket path taken from the environment (None when the variable is unset).
my_bucket = os.environ.get('WORKSPACE_BUCKET')
my_bucket

In [None]:
# Read the CDR dataset identifier from the environment; a missing variable raises KeyError.
DATASET = os.environ["WORKSPACE_CDR"]
DATASET

In [None]:
# Table names, each prefixed with the dataset identifier.
cond_table = f"{DATASET}.condition_occurrence"
drug_table = f"{DATASET}.drug_exposure"
visit_table = f"{DATASET}.visit_occurrence"
person_table = f"{DATASET}.person"
concept_table = f"{DATASET}.concept"

# Names matched (case-insensitively) against concept_name in the concept lookup.
statin_list = [
    "statin",
    "atorvastatin",
    "simvastatin",
    "rosuvastatin",
    "pitavastatin",
    "fluvastatin",
    "lovastatin",
    "pravastatin",
    "lipitor",
    "zocor",
    "crestor",
    "livalo",
    "lescol",
    "mevacor",
    "pravachol",
]

# Search for patients on statins in drug_exposure table

In [None]:
# Fetch five rows of the person table; only the column names are displayed.
query = f"""

SELECT *

FROM {person_table} AS person

LIMIT 5

"""
df = pd.read_gbq(query, dialect="standard")
df.columns

In [None]:
# Fetch five rows of the drug exposure table and display them.
query = f"""

SELECT *

FROM {drug_table} AS Drug

LIMIT 5

"""
df = pd.read_gbq(query, dialect="standard")
df

In [None]:
# Show the concept rows for the four gender/race concept ids used as filters
# in the cohort queries.
query = f"""

SELECT *

FROM {concept_table} AS concept

WHERE concept_id in (1585840,1585839,1586146,1586143)

limit 20

"""
df = pd.read_gbq(query, dialect="standard")
df

In [None]:
# Find the "Drug"-domain concepts whose lower-cased name equals one of the
# statin names, then render their concept ids as a comma-separated string.
quoted_statins = ", ".join(f"'{name.lower()}'" for name in statin_list)
query = f"""

SELECT *

FROM {concept_table} AS concept

WHERE LOWER(concept_name) IN ({quoted_statins}) AND domain_id = "Drug"

"""
df = pd.read_gbq(query, dialect="standard")
drug_list = df["concept_id"].tolist()
drug_list_str = ", ".join(map(str, drug_list))
drug_list_str

In [None]:
# Statin cohort: one row per person having a drug exposure whose
# drug_source_concept_id is in drug_list_str, with the earliest such exposure
# date, gender/race source concepts, age at the latest visit and the span in
# years between first and last visit. Restricted to the two gender and two
# race concept ids listed in the WHERE clause and to age >= 18.
#
# NOTE(review): BigQuery's CAST of a FLOAT64 to an integer type appears to
# round to nearest rather than truncate, so `age` (and the >= 18 filter) may
# admit people aged 17.5+ -- confirm whether FLOOR() is wanted.
# NOTE(review): the HAVING subquery recomputes the per-person minimum exposure
# date over the same concept list already applied in WHERE, so it looks
# redundant -- confirm before removing.
# NOTE(review): drug_list_str comes from exact concept_name matches; exposures
# recorded under other (e.g. product-level) concepts are presumably not
# captured -- verify against the concept table.
sql_drug = f'''
SELECT 
    drug.person_id,
    MIN(drug.drug_exposure_start_date) AS drug_date,
    person.gender_source_concept_id AS gender,
    person.race_source_concept_id AS race,
    CAST(DATE_DIFF(MAX(visit.visit_start_date), DATE(person.birth_datetime), DAY)/365.25 AS INT) AS age,
    DATE_DIFF(MAX(visit.visit_start_date), MIN(visit.visit_start_date), YEAR) AS ehr_length
FROM  {drug_table} AS drug 
INNER JOIN  {visit_table} AS visit ON visit.person_id = drug.person_id
INNER JOIN {person_table} AS person ON person.person_id = drug.person_id
WHERE drug_source_concept_id IN ({drug_list_str})  AND 
      person.gender_source_concept_id IN (1585840,1585839) AND person.race_source_concept_id IN (1586146,1586143)
GROUP BY  
    drug.person_id, person.gender_source_concept_id, person.race_source_concept_id, person.birth_datetime
HAVING 
    MIN(drug.drug_exposure_start_date) = (
        SELECT MIN(drug_exposure_start_date)
        FROM {drug_table}
        WHERE person_id = drug.person_id
        AND drug_source_concept_id IN ({drug_list_str})
    )
    AND CAST(DATE_DIFF(MAX(visit.visit_start_date), DATE(person.birth_datetime), DAY)/365.25 AS INT) >= 18    
'''

# Run the query, then recode the gender/race concept ids to short labels and
# display the resulting DataFrame
df_drug = pd.read_gbq(sql_drug, dialect='standard')
df_drug['gender'] = df_drug['gender'].replace({1585840: "F", 1585839: "M"})
df_drug['race'] = df_drug['race'].replace({1586146: "W", 1586143: "B"})
df_drug

In [None]:
# Count the distinct (non-null) person ids in the statin cohort.
distinct_person_drug = df_drug['person_id'].nunique(dropna=True)

print(f"Number of distinct values in 'person_id': {distinct_person_drug}")

In [None]:
# Draw a 10-bin histogram of age, label it, save it to age_case.png and show it.
ax = df_drug['age'].hist(bins=10)
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
ax.set_title('Age Distribution')
plt.savefig("age_case.png")
plt.show()

In [None]:
# Summary statistics of the age column: mean, quartiles and median.
age_values = df_drug['age']

mean_age = age_values.mean()
median_age = age_values.median()  # 50th percentile
first_quantile_age = age_values.quantile(0.25)  # 25th percentile
third_quantile_age = age_values.quantile(0.75)  # 75th percentile

print(f"Mean: {mean_age}")
print(f"First Quantile: {first_quantile_age}")
print(f"Median: {median_age}")
print(f"Third Quantile: {third_quantile_age}")

# Search for ICD codes in condition_occurrence table

In [None]:
# Load the ICD codes of interest.
# The "code" column is forced to str: without it, a file holding only numeric
# codes is parsed as floats, which drops trailing zeros (e.g. "296.20") and
# produces numeric literals when the tuple is interpolated into SQL, where the
# codes are compared with the string column condition_source_value.
df_code = pd.read_csv("ICD_condition.csv", dtype={"code": str})

# Missing values are dropped so that no NaN entry ends up in the SQL IN list.
icds = tuple(df_code["code"].dropna())

In [None]:
# Display five condition rows recorded with source value '293.84'.
query = f"""

SELECT *

FROM {cond_table} 

WHERE condition_source_value = '293.84'

LIMIT 5

"""
df = pd.read_gbq(query, dialect="standard")
df

In [None]:
# ICD cohort: people whose condition_source_value is one of the codes in
# `icds`, with their earliest such diagnosis date, gender/race source
# concepts, age at the latest visit and years between first and last visit.
# Restricted to the gender/race concept ids in the WHERE clause and age >= 18.
#
# NOTE(review): BigQuery's CAST of a FLOAT64 to an integer type appears to
# round to nearest rather than truncate, so `age` may be rounded up
# (17.5 -> 18) -- confirm whether FLOOR() is wanted.

# Render the IN list explicitly. Interpolating the tuple itself yields
# "('x',)" for a single code and "()" for none, neither of which is a valid
# SQL IN list; for two or more codes the rendered text is unchanged.
if not icds:
    raise ValueError("icds is empty: cannot build the ICD code filter")
icd_in_list = "(" + ", ".join(repr(code) for code in icds) + ")"

sql_icd = f'''
SELECT 
    ICD.person_id,
    ICD.ICD_date,
    person.gender_source_concept_id AS gender,
    person.race_source_concept_id AS race,
    CAST(DATE_DIFF(MAX(visit.visit_start_date), DATE(person.birth_datetime), DAY)/365.25 AS INT) AS age,
    DATE_DIFF(MAX(visit.visit_start_date), MIN(visit.visit_start_date), YEAR) AS ehr_length
FROM  ( SELECT person_id, MIN(ICD_date) AS ICD_date
        FROM (
            SELECT 
                condition.person_id AS person_id, 
                condition.condition_source_value AS ICD_code, 
                MIN(condition.condition_start_date) AS ICD_date
            FROM 
                {cond_table} AS condition
            WHERE condition.condition_source_value IN {icd_in_list}
            GROUP BY
                condition.person_id, condition.condition_source_value
           ) AS subquery
       GROUP BY person_id) AS ICD
INNER JOIN  {visit_table} AS visit ON visit.person_id = ICD.person_id
INNER JOIN {person_table} AS person ON person.person_id = ICD.person_id
WHERE person.gender_source_concept_id IN (1585840,1585839) AND person.race_source_concept_id IN (1586146,1586143)
GROUP BY  
    ICD.person_id, ICD.ICD_date, person.gender_source_concept_id, person.race_source_concept_id, person.birth_datetime
HAVING CAST(DATE_DIFF(MAX(visit.visit_start_date), DATE(person.birth_datetime), DAY)/365.25 AS INT) >= 18 
'''

# Run the query and recode the gender/race concept ids to short labels.
df_icd = pd.read_gbq(sql_icd, dialect='standard')
df_icd['gender'] = df_icd['gender'].replace({1585840: "F", 1585839: "M"})
df_icd['race'] = df_icd['race'].replace({1586146: "W", 1586143: "B"})
df_icd

In [None]:
# Count the distinct (non-null) person ids in the ICD cohort.
distinct_person_icd = df_icd['person_id'].nunique(dropna=True)

print(f"Number of distinct values in 'person_id': {distinct_person_icd}")

# Create case cohort

In [None]:
# People present in both the statin cohort and the ICD cohort
# (inner join on person_id; both inputs are pandas DataFrames).
df_depress = df_drug.merge(df_icd, how='inner', on='person_id')

# The case cohort is the whole statin cohort, reduced to the listed columns.
columns = ["person_id", "gender", "race", "age", "ehr_length"]
df_case = df_drug.loc[:, columns]

# Display the result
df_case

In [None]:
# Show the first five merged rows.
df_depress.head(5)

In [None]:
# Count the distinct (non-null) person ids having both statin and ICD records.
distinct_person_depress = df_depress['person_id'].nunique(dropna=True)

print(f"Number of distinct values in 'person_id': {distinct_person_depress}")

In [None]:
# Share of the statin cohort that also appears in the ICD cohort.
depress_rate = distinct_person_depress / distinct_person_drug

print(f"Depress rate in Case: {depress_rate}")

# Create control cohort

In [None]:
# Person ids of the case cohort, as a tuple for interpolation into SQL.
case_id = tuple(df_case["person_id"].tolist())

In [None]:
# Control pool: every person with at least one visit who is not in the case
# cohort, with gender/race source concepts, age at the latest visit and years
# between first and last visit. Same gender/race/age restrictions as the
# case query.
#
# NOTE(review): BigQuery's CAST of a FLOAT64 to an integer type appears to
# round to nearest rather than truncate, so `age` may be rounded up
# (17.5 -> 18) -- confirm whether FLOOR() is wanted.

# Render the exclusion list explicitly. Interpolating the tuple itself yields
# "(1,)" for a single id and "()" for none, neither of which is a valid SQL
# IN list; for two or more ids the rendered text is unchanged.
if not case_id:
    raise ValueError("case_id is empty: cannot build the case exclusion filter")
case_id_in_list = "(" + ", ".join(str(pid) for pid in case_id) + ")"

sql_control = f'''
SELECT 
    visit.person_id,
    person.gender_source_concept_id AS gender,
    person.race_source_concept_id AS race,
    CAST(DATE_DIFF(MAX(visit.visit_start_date), DATE(person.birth_datetime), DAY)/365.25 AS INT) AS age,
    DATE_DIFF(MAX(visit.visit_start_date), MIN(visit.visit_start_date), YEAR) AS ehr_length
FROM {visit_table} AS visit 
INNER JOIN {person_table} AS person ON person.person_id = visit.person_id
WHERE visit.person_id NOT IN {case_id_in_list}  AND 
      person.gender_source_concept_id IN (1585840,1585839) AND person.race_source_concept_id IN (1586146,1586143)
GROUP BY  
    visit.person_id, person.gender_source_concept_id, person.race_source_concept_id, person.birth_datetime
HAVING 
     CAST(DATE_DIFF(MAX(visit.visit_start_date), DATE(person.birth_datetime), DAY)/365.25 AS INT) >= 18    
'''

# Run the query and recode the gender/race concept ids to short labels.
df_control = pd.read_gbq(sql_control, dialect='standard')
df_control['gender'] = df_control['gender'].replace({1585840: "F", 1585839: "M"})
df_control['race'] = df_control['race'].replace({1586146: "W", 1586143: "B"})
df_control

In [None]:
# Count the distinct (non-null) person ids in the control pool.
distinct_person_control = df_control['person_id'].nunique(dropna=True)

print(f"Number of distinct values in 'person_id': {distinct_person_control}")

# Save files

In [None]:
# Write the control pool to a local csv. index=False keeps the DataFrame row
# index out of the file, consistent with the bucket-upload cell that writes
# the same filename.
df_control.to_csv("control.csv", index=False)

In [None]:
# Write the case cohort to a local csv. index=False keeps the DataFrame row
# index out of the file, consistent with the bucket-upload cell that writes
# the same filename.
df_case.to_csv("case.csv", index=False)

In [None]:
# This snippet assumes you run setup first

# Writes a dataframe to a local csv file and copies that file into the
# "data" folder of the workspace Google Bucket

# THE NAME OF YOUR DATAFRAME
my_dataframe = df_control

# THE NAME of the file you're going to store in the bucket (keep the quotation marks)
destination_filename = 'control.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# write the csv next to the notebook, without the row index
my_dataframe.to_csv(destination_filename, index=False)

# bucket name from the environment
my_bucket = os.environ.get('WORKSPACE_BUCKET')

# run `gsutil cp <local file> <bucket>/data/`, capturing stdout and stderr
args = ["gsutil", "cp", f"./{destination_filename}", f"{my_bucket}/data/"]
output = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# show what gsutil wrote to stderr
output.stderr


In [None]:
# This snippet assumes you run setup first

# Writes a dataframe to a local csv file and copies that file into the
# "data" folder of the workspace Google Bucket

# THE NAME OF YOUR DATAFRAME
my_dataframe = df_case

# THE NAME of the file you're going to store in the bucket (keep the quotation marks)
destination_filename = 'case.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# write the csv next to the notebook, without the row index
my_dataframe.to_csv(destination_filename, index=False)

# bucket name from the environment
my_bucket = os.environ.get('WORKSPACE_BUCKET')

# run `gsutil cp <local file> <bucket>/data/`, capturing stdout and stderr
args = ["gsutil", "cp", f"./{destination_filename}", f"{my_bucket}/data/"]
output = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# show what gsutil wrote to stderr
output.stderr

# Depression rate in matched case and control (after P2)

In [None]:
# This snippet assumes you run setup first

# This code copies a file from your Google Bucket and loads it into a dataframe

# Replace 'test.csv' with THE NAME of the file you're going to download from the bucket (don't delete the quotation marks)
name_of_file_in_bucket = 'matched_controls.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# copy csv file from the bucket to the current working space; an argument
# list (as in the upload cells) avoids building a shell command string
args = ["gsutil", "cp", f"{my_bucket}/data/{name_of_file_in_bucket}", "."]
output = subprocess.run(args, capture_output=True)

# report success only when gsutil exited cleanly; previously the success
# message was printed even when the copy had failed
if output.returncode == 0:
    print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space')
else:
    print(f'[ERROR] gsutil could not download {name_of_file_in_bucket}: {output.stderr.decode(errors="replace")}')

# load the csv file from the working space into a dataframe
matched_controls = pd.read_csv(name_of_file_in_bucket)
matched_controls


In [None]:
# This snippet assumes you run setup first

# This code copies a file from your Google Bucket and loads it into a dataframe

# Replace 'test.csv' with THE NAME of the file you're going to download from the bucket (don't delete the quotation marks)
name_of_file_in_bucket = 'matched_cases.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# copy csv file from the bucket to the current working space; an argument
# list (as in the upload cells) avoids building a shell command string
args = ["gsutil", "cp", f"{my_bucket}/data/{name_of_file_in_bucket}", "."]
output = subprocess.run(args, capture_output=True)

# report success only when gsutil exited cleanly; previously the success
# message was printed even when the copy had failed
if output.returncode == 0:
    print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space')
else:
    print(f'[ERROR] gsutil could not download {name_of_file_in_bucket}: {output.stderr.decode(errors="replace")}')

# load the csv file from the working space into a dataframe
matched_cases = pd.read_csv(name_of_file_in_bucket)
matched_cases

In [None]:
# Count the distinct (non-null) person ids among the matched controls.
distinct_matched_control = matched_controls['person_id'].nunique(dropna=True)

print(f"Number of distinct values in 'person_id': {distinct_matched_control}")

In [None]:
# Count the distinct (non-null) person ids among the matched cases.
distinct_matched_case = matched_cases['person_id'].nunique(dropna=True)

print(f"Number of distinct values in 'person_id': {distinct_matched_case}")

In [None]:
# Person ids of the matched controls, as a tuple for interpolation into SQL.
matched_control_id = tuple(matched_controls["person_id"].tolist())

In [None]:
# Condition rows of matched controls whose source value is one of the ICD
# codes; one row is returned per matching condition record.

# Render both IN lists explicitly. Interpolating a tuple itself yields a
# trailing comma for a single element and "()" for none, neither of which is
# a valid SQL IN list; for two or more elements the rendered text is unchanged.
if not icds:
    raise ValueError("icds is empty: cannot build the ICD code filter")
if not matched_control_id:
    raise ValueError("matched_control_id is empty: cannot build the person filter")
icd_in_list = "(" + ", ".join(repr(code) for code in icds) + ")"
control_id_in_list = "(" + ", ".join(str(pid) for pid in matched_control_id) + ")"

sql_control_depress = f'''
  SELECT subquery.person_id
  FROM (
        SELECT 
            condition.person_id AS person_id, 
            condition.condition_source_value AS ICD_code
        FROM 
            {cond_table} AS condition
        WHERE 
        condition.condition_source_value IN {icd_in_list}) As subquery
   WHERE subquery.person_id IN {control_id_in_list}
'''
df_control_depress = pd.read_gbq(sql_control_depress, dialect='standard')
df_control_depress

In [None]:
# Count the distinct (non-null) matched controls having a qualifying ICD code.
distinct_depress_control = df_control_depress['person_id'].nunique(dropna=True)

print(f"Number of distinct values in 'person_id': {distinct_depress_control}")

In [None]:
# Share of matched controls having a qualifying ICD code.
depress_control_rate = distinct_depress_control / distinct_matched_control

print(f"Depress rate in Control: {depress_control_rate}")

In [None]:
# Person ids of the matched cases, as a tuple for interpolation into SQL.
matched_case_id = tuple(matched_cases["person_id"].tolist())

In [None]:
# Condition rows of matched cases whose source value is one of the ICD
# codes; one row is returned per matching condition record.

# Render both IN lists explicitly. Interpolating a tuple itself yields a
# trailing comma for a single element and "()" for none, neither of which is
# a valid SQL IN list; for two or more elements the rendered text is unchanged.
if not icds:
    raise ValueError("icds is empty: cannot build the ICD code filter")
if not matched_case_id:
    raise ValueError("matched_case_id is empty: cannot build the person filter")
icd_in_list = "(" + ", ".join(repr(code) for code in icds) + ")"
case_id_in_list = "(" + ", ".join(str(pid) for pid in matched_case_id) + ")"

sql_case_depress = f'''
  SELECT subquery.person_id
  FROM (
        SELECT 
            condition.person_id AS person_id, 
            condition.condition_source_value AS ICD_code
        FROM 
            {cond_table} AS condition
        WHERE 
        condition.condition_source_value IN {icd_in_list}) As subquery
   WHERE subquery.person_id IN {case_id_in_list}
'''
df_case_depress = pd.read_gbq(sql_case_depress, dialect='standard')
df_case_depress

In [None]:
# Matched cases whose earliest qualifying ICD date is strictly later than
# their earliest statin exposure date; returns person id and both dates.
#
# NOTE(review): the "_before" suffix does not match the filter
# `ICD_date > drug_date` (diagnosis AFTER first statin) -- confirm which
# direction is intended; the name may mean "statin before diagnosis".

# Render both IN lists explicitly. Interpolating a tuple itself yields a
# trailing comma for a single element and "()" for none, neither of which is
# a valid SQL IN list; for two or more elements the rendered text is unchanged.
if not icds:
    raise ValueError("icds is empty: cannot build the ICD code filter")
if not matched_case_id:
    raise ValueError("matched_case_id is empty: cannot build the person filter")
icd_in_list = "(" + ", ".join(repr(code) for code in icds) + ")"
case_id_in_list = "(" + ", ".join(str(pid) for pid in matched_case_id) + ")"

sql_case_depress_before = f'''
    SELECT 
        depress.person_id,
        depress.ICD_date,
        statin.drug_date
    FROM (
        SELECT 
            ICD.person_id,
            ICD.ICD_date
        FROM (
            SELECT 
                condition.person_id, 
                MIN(condition.condition_start_date) AS ICD_date
            FROM 
                {cond_table} AS condition
            WHERE 
                condition.condition_source_value IN {icd_in_list}
            GROUP BY 
                condition.person_id
        ) AS ICD
    ) AS depress
    INNER JOIN (
        SELECT 
            drug.person_id,
            MIN(drug.drug_exposure_start_date) AS drug_date
        FROM 
            {drug_table} AS drug
        WHERE 
            drug.drug_source_concept_id IN ({drug_list_str})
        GROUP BY 
            drug.person_id
        HAVING 
            MIN(drug.drug_exposure_start_date) = (
                SELECT 
                    MIN(drug_exposure_start_date)
                FROM 
                    {drug_table}
                WHERE 
                    person_id = drug.person_id AND 
                    drug_source_concept_id IN ({drug_list_str}) -- Adjusted: Reference the specific drug concept within subquery
            )
    ) AS statin
    ON statin.person_id = depress.person_id
    WHERE 
        statin.person_id IN {case_id_in_list} AND depress.ICD_date > statin.drug_date
'''
df_case_depress_before = pd.read_gbq(sql_case_depress_before, dialect='standard')
df_case_depress_before

In [None]:
# Count the distinct (non-null) matched cases having a qualifying ICD code.
distinct_depress_case = df_case_depress['person_id'].nunique(dropna=True)

print(f"Number of distinct values in 'person_id': {distinct_depress_case}")

In [None]:
# Count the distinct (non-null) person ids returned by the date-restricted case query.
distinct_depress_case_before = df_case_depress_before['person_id'].nunique(dropna=True)

print(f"Number of distinct values in 'person_id': {distinct_depress_case_before}")

In [None]:
# Share of matched cases having a qualifying ICD code.
depress_case_rate = distinct_depress_case / distinct_matched_case

print(f"Depress rate in Case: {depress_case_rate}")

In [None]:
# Share of matched cases returned by the date-restricted case query.
depress_case_rate_before = distinct_depress_case_before / distinct_matched_case

print(f"Depress rate in Case: {depress_case_rate_before}")