In [None]:
import pandas as pd
import numpy as np
import scipy
import sklearn
import matplotlib as mpl
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import subprocess

In [None]:
# Workspace settings come from the environment.
# `bucket` is None when WORKSPACE_BUCKET is unset; a missing WORKSPACE_CDR raises KeyError.
bucket = os.environ.get("WORKSPACE_BUCKET")
CDR = os.environ["WORKSPACE_CDR"]

In [None]:
# Show the value read from WORKSPACE_BUCKET (None when the variable is unset).
bucket

In [None]:
# Show the value read from WORKSPACE_CDR; the queries in this notebook use it to qualify table names.
CDR

Pull the negative COVID test cohort

In [None]:
# Query returning one row per participant: person_id and neg_test_idx.
#   * Negative_Results / COVID_Tests: the listed concept ids, looked up in the concept table.
#   * COVID_Lab_Index: measurement rows whose measurement_concept_id is in COVID_Tests and
#     whose value_as_concept_id is in Negative_Results, reduced to min(measurement_date)
#     per person_id.
negative_cohort_sql = f"""
with Negative_Results as (
select concept_id from `{CDR}`.concept
where concept_id IN (45880296, 45878583, 45884153, 45884092, 45878245, 4305306)
),
COVID_Tests as (
select concept_id from `{CDR}`.concept
where concept_id IN (586520,586523,586525,586526,586529,706157,706159,715261,715272,723470,723472,757678,36032061,36032174,36032258,36661371,586518,586524,706154,706175,723464,723467,723478,36031453,586516,706158,706160,706163,706171,706172,715260,723469,36031213,36661377,586528,706161,706165,706167,723463,723468,723471,757677,36031238,36031944,586519,706166,706169,706173,723465,723476,757685,36031506,706155,706156,706170,723466,36031652,36661370,706168,706174,715262,723477,36032419,36661378,37310257)
),
COVID_Lab_Index as (
SELECT m.person_id, min(m.measurement_date) as neg_test_idx
FROM `{CDR}`.measurement m JOIN COVID_Tests p on m.measurement_concept_id = p.concept_id
WHERE m.value_as_concept_id in (select distinct concept_id from Negative_Results)
GROUP BY m.person_id
)
select * from COVID_Lab_Index
"""

In [None]:
# Run the negative-test query and load the result into a DataFrame.
negative_cohort_df = pd.read_gbq(
    negative_cohort_sql,
    dialect="standard",
    # The BigQuery Storage API is used only when the environment variable is present.
    use_bqstorage_api="BIGQUERY_STORAGE_API_ENABLED" in os.environ,
    progress_bar_type="tqdm_notebook",
)

In [None]:
# Display the query result (columns person_id and neg_test_idx, per negative_cohort_sql).
negative_cohort_df

In [None]:
# Keep only the rows whose person_id does not occur in positive_aou_cohort.
# NOTE(review): positive_aou_cohort is first assigned in a later cell (read from
# 'n3c_aou_cohort.csv'), so on a fresh kernel this cell raises NameError unless
# that cell has been run beforehand.
negative_cohort_df = negative_cohort_df[(~negative_cohort_df.person_id.isin(positive_aou_cohort.person_id))]

In [None]:
# Names of the wearable-device tables queried in this notebook.
wearable_data_tables = [
    'activity_summary',
    'heart_rate_minute_level',
    'heart_rate_summary',
    'steps_intraday',
    'sleep_level',
    'sleep_daily_summary',
]

In [None]:
# Query returning every person_id that appears in at least one of the six
# wearable tables: each CTE takes the distinct person_ids of one table and the
# final statement combines them with UNION DISTINCT.
wearable_data_cohort_sql = f"""
with activity_summary_cohort as
(select distinct person_id from `{CDR}`.activity_summary),
hrml_cohort as
(select distinct person_id from `{CDR}`.heart_rate_minute_level),
hrs_cohort as
(select distinct person_id from `{CDR}`.heart_rate_summary),
steps_intraday_cohort as
(select distinct person_id from `{CDR}`.steps_intraday),
sleep_level_cohort as
(select distinct person_id from `{CDR}`.sleep_level),
sleep_daily_summary_cohort as
(select distinct person_id from `{CDR}`.sleep_daily_summary)
select person_id from activity_summary_cohort
union distinct 
select person_id from hrml_cohort
union distinct 
select person_id from hrs_cohort
union distinct 
select person_id from steps_intraday_cohort
union distinct 
select person_id from sleep_level_cohort
union distinct
select person_id from sleep_daily_summary_cohort
"""

In [None]:
# Run the wearable-cohort query and load the result into a DataFrame.
wearable_data_cohort_df = pd.read_gbq(
    wearable_data_cohort_sql,
    dialect="standard",
    # The BigQuery Storage API is used only when the environment variable is present.
    use_bqstorage_api="BIGQUERY_STORAGE_API_ENABLED" in os.environ,
    progress_bar_type="tqdm_notebook",
)

In [None]:
# (rows, columns) of the wearable cohort frame.
wearable_data_cohort_df.shape

In [None]:
# Display the wearable cohort frame (a single person_id column, per wearable_data_cohort_sql).
wearable_data_cohort_df

In [None]:
# Inner join of the negative-test cohort with the wearable cohort. No key is
# given, so pandas joins on the columns the two frames share.
negative_covid_cohort_with_wearable_data = pd.merge(
    negative_cohort_df,
    wearable_data_cohort_df,
    how='inner',
)

In [None]:
# (rows, columns) of the joined cohort.
negative_covid_cohort_with_wearable_data.shape

In [None]:
# Cohort person_ids as a tuple; the queries below interpolate it into their
# `person_id in {people}` filters.
people = tuple(negative_covid_cohort_with_wearable_data.loc[:, 'person_id'])

In [None]:
# Fetch every activity_summary row for the cohort.
# The id list is rendered explicitly: interpolating the tuple itself yields
# "(123,)" for a one-person cohort, and that trailing comma is not valid SQL.
people_sql_list = ", ".join(str(pid) for pid in people)
steps_summary_sql = f"""select * from `{CDR}`.activity_summary where person_id in ({people_sql_list}) """
steps_summary_df = pd.read_gbq(
    steps_summary_sql,
    dialect="standard",
    use_bqstorage_api="BIGQUERY_STORAGE_API_ENABLED" in os.environ,
    progress_bar_type="tqdm_notebook",
)

In [None]:
# Display the activity_summary rows fetched for the cohort.
steps_summary_df

In [None]:
# Convert both date columns to pandas datetimes; the per-participant loop that
# follows compares activity dates against offsets of neg_test_idx.
steps_summary_df['date'] = steps_summary_df['date'].pipe(pd.to_datetime)

negative_covid_cohort_with_wearable_data['neg_test_idx'] = (
    negative_covid_cohort_with_wearable_data['neg_test_idx'].pipe(pd.to_datetime)
)

In [None]:
from datetime import datetime
from dateutil.relativedelta import relativedelta
from statistics import median

# Parse the dates once on a derived frame and split the rows by participant up
# front. Assigning a column on a filtered slice inside the loop triggers
# SettingWithCopyWarning, and re-filtering the whole table for every cohort
# member makes the loop quadratic.
steps_by_person = {
    pid: rows
    for pid, rows in steps_summary_df.assign(
        date=pd.to_datetime(steps_summary_df['date'])
    ).groupby('person_id')
}

# One record per participant that has activity rows:
# [person_id, negative test date, median steps before, median steps after].
steps_stat = []

for person_id, min_covid_dt in zip(
    negative_covid_cohort_with_wearable_data['person_id'],
    negative_covid_cohort_with_wearable_data['neg_test_idx'],
):
    p_data = steps_by_person.get(person_id)
    if p_data is None:
        # No activity_summary rows for this participant: no record is emitted.
        continue

    # "Before" = strictly earlier than 7 days before the test date;
    # "after" = strictly later than 28 days after it.
    min_covid_dt_minus_7 = min_covid_dt - relativedelta(days=7)
    min_covid_dt_plus_28 = min_covid_dt + relativedelta(days=28)

    prev_steps = sorted(p_data.loc[p_data['date'] < min_covid_dt_minus_7, 'steps'])
    next_steps = sorted(p_data.loc[p_data['date'] > min_covid_dt_plus_28, 'steps'])

    # NOTE(review): 0 is the placeholder when a window has no rows, so it is
    # indistinguishable from a genuine median of 0 — confirm downstream handling.
    p_m = median(prev_steps) if prev_steps else 0
    n_m = median(next_steps) if next_steps else 0

    steps_stat.append([person_id, min_covid_dt, p_m, n_m])

In [None]:
# Tabulate the per-participant step medians collected in steps_stat.
steps_stat_df = pd.DataFrame(
    steps_stat,
    columns=[
        'person_id',
        'neg_covid_dt',
        'before_neg_covid_test_median_steps',
        'after_negative_covid_test_median_steps',
    ],
)

In [None]:
# Display the per-participant step medians.
steps_stat_df

In [None]:
destination_filename = 'negative_cohort_steps_median.csv'

# Write the frame to a CSV file in the notebook's working directory.
steps_stat_df.to_csv(destination_filename, index=False)

# Raise KeyError if the bucket variable is unset; os.getenv would return None
# and the copy target would silently become the literal path "None/data/".
my_bucket = os.environ['WORKSPACE_BUCKET']

# Copy the CSV file into the bucket's data/ folder.
args = ["gsutil", "cp", f"./{destination_filename}", f"{my_bucket}/data/"]
output = subprocess.run(args, capture_output=True)

# Stop here if the upload failed instead of carrying on as if it had worked.
if output.returncode != 0:
    raise RuntimeError(
        f"gsutil cp of {destination_filename} failed: "
        f"{output.stderr.decode(errors='replace')}"
    )

# Show what gsutil wrote to stderr.
output.stderr

In [None]:
# Mean heart_rate_value per participant and calendar date, computed in BigQuery.
# The id list is rendered explicitly: interpolating the tuple itself yields
# "(123,)" for a one-person cohort, and that trailing comma is not valid SQL.
people_sql_list = ", ".join(str(pid) for pid in people)
hrml_sql = f"""select person_id, date(datetime) as date, avg(heart_rate_value) as average_heart_rate from `{CDR}`.heart_rate_minute_level where person_id in ({people_sql_list}) group by 1,2"""
hrml_df = pd.read_gbq(
    hrml_sql,
    dialect="standard",
    use_bqstorage_api="BIGQUERY_STORAGE_API_ENABLED" in os.environ,
    progress_bar_type="tqdm_notebook",
)

In [None]:
from datetime import datetime
from dateutil.relativedelta import relativedelta
from statistics import median

# Parse the dates once on a derived frame and split the rows by participant up
# front. Assigning a column on a filtered slice inside the loop triggers
# SettingWithCopyWarning, and re-filtering the whole table for every cohort
# member makes the loop quadratic.
hrml_by_person = {
    pid: rows
    for pid, rows in hrml_df.assign(
        date=pd.to_datetime(hrml_df['date'])
    ).groupby('person_id')
}

# One record per participant that has heart-rate rows:
# [person_id, negative test date, median daily average before, median after].
heart_rate_minute_level_stat = []

for person_id, min_covid_dt in zip(
    negative_covid_cohort_with_wearable_data['person_id'],
    negative_covid_cohort_with_wearable_data['neg_test_idx'],
):
    p_data = hrml_by_person.get(person_id)
    if p_data is None:
        # No heart-rate rows for this participant: no record is emitted.
        continue

    # "Before" = strictly earlier than 7 days before the test date;
    # "after" = strictly later than 28 days after it.
    min_covid_dt_minus_7 = min_covid_dt - relativedelta(days=7)
    min_covid_dt_plus_28 = min_covid_dt + relativedelta(days=28)

    prev_hr = sorted(p_data.loc[p_data['date'] < min_covid_dt_minus_7, 'average_heart_rate'])
    next_hr = sorted(p_data.loc[p_data['date'] > min_covid_dt_plus_28, 'average_heart_rate'])

    # NOTE(review): 0 is the placeholder when a window has no rows — confirm
    # that downstream analysis does not read it as a measured heart rate.
    p_m = median(prev_hr) if prev_hr else 0
    n_m = median(next_hr) if next_hr else 0

    heart_rate_minute_level_stat.append([person_id, min_covid_dt, p_m, n_m])

In [None]:
# Tabulate the per-participant heart-rate medians collected in heart_rate_minute_level_stat.
heart_rate_minute_level_df = pd.DataFrame(
    heart_rate_minute_level_stat,
    columns=[
        'person_id',
        'neg_covid_dt',
        'before_neg_covid_avg_heart_rate',
        'after_neg_covid_avg_heart_rate',
    ],
)

In [None]:
destination_filename = 'negative_cohort_avg_heart_rate_median.csv'

# Write the frame to a CSV file in the notebook's working directory.
heart_rate_minute_level_df.to_csv(destination_filename, index=False)

# Raise KeyError if the bucket variable is unset; os.getenv would return None
# and the copy target would silently become the literal path "None/data/".
my_bucket = os.environ['WORKSPACE_BUCKET']

# Copy the CSV file into the bucket's data/ folder.
args = ["gsutil", "cp", f"./{destination_filename}", f"{my_bucket}/data/"]
output = subprocess.run(args, capture_output=True)

# Stop here if the upload failed instead of carrying on as if it had worked.
if output.returncode != 0:
    raise RuntimeError(
        f"gsutil cp of {destination_filename} failed: "
        f"{output.stderr.decode(errors='replace')}"
    )

# Show what gsutil wrote to stderr.
output.stderr

In [None]:
# Fetch every heart_rate_summary row for the cohort.
# The id list is rendered explicitly: interpolating the tuple itself yields
# "(123,)" for a one-person cohort, and that trailing comma is not valid SQL.
people_sql_list = ", ".join(str(pid) for pid in people)
hr_summary_sql = f"""select * from `{CDR}`.heart_rate_summary where person_id in ({people_sql_list}) """
hr_summary_df = pd.read_gbq(
    hr_summary_sql,
    dialect="standard",
    use_bqstorage_api="BIGQUERY_STORAGE_API_ENABLED" in os.environ,
    progress_bar_type="tqdm_notebook",
)

In [None]:
from datetime import datetime
from dateutil.relativedelta import relativedelta
from statistics import median

# Parse the dates once on a derived frame and split the rows by participant up
# front. Assigning a column on a filtered slice inside the loop triggers
# SettingWithCopyWarning, and re-filtering the whole table for every cohort
# member makes the loop quadratic.
hr_summary_by_person = {
    pid: rows
    for pid, rows in hr_summary_df.assign(
        date=pd.to_datetime(hr_summary_df['date'])
    ).groupby('person_id')
}

# One record per participant that has heart_rate_summary rows:
# [person_id, negative test date, median min HR before, median max HR before,
#  median min HR after, median max HR after].
heart_rate_min_max_stat = []

for person_id, min_covid_dt in zip(
    negative_covid_cohort_with_wearable_data['person_id'],
    negative_covid_cohort_with_wearable_data['neg_test_idx'],
):
    p_data = hr_summary_by_person.get(person_id)
    if p_data is None:
        # No heart_rate_summary rows for this participant: no record is emitted.
        continue

    # "Before" = strictly earlier than 7 days before the test date;
    # "after" = strictly later than 28 days after it.
    min_covid_dt_minus_7 = min_covid_dt - relativedelta(days=7)
    min_covid_dt_plus_28 = min_covid_dt + relativedelta(days=28)

    prev_df = p_data[p_data['date'] < min_covid_dt_minus_7]
    next_df = p_data[p_data['date'] > min_covid_dt_plus_28]

    prev_min_hr = sorted(prev_df['min_heart_rate'])
    prev_max_hr = sorted(prev_df['max_heart_rate'])
    next_min_hr = sorted(next_df['min_heart_rate'])
    next_max_hr = sorted(next_df['max_heart_rate'])

    # NOTE(review): 0 is the placeholder when a window has no rows — confirm
    # that downstream analysis does not read it as a measured heart rate.
    p_min_m = median(prev_min_hr) if prev_min_hr else 0
    p_max_m = median(prev_max_hr) if prev_max_hr else 0
    n_min_m = median(next_min_hr) if next_min_hr else 0
    n_max_m = median(next_max_hr) if next_max_hr else 0

    heart_rate_min_max_stat.append([person_id, min_covid_dt, p_min_m, p_max_m, n_min_m, n_max_m])

In [None]:
# Tabulate the per-participant min/max heart-rate medians collected in heart_rate_min_max_stat.
heart_rate_min_max_df = pd.DataFrame(
    heart_rate_min_max_stat,
    columns=[
        'person_id',
        'neg_covid_dt',
        'before_neg_covid_min_heart_rate',
        'before_neg_covid_max_heart_rate',
        'after_neg_covid_min_heart_rate',
        'after_neg_covid_max_heart_rate',
    ],
)

In [None]:
destination_filename = 'negative_cohort_heart_rate_min_max_median.csv'

# Write the frame to a CSV file in the notebook's working directory.
heart_rate_min_max_df.to_csv(destination_filename, index=False)

# Raise KeyError if the bucket variable is unset; os.getenv would return None
# and the copy target would silently become the literal path "None/data/".
my_bucket = os.environ['WORKSPACE_BUCKET']

# Copy the CSV file into the bucket's data/ folder.
args = ["gsutil", "cp", f"./{destination_filename}", f"{my_bucket}/data/"]
output = subprocess.run(args, capture_output=True)

# The result is not displayed in this cell, so a failed upload would otherwise
# go unnoticed.
if output.returncode != 0:
    raise RuntimeError(
        f"gsutil cp of {destination_filename} failed: "
        f"{output.stderr.decode(errors='replace')}"
    )

In [None]:
# Fetch every sleep_daily_summary row for the cohort.
# The id list is rendered explicitly: interpolating the tuple itself yields
# "(123,)" for a one-person cohort, and that trailing comma is not valid SQL.
people_sql_list = ", ".join(str(pid) for pid in people)
sleep_summary_sql = f"""select * from `{CDR}`.sleep_daily_summary where person_id in ({people_sql_list}) """
sleep_summary_df = pd.read_gbq(
    sleep_summary_sql,
    dialect="standard",
    use_bqstorage_api="BIGQUERY_STORAGE_API_ENABLED" in os.environ,
    progress_bar_type="tqdm_notebook",
)

In [None]:
from datetime import datetime
from dateutil.relativedelta import relativedelta
from statistics import median

# Parse the dates once on a derived frame and split the rows by participant up
# front. Assigning a column on a filtered slice inside the loop triggers
# SettingWithCopyWarning, and re-filtering the whole table for every cohort
# member makes the loop quadratic.
sleep_by_person = {
    pid: rows
    for pid, rows in sleep_summary_df.assign(
        sleep_date=pd.to_datetime(sleep_summary_df['sleep_date'])
    ).groupby('person_id')
}

# One record per participant that has sleep rows:
# [person_id, negative test date, median minute_asleep before, median after].
sleep_summary_stat = []

for person_id, min_covid_dt in zip(
    negative_covid_cohort_with_wearable_data['person_id'],
    negative_covid_cohort_with_wearable_data['neg_test_idx'],
):
    p_data = sleep_by_person.get(person_id)
    if p_data is None:
        # No sleep_daily_summary rows for this participant: no record is emitted.
        continue

    # "Before" = strictly earlier than 7 days before the test date;
    # "after" = strictly later than 28 days after it.
    min_covid_dt_minus_7 = min_covid_dt - relativedelta(days=7)
    min_covid_dt_plus_28 = min_covid_dt + relativedelta(days=28)

    prev_asleep = sorted(p_data.loc[p_data['sleep_date'] < min_covid_dt_minus_7, 'minute_asleep'])
    next_asleep = sorted(p_data.loc[p_data['sleep_date'] > min_covid_dt_plus_28, 'minute_asleep'])

    # NOTE(review): 0 is the placeholder when a window has no rows — confirm
    # that downstream analysis does not read it as zero minutes of sleep.
    p_m = median(prev_asleep) if prev_asleep else 0
    n_m = median(next_asleep) if next_asleep else 0

    sleep_summary_stat.append([person_id, min_covid_dt, p_m, n_m])

In [None]:
# Tabulate the per-participant sleep medians collected in sleep_summary_stat.
sleep_summary_stat_df = pd.DataFrame(
    sleep_summary_stat,
    columns=[
        'person_id',
        'neg_covid_dt',
        'before_neg_covid_median_minutes_asleep',
        'after_neg_covid_median_minutes_asleep',
    ],
)

In [None]:
destination_filename = 'negative_cohort_sleep_summary_stat_df.csv'

# Write the frame to a CSV file in the notebook's working directory.
sleep_summary_stat_df.to_csv(destination_filename, index=False)

# Raise KeyError if the bucket variable is unset; os.getenv would return None
# and the copy target would silently become the literal path "None/data/".
my_bucket = os.environ['WORKSPACE_BUCKET']

# Copy the CSV file into the bucket's data/ folder.
args = ["gsutil", "cp", f"./{destination_filename}", f"{my_bucket}/data/"]
output = subprocess.run(args, capture_output=True)

# The result is not displayed in this cell, so a failed upload would otherwise
# go unnoticed.
if output.returncode != 0:
    raise RuntimeError(
        f"gsutil cp of {destination_filename} failed: "
        f"{output.stderr.decode(errors='replace')}"
    )

In [None]:
# Average number of hours per day with recorded steps, per participant:
#   * innermost query: one row per (person, date, hour), has_hour = 1 when the
#     steps in that hour sum to more than 0, else 0;
#   * middle query: hours_with_fitbit = sum of has_hour per (person, date);
#   * outer query: average_weartime = mean of hours_with_fitbit per person.
# The id list is rendered explicitly: interpolating the tuple itself yields
# "(123,)" for a one-person cohort, and that trailing comma is not valid SQL.
people_sql_list = ", ".join(str(pid) for pid in people)
fitbit_wore_time_sql = f"""SELECT person_id, AVG(hours_with_fitbit) as average_weartime from
(SELECT person_id, date, SUM(has_hour) AS hours_with_fitbit FROM (SELECT person_id, CAST(datetime AS DATE) AS date, 
IF(SUM(steps)>0, 1, 0) AS has_hour FROM `{CDR}`.steps_intraday where person_id in ({people_sql_list}) GROUP BY CAST(datetime AS DATE), EXTRACT(HOUR FROM datetime), person_id) t GROUP BY date, person_id)
GROUP BY person_id"""

fitbit_wore_time_df = pd.read_gbq(
    fitbit_wore_time_sql,
    dialect="standard",
    use_bqstorage_api="BIGQUERY_STORAGE_API_ENABLED" in os.environ,
    progress_bar_type="tqdm_notebook",
)

In [None]:
destination_filename = 'negative_cohort_fitbit_avg_weartime.csv'

# Write the frame to a CSV file in the notebook's working directory.
fitbit_wore_time_df.to_csv(destination_filename, index=False)

# Raise KeyError if the bucket variable is unset; os.getenv would return None
# and the copy target would silently become the literal path "None/data/".
my_bucket = os.environ['WORKSPACE_BUCKET']

# Copy the CSV file into the bucket's data/ folder.
args = ["gsutil", "cp", f"./{destination_filename}", f"{my_bucket}/data/"]
output = subprocess.run(args, capture_output=True)

# The result is not displayed in this cell, so a failed upload would otherwise
# go unnoticed.
if output.returncode != 0:
    raise RuntimeError(
        f"gsutil cp of {destination_filename} failed: "
        f"{output.stderr.decode(errors='replace')}"
    )

In [None]:
# Download a CSV from the workspace bucket's data/ folder and load it into
# positive_aou_cohort.
# NOTE(review): an earlier cell already filters negative_cohort_df against
# positive_aou_cohort, so this cell has to run before that one on a fresh kernel.

# Name of the file to fetch from the bucket.
name_of_file_in_bucket = 'n3c_aou_cohort.csv'

# Raise KeyError if the bucket variable is unset rather than building a
# "None/data/..." source path.
my_bucket = os.environ['WORKSPACE_BUCKET']

# Copy the file from the bucket into the current working directory. An argv
# list is used so the bucket and file names are never interpreted by a shell.
download = subprocess.run(
    ["gsutil", "cp", f"{my_bucket}/data/{name_of_file_in_bucket}", "."],
    capture_output=True,
)

# Only report success when gsutil actually succeeded.
if download.returncode != 0:
    raise RuntimeError(
        f"gsutil cp of {name_of_file_in_bucket} failed: "
        f"{download.stderr.decode(errors='replace')}"
    )

print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space')

# Load the downloaded CSV and preview its first rows.
positive_aou_cohort = pd.read_csv(name_of_file_in_bucket)
positive_aou_cohort.head()


In [None]:
# List the objects in the workspace bucket, recursively.

# Raise KeyError if the bucket variable is unset rather than listing "None".
my_bucket = os.environ['WORKSPACE_BUCKET']

# An argv list replaces the shell=True string so the bucket name is never
# interpreted by a shell; check_output still raises if gsutil fails.
print(subprocess.check_output(["gsutil", "ls", "-r", my_bucket]).decode('utf-8'))




No COVID Test Cohort

In [None]:
# Query returning each person_id that has measurement rows but no COVID test
# among them.
#   * COVID_Tests: the listed test concept ids, looked up in the concept table.
#   * COVID_Tested: every person_id with at least one measurement whose concept
#     is in COVID_Tests.
#   * COVID_Lab_Index: distinct person_ids from measurement that are absent
#     from COVID_Tested. Filtering row by row on "concept not in COVID_Tests"
#     would return one row per unrelated measurement and would still include
#     participants who were tested.
#   * u07_any: `not in` needs a parenthesised list, so the bare
#     `not in 37311061` is written as `not in (37311061)`.
# NOTE(review): u07_any is not referenced by the final SELECT, and as written it
# keeps anyone with a condition other than 37311061. Presumably the intent is to
# exclude participants who have that condition — confirm before using it.
no_covid_cohort_sql = f"""
with COVID_Tests as (
select concept_id from `{CDR}`.concept
where concept_id IN (586520,586523,586525,586526,586529,706157,706159,715261,715272,723470,723472,757678,36032061,36032174,36032258,36661371,586518,586524,706154,706175,723464,723467,723478,36031453,586516,706158,706160,706163,706171,706172,715260,723469,36031213,36661377,586528,706161,706165,706167,723463,723468,723471,757677,36031238,36031944,586519,706166,706169,706173,723465,723476,757685,36031506,706155,706156,706170,723466,36031652,36661370,706168,706174,715262,723477,36032419,36661378,37310257)
),
u07_any as (
SELECT co.person_id
FROM `{CDR}`.condition_occurrence co 
WHERE co.condition_concept_id not in (37311061)
GROUP BY co.person_id
),
COVID_Tested as (
SELECT DISTINCT m.person_id
FROM `{CDR}`.measurement m JOIN COVID_Tests p on m.measurement_concept_id = p.concept_id
),
COVID_Lab_Index as (
SELECT DISTINCT m.person_id
FROM `{CDR}`.measurement m
WHERE m.person_id not in (select person_id from COVID_Tested)
)
select * from COVID_Lab_Index
"""