In [1]:
from IPython.display import JSON
from truveta.study import Client
client = Client()

import pathlib #this is a package for finding filepaths. 
import os
import pandas as pd 
import numpy as np 
from datetime import date
 
# Returns the current local date
today = date.today()

print("Today date is: ", today)

To sign in to Truveta, use a web browser to open the page https://login.truveta.com/activate?user_code=XXHF-ZXLM (User code: XXHF-ZXLM).
Signed in successfully.
Initialized Truveta Study Client (TraceId: d6438d8aaa0f1e739e3e80547e1b56a org: org_TfbnAclWXIW7puUv workspace: rg1)
Today date is:  2023-06-26


In [2]:
#Indicate study name: 
st = client.get_study(title="Hypertension Cohort")
assert st is not None

#Indicate population name: 
p = st.get_population(title="Study Cohort")
assert p is not None



In [3]:
p.get_snapshots()
#This gets the population snapshot.

population = p.get_snapshots()[0]
population.get_status()

{'id': 'ps-fxx24ulya56urj6qhcj5isknxi',
 'status': 'Finished',
 'currentStage': None,
 'totalStages': 9,
 'percentComplete': 100}

In [5]:
#what tables are there
schema = population.get_data_dictionary()
schema.groupby('table').count()

This call may take up to 120 seconds


Unnamed: 0_level_0,column,column_type
table,Unnamed: 1_level_1,Unnamed: 2_level_1
Claim,13,13
ClaimLine,6,6
ClaimLineCodes,4,4
Concept,7,7
ConceptLink,2,2
Condition,14,14
ConditionCodes,4,4
DeviceUse,14,14
Encounter,11,11
ExtendedProperty,9,9


In [3]:
### This creates the data folder
data = pathlib.Path('./data')
data.mkdir(parents=True, exist_ok=True)

### This defines the load_and_dump function. 
def load_and_dump(table: str):
    print("Loading:", table)
    df = population.load_data(table=table)
    df.to_parquet(data / f"{table}.parquet", coerce_timestamps = 'us', allow_truncated_timestamps = True)

In [40]:
load_and_dump('Patient') #This has patient and person Ids
load_and_dump('Person') #This has some demographic information as well as PersonId
load_and_dump('PersonRace')

Loading: Patient
Loading: Person
Loading: PersonRace


In [5]:
load_and_dump('Concept')

Loading: Concept


In [8]:
concept = pd.read_parquet('data/Concept.parquet')
concept.head()

Unnamed: 0,ConceptId,ConceptName,ConceptDefinition,CodeSystem,ConceptCode,ConceptClass,Domain
0,3462811,"Abbott GmbH - ARCHITECT - IGM, ANTIGEN, ANTISE...","Abbott GmbH - ARCHITECT - IGM, ANTIGEN, ANTISE...",Truveta,3462811,Device,Device
1,3462812,"Abbott GmbH - ARCHITECT - KIT, SEROLOGICAL, NE...","Abbott GmbH - ARCHITECT - KIT, SEROLOGICAL, NE...",Truveta,3462812,Device,Device
2,3462813,"Abbott GmbH - ARCHITECT - KIT, SEROLOGICAL, PO...","Abbott GmbH - ARCHITECT - KIT, SEROLOGICAL, PO...",Truveta,3462813,Device,Device
3,3462814,Abbott GmbH - ARCHITECT - LDL & VLDL PRECIPITA...,Abbott GmbH - ARCHITECT - LDL & VLDL PRECIPITA...,Truveta,3462814,Device,Device
4,3462815,Abbott GmbH - ARCHITECT - LIPASE HYDROLYSIS/GL...,Abbott GmbH - ARCHITECT - LIPASE HYDROLYSIS/GL...,Truveta,3462815,Device,Device


In [9]:
#patient analysis
person = pd.read_parquet('data/Person.parquet')
person.rename({'Id': 'PersonId'}, axis=1, inplace=True)
#each entry is a unique person Id
print(person.nunique())
print(person.info())

BirthDateTime                   48900
EthnicityConceptId                  8
GenderConceptId                     6
PersonId                      1206910
IsExpired                           2
MaritalStatusConceptId             12
PreferredLanguageConceptId          1
ReligionConceptId                   1
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1206910 entries, 0 to 75431
Data columns (total 8 columns):
 #   Column                      Non-Null Count    Dtype         
---  ------                      --------------    -----         
 0   BirthDateTime               1206910 non-null  datetime64[ns]
 1   EthnicityConceptId          1206910 non-null  int32         
 2   GenderConceptId             1206910 non-null  int32         
 3   PersonId                    1206910 non-null  object        
 4   IsExpired                   1164130 non-null  object        
 5   MaritalStatusConceptId      1206910 non-null  int32         
 6   PreferredLanguageConceptId  1206910 non-

In [12]:
#get person's gender, race and ethnicity concept id list
person_gender = person.groupby('GenderConceptId',as_index=False).PersonId.count()
# print(person_gender)

#read in person race and merge
person_race = pd.read_parquet('data/PersonRace.parquet')
print(person_race.head())
person_race_name = person.merge(person_race,left_on="PersonId",right_on="PersonId",how = 'left')
person_race_name.head(10)
person_races = person_race_name.groupby('RaceConceptId',as_index=False).PersonId.count()
# print(person_races)

#get person's ethinicity count
person_eth = person.groupby('EthnicityConceptId',as_index=False).PersonId.count()
# print(person_eth)

                                     Id                              PersonId  \
0  fd04227a-57a4-ba1e-1d08-601c765706ff  0f4b77ae-9cf9-fa6e-6208-d8c241688ae0   
1  d7ee8ed6-5b9a-835e-fd8d-575bbc76b4fe  80d4aa36-be3f-e095-f516-dd33f7277809   
2  4209a6ac-702a-176d-e69d-2d7b114fb28e  3f2f5618-448f-6d22-79c6-4ee737eac23d   
3  94ba28c3-72b8-3ad7-9fc8-797539a4d91d  504c65ca-9eec-9d88-425f-787267d18406   
4  866f00a9-9d8a-93ca-1a45-1f57b3999ce5  7f125e63-141c-aa20-5653-243717f82710   

   RaceConceptId  
0      1067364.0  
1      1067364.0  
2      1067364.0  
3      1067364.0  
4      1067364.0  


In [13]:
#example of checking race-person uniqueness
df2 = person_race.groupby('PersonId').count().reset_index()
df2.groupby('RaceConceptId').count()
# df2

Unnamed: 0_level_0,PersonId,Id
RaceConceptId,Unnamed: 1_level_1,Unnamed: 2_level_1
0,32,32
1,1203761,1203761


In [14]:
#use sql to get concept names
sql_gender = "select ConceptId, ConceptName, ConceptDefinition \
from Concept \
where ConceptId in :genderconceptid"
sql_race = "select ConceptId, ConceptName, ConceptDefinition \
from Concept \
where ConceptId in :raceconceptid"
sql_eth = "select ConceptId, ConceptName, ConceptDefinition \
from Concept \
where ConceptId in :ethnicityid"
#set parameters to be the list of concept ids
params_gen = {
    'genderconceptid':person_gender.GenderConceptId
 
}
params_race = {
     'raceconceptid':round(person_races.RaceConceptId)
}
params_eth = {
        'ethnicityid':person_eth.EthnicityConceptId
}

gender_name = population.load_sql_table(sql = sql_gender,params = params_gen)
race_name = population.load_sql_table(sql = sql_race, params = params_race)
ethnicity_name = population.load_sql_table(sql= sql_eth, params = params_eth)
print('finish loading')


finish loading


In [17]:
#merging names with counts table
def get_baseline_stats(person_tb,concept, left = "", right = "ConceptId"):
    stats = person_tb.merge(concept, left_on=left, right_on=right, how = 'left')
#add a percentage
    stats['CountPercentage%'] =round(stats.PersonId / stats.PersonId.sum() * 100,3)
    baseline = stats[[left,'ConceptName','PersonId','CountPercentage%']]
    baseline.rename(columns={'PersonId':'PersonCount'},inplace=True)
    return baseline


baseline_gender = get_baseline_stats(person_gender, gender_name, left= 'GenderConceptId', right = 'ConceptId')
baseline_ethnicity = get_baseline_stats(person_eth, ethnicity_name, left = 'EthnicityConceptId', right = 'ConceptId')
baseline_race = get_baseline_stats(person_races, race_name, left = 'RaceConceptId',right = 'ConceptId')



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  baseline.rename(columns={'PersonId':'PersonCount'},inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  baseline.rename(columns={'PersonId':'PersonCount'},inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  baseline.rename(columns={'PersonId':'PersonCount'},inplace=True)


In [51]:
print(baseline_gender)
print('======================================================')
print(baseline_ethnicity)
print('======================================================')
print(baseline_race)
print(person_races[person_races['RaceConceptId'].isna()].count())

   GenderConceptId     ConceptName  PersonCount  CountPercentage%
0          1065405          Female       637959            52.859
1          1065406            Male       565946            46.892
2          1065408           Other            7             0.001
3          1065409         Unknown           40             0.003
4          1067556          Masked         2941             0.244
5          1067561  No Information           17             0.001
   EthnicityConceptId             ConceptName  PersonCount  CountPercentage%
0             1065359      Hispanic or Latino        73247             6.069
1             1065389               Colombian           15             0.001
2             1065401  Not Hispanic or Latino      1040618            86.222
3             1067555                 Invalid          391             0.032
4             1067556                  Masked        54107             4.483
5             1067558                 Unknown        21529             1.784

In [19]:
baseline_race.PersonCount.sum()

1203761

In [6]:
sql = "\
SELECT * \
FROM SearchResult_hypertension \
"

search_result = population.load_sql_table(sql = sql)
search_result.to_parquet("data/absolute_date_time.parquet", coerce_timestamps = 'us',allow_truncated_timestamps = True)

In [7]:
import datetime as dt
search_result['ShiftedTime'] = pd.to_datetime(search_result.AbsoluteEffectiveDateTime)- pd.DatetimeIndex(search_result.RecordedDateTime)
#take a average gap for each patient
person_shift_time = search_result.groupby('PersonId')['ShiftedTime'].mean().reset_index()

In [10]:
person_age = person.merge(person_shift_time,left_on="PersonId", right_on='PersonId', how = 'left')[['PersonId','BirthDateTime','ShiftedTime']]

person_age['age_at_2019'] = round(((person_age.ShiftedTime + pd.to_datetime('2019-07-01') - person_age.BirthDateTime).dt.days)/365 ,0)


In [16]:
pd.to_datetime("2019-01")

Timestamp('2019-01-01 00:00:00')

In [24]:
person_age['AbsoluteBirthDate'] = person_age['BirthDateTime'] + person_age['ShiftedTime']
absolute_age = person_age[['PersonId', 'AbsoluteBirthDate', 'ShiftedTime']]

In [25]:

absolute_age.to_csv("data/patient_abs_birthdate.csv",header= True)

ArrowNotImplementedError: Unhandled type for Arrow to Parquet schema conversion: duration[ns]

In [44]:
#put person into age groups
def assign_age_group(age):
    if age < 18:
        return '<18'
    elif age >=18 and age <=39:
        return '18-39'
    elif age >=40 and age <=63:
        return '40-63'
    elif age >=64:
        return '64+'
    
    else:
        return -1
    
    
person_age['AgeGroup'] = person_age['age_at_2019'].apply(lambda x: assign_age_group(x))
    

In [46]:
person_age.head()
#check uniqueness
person_age.nunique()

PersonId         1206910
BirthDateTime      48900
ShiftedTime       331764
age_at_2019          104
AgeGroup               4
dtype: int64

In [47]:
age_group = person_age.groupby('AgeGroup')['PersonId'].count().reset_index()

In [52]:
age_group['Percentage'] = age_group.PersonId / age_group.PersonId.sum() *100
age_group.rename({'PersonId':'PersonCount'},inplace=True)

In [53]:
age_group


Unnamed: 0,AgeGroup,PersonId,Percentage
0,18-39,49054,4.064429
1,40-63,395362,32.758201
2,64+,761462,63.091863
3,<18,1032,0.085508


In [6]:
load_and_dump("PersonLocation")

Loading: PersonLocation


In [9]:
load_and_dump('Location')

Loading: Location


In [10]:
location = pd.read_parquet("data/Location.parquet")

In [7]:
pl = pd.read_parquet("data/PersonLocation.parquet")

In [8]:
pl.head()

Unnamed: 0,EffectiveStartDateTime,Id,LocationId,PersonId,UseConceptId
0,2018-04-11 10:38:07,27efe796-62a2-38b5-7033-b89e51dbb4c0,816e0c8c-b676-3b0c-df95-09e2f697b501,b61ce239-6cdd-1b52-3a1e-ee152787b4ad,1067556
1,2017-11-07 16:43:46,7cdcca27-7111-da5a-bb67-a605a4b40066,ca57eb86-dba8-604e-450d-e8d6c0dd8bea,ee9d93ae-aa70-a9e7-3657-94e956309c36,1067556
2,2018-05-16 10:03:34,c7adc1de-2d49-8c5a-04c8-644811ca9358,025a7d05-f47c-7570-0c7a-c251bf6a8683,67a2a7dc-02b2-8931-9a44-a99d50601ff7,1067556
3,2019-02-11 21:12:30,ccdd3cc4-78ed-6303-e76a-3b890a41f4c2,c6cee469-d891-110e-b68e-f1833be7c7f3,f52e366a-69d2-fb8b-8c34-b6ab556ba18b,1067556
4,2018-06-21 07:48:03,1bb6a1b8-4521-bb05-0693-ebe11327d751,0a0635e2-9b6c-f980-9f09-98a7bf16c067,718774a3-bab6-3cb7-6375-15f0332ccd0c,1067556


In [11]:
person_location = pl.merge(location, left_on = 'LocationId', right_on='Id', how = 'left')

In [None]:
person_location.groupby('PersonId').LocationId.count().describe()

In [6]:
sql = """select l.StateOrProvinceConceptId, count(distinct PersonId) as cnt
from PersonLocation pl join Location l on (pl.LocationId = l.Id)
where StateOrProvinceConceptId is not NULL and StateOrProvinceConceptId not in (0,1067561,1067557)
group by StateOrProvinceConceptId
order by cnt"""

location_cnt = population.load_sql_table(sql = sql)

In [9]:
location_name = location_cnt.merge(concept, left_on = 'StateOrProvinceConceptId', right_on = 'ConceptId',how = "left")
location_name[['StateOrProvinceConceptId','ConceptName','cnt']]

Unnamed: 0,StateOrProvinceConceptId,ConceptName,cnt
0,0.0,Field has not been mapped,9851
1,1067503.0,Michigan,52285
2,1067493.0,Idaho,1961
3,1067528.0,Tennessee,581
4,1067536.0,Washington,99314
5,,,25327
6,1067511.0,Nebraska,390
7,1067505.0,Missouri,25495
8,1067522.0,Pennsylvania,593
9,1067491.0,Hawaii,2699


In [None]:
#check marital status
