##### Project: Opioid Exposed Infant Covariates
##### Investigator: Stephen Patrick, Sarah Loch
##### Programmers: Sander Su, Chris Guardo
##### Date Created: 01/17/23
##### Last Modified: 09/30/25

#### Notes:
This notebook depends on the gestational age notebook in the pipeline. It pulls all HIV- and HCV-related ICD codes from a patient's medical records around the pregnancy period (from the start of gestation up to 30 days after birth).


In [0]:
%run "../Project_modules"

##### Variables: HCV-positive and HIV-positive
##### Built from two inputs:
##### ICD code: Sheet 10 - HCV and HIV 
##### Cohort: Moms in phenotype


In [0]:
# Location of the maternal phenotype (cohort) table. The value below is a placeholder
# and is not a valid table name; replace it before running this cell.
phenotype_table_location = " ***Insert file location*** "
# Load the cohort table.
# NOTE(review): `phenotype_table` is not referenced again in the visible part of this notebook — confirm it is still needed.
phenotype_table=spark.sql(f"SELECT * FROM {phenotype_table_location}")
# Lookup table of HCV/HIV ICD codes; the queries below read its ICD9_ICD10, description and type columns.
sheet_name="phenotyping.mprint_hcv_hiv_icd"

##### hcv or hiv positive/present/reactive of mprint project2 maternal cohort

In [0]:
# Collect every occurrence of an HCV/HIV ICD code from the observation and condition
# tables (procedure_occurrence is NOT queried here), then attach each code's description
# and type from the ICD lookup sheet.
# `obs_table` and `cond_table` are not defined in this notebook — presumably they come
# from the %run of Project_modules; confirm.
# `union` (not `union all`) drops exact duplicate rows within and across the two sources.

sql=f"""
    select a.*,b.description,b.type from 
    (select person_id,observation_source_value as code, observation_date as code_date,
    observation_datetime as   code_datetime,visit_occurrence_id,'observation' as source_table
    from {obs_table} where observation_source_value in (select ICD9_ICD10 from {sheet_name})
    
    union
    
    select person_id,condition_source_value as code, condition_start_date as code_date,
    condition_start_datetime as code_datetime,visit_occurrence_id,'condition' as source_table 
    from {cond_table} where condition_source_value in (select ICD9_ICD10 from {sheet_name})) as a
    left join {sheet_name} b
    on a.code = b.ICD9_ICD10;
    """
hcv_hiv_icd = spark.sql(sql)
# Session-scoped temp view read by the time-window cell.
hcv_hiv_icd.createOrReplaceTempView("hcv_hiv_icd")

##### Adding time window: From start_gestation_date to 'birth date + 30 days'

In [0]:
# Keep only codes recorded inside the pregnancy window of each mom/baby pair:
#   start_gestation_date <= code_date < baby_dob + 30 days   (upper bound is exclusive).
# A mom with several babies yields one row per (code, baby) pair that falls in the window.
# global_temp.ega_w33_or_uncertain_gestation_date is not created in this notebook —
# presumably it is produced by the upstream gestational-age notebook; confirm.
sql="""
    select * from hcv_hiv_icd a
    inner join  (select mom_person_id, baby_person_id,baby_dob,start_gestation_date,
    length_of_gestation from global_temp.ega_w33_or_uncertain_gestation_date) b
    on a.person_id = b.mom_person_id
    where code_date >= start_gestation_date and code_date <  date_add(baby_dob, 30);
   """

hcv_hiv_icd_prenatal = spark.sql(sql)
# Registered as a *global* temp view; the next cell reads it as global_temp.hcv_hiv_icd_prenatal.
hcv_hiv_icd_prenatal.createOrReplaceGlobalTempView("hcv_hiv_icd_prenatal")

##### check the visit type by having the visit records

In [0]:
# Attach visit details to every in-window code by joining on (person_id, visit_occurrence_id).
# It is a left join, so codes without a matching visit record are kept with NULL
# visit_concept_id / visit dates; those rows are handled separately further down.
# `visit_table` is not defined in this notebook — presumably supplied by Project_modules; confirm.
sql=f"""
    select a.person_id,a.mom_person_id,a.baby_person_id,baby_dob,a.code,a.code_date,a.description,
    a.type,  b.visit_occurrence_id,b.visit_concept_id,b.visit_start_date,b.visit_end_date 
    from global_temp.hcv_hiv_icd_prenatal a
    left join {visit_table} b
    using (person_id,visit_occurrence_id)
   """

hcv_hiv_icd_prenatal_visitinfo = spark.sql(sql)
# Session-scoped temp view containing both HCV and HIV rows (distinguished by `type`).
hcv_hiv_icd_prenatal_visitinfo.createOrReplaceTempView("hcv_hiv_icd_prenatal_visitinfo")

##### HCV and HIV cohorts

In [0]:
# HIV subset of the in-window codes with visit info.
sql="""
       select * from hcv_hiv_icd_prenatal_visitinfo where type = 'HIV'; 
    """
icd_prenatal_visit_hiv= spark.sql(sql)
# register_parquet_global_view is an external helper. Later cells read the result as
# global_temp.icd_prenatal_visit_hiv, so it presumably registers a global view under `.name` — confirm.
icd_prenatal_visit_hiv.name='icd_prenatal_visit_hiv'
register_parquet_global_view(icd_prenatal_visit_hiv)

In [0]:
# Sanity check: total HIV code rows and number of distinct moms (person_id is the mom's id).
sql="""
       select count(*) as total,count(distinct person_id) as unique_mom from global_temp.icd_prenatal_visit_hiv; 
    """
inspect_df= spark.sql(sql)
inspect_df.display()

In [0]:
# HCV subset of the in-window codes with visit info.
sql="""
       select * from hcv_hiv_icd_prenatal_visitinfo where type = 'HCV'; 
    """
icd_prenatal_visit_hcv = spark.sql(sql)
# register_parquet_global_view is an external helper. Later cells read the result as
# global_temp.icd_prenatal_visit_hcv, so it presumably registers a global view under `.name` — confirm.
icd_prenatal_visit_hcv.name='icd_prenatal_visit_hcv'
register_parquet_global_view(icd_prenatal_visit_hcv)

In [0]:
# Sanity check: total HCV code rows and number of distinct moms (person_id is the mom's id).
sql="""
       select count(*) as total,count(distinct person_id) as unique_mom from global_temp.icd_prenatal_visit_hcv; 
    """
inspect_df= spark.sql(sql)
inspect_df.display()

##### HCV / HIV: 1 IP ICD or 2 OP ICD
##### At least one inpatient code: visit type '9201' is inpatient
##### At least two outpatient codes: visit type '9202' is outpatient

In [0]:
def ip_op_visit_count_df(visit_df_name,visit_concept_id,visit_count):
    """Return the (mom_person_id, baby_person_id) pairs with enough coded rows of one visit type.

    Parameters
    ----------
    visit_df_name : str
        Name of a view in the ``global_temp`` database holding code rows with visit info.
    visit_concept_id : str
        Visit concept to keep; it is compared as a quoted string literal in the SQL.
    visit_count : str or int
        Exclusive lower bound: a pair is returned when its number of *distinct* rows
        for that visit concept is strictly greater than this value
        (``"0"`` means at least one row, ``"1"`` means at least two).

    Returns
    -------
    DataFrame
        One row per qualifying (mom_person_id, baby_person_id) pair.
    """
    count_query = f"""
          select mom_person_id,baby_person_id from (select distinct * from global_temp.{visit_df_name} where 
          visit_concept_id = '{visit_concept_id}') as a 
          group by a.mom_person_id,a.baby_person_id 
          having count(*) > {visit_count}
       """
    return spark.sql(count_query)

In [0]:
### HCV
# Concept '9201' with count > 0: at least one inpatient row.
icd_prenatal_visit_hcv_ip1= ip_op_visit_count_df("icd_prenatal_visit_hcv","9201","0")

# Concept '9202' with count > 1: at least two outpatient rows.
icd_prenatal_visit_hcv_op2= ip_op_visit_count_df("icd_prenatal_visit_hcv","9202","1")

# union_dataframes is an external helper (not defined in this notebook).
hcv_ip1_op2=union_dataframes([icd_prenatal_visit_hcv_ip1,icd_prenatal_visit_hcv_op2])

### HIV
# Same two rules applied to the HIV view.
icd_prenatal_visit_hiv_ip1= ip_op_visit_count_df("icd_prenatal_visit_hiv","9201","0")
icd_prenatal_visit_hiv_op2= ip_op_visit_count_df("icd_prenatal_visit_hiv","9202","1")
hiv_ip1_op2=union_dataframes([icd_prenatal_visit_hiv_ip1,icd_prenatal_visit_hiv_op2])

# Session-scoped temp views; used below to exclude pairs that already qualify.
hcv_ip1_op2.createOrReplaceTempView("hcv_ip1_op2") 
hiv_ip1_op2.createOrReplaceTempView("hiv_ip1_op2")

##### Create the view of moms who have codes without a visit_occurrence_id and who are not in the ip1 or op2 group

In [0]:
def mom_no_visit_occurrence_id_df(visit_df,ip1_op2_df):
    """Return mom/baby pairs that have a code row without a visit and do not yet qualify.

    Parameters
    ----------
    visit_df : str
        Name of a view in ``global_temp`` with mom_person_id, baby_person_id and
        visit_occurrence_id columns.
    ip1_op2_df : str
        Name of a view (or a subquery) listing the pairs that already satisfy the
        inpatient/outpatient rule; it is matched on baby_person_id only.

    Returns
    -------
    DataFrame
        (mom_person_id, baby_person_id) for every row of ``visit_df`` whose
        visit_occurrence_id is NULL and whose baby has no match in ``ip1_op2_df``.
        Rows are not de-duplicated, so a pair appears once per qualifying code row.
    """
    # Left join + "b.mom_person_id is null" acts as an anti-join against the qualifying pairs.
    anti_join_query = f"""
       select a.mom_person_id,a.baby_person_id from 
       (select mom_person_id,baby_person_id from global_temp.{visit_df} where visit_occurrence_id is null) a 
       left join ({ip1_op2_df}) b
       on a.baby_person_id=b.baby_person_id
       where b.mom_person_id is null
      """
    return spark.sql(anti_join_query)

In [0]:
# Moms with a code that has no visit_occurrence_id and who are not already in the
# 1-IP / 2-OP group. Each condition must be built from its own visit view:
# HCV from icd_prenatal_visit_hcv, HIV from icd_prenatal_visit_hiv.
mom_no_visit_occurrence_id_hcv=mom_no_visit_occurrence_id_df("icd_prenatal_visit_hcv","hcv_ip1_op2")
mom_no_visit_occurrence_id_hiv=mom_no_visit_occurrence_id_df("icd_prenatal_visit_hiv","hiv_ip1_op2")

# Session-scoped temp views consumed by get_no_visit_list.
mom_no_visit_occurrence_id_hcv.createOrReplaceTempView("mom_no_visit_occurrence_id_hcv") 
mom_no_visit_occurrence_id_hiv.createOrReplaceTempView("mom_no_visit_occurrence_id_hiv") 

In [0]:
# Inspect both no-visit mom lists (df_inspection is an external helper, not defined in this notebook).
df_inspection("mom_no_visit_occurrence_id_hcv","all")
df_inspection("mom_no_visit_occurrence_id_hiv","all")

##### Create the view of code records for the moms that have no visit_occurrence_id information

In [0]:
def get_no_visit_list(hcv_hiv_df, code_type=None):
    """Return the code rows belonging to the mom/baby pairs listed in ``hcv_hiv_df``.

    The rows come from ``hcv_hiv_icd_prenatal_visitinfo``, which holds HCV and HIV
    codes together. Without a filter on ``type``, a mom listed for one condition
    would also contribute her codes for the other condition, and those rows would
    later be counted toward the wrong cohort. The result is therefore restricted
    to a single code type whenever one can be determined.

    Parameters
    ----------
    hcv_hiv_df : str
        Name of a view with mom_person_id and baby_person_id columns.
    code_type : str, optional
        Value of the ``type`` column to keep (e.g. ``'HCV'`` or ``'HIV'``). When
        omitted it is inferred from ``hcv_hiv_df``: a name containing exactly one
        of "hcv"/"hiv" selects that type. If neither or both appear, no type
        filter is applied and all rows for the listed pairs are returned.

    Returns
    -------
    DataFrame
        All columns of ``hcv_hiv_icd_prenatal_visitinfo`` for the matching pairs.
    """
    if code_type is None:
        lowered_name = hcv_hiv_df.lower()
        mentions_hcv = "hcv" in lowered_name
        mentions_hiv = "hiv" in lowered_name
        # Infer only when the view name is unambiguous.
        if mentions_hcv != mentions_hiv:
            code_type = "HCV" if mentions_hcv else "HIV"

    type_filter = f"where a.type = '{code_type}'" if code_type is not None else ""

    sql=f"""
           select a.* from hcv_hiv_icd_prenatal_visitinfo  a
           inner join {hcv_hiv_df} b
           on a.mom_person_id = b.mom_person_id and a.baby_person_id = b.baby_person_id
           {type_filter}
        """

    return spark.sql(sql)

In [0]:
# Code rows for the moms in each no-visit list.
icd_prenatal_hcv_no_visit_occurrence_id=get_no_visit_list("mom_no_visit_occurrence_id_hcv")
icd_prenatal_hiv_no_visit_occurrence_id=get_no_visit_list("mom_no_visit_occurrence_id_hiv")

# Session-scoped temp views; their names are passed to combine_no_visit_cond below.
icd_prenatal_hcv_no_visit_occurrence_id.createOrReplaceTempView("icd_prenatal_hcv_no_visit_occurrence_id")  
icd_prenatal_hiv_no_visit_occurrence_id.createOrReplaceTempView("icd_prenatal_hiv_no_visit_occurrence_id")

In [0]:
# Inspect the code rows selected for date-based visit matching (HCV, then HIV).
df_inspection("icd_prenatal_hcv_no_visit_occurrence_id","all")

df_inspection("icd_prenatal_hiv_no_visit_occurrence_id","all")

#### Condition 1:  had visit_end_date
- determine whether the code is inpatient or outpatient by matching on the code date: visit_start_date <= code_date and code_date < visit_end_date

#### Condition 2:  did not have visit_end_date, and visit concept is '9201' 
- if did not have visit end date: check visit_start_date <= code_date and code_date < visit_start_date + 30;

#### Condition 3:  did not have visit_end_date, and visit concept is '9202'
- if did not have visit end date: check visit_start_date <= code_date and code_date < visit_start_date + 3;

#### Normal condition: had visit_occurrence_id

In [0]:
def format_cond_sql(visit_concept_id,hcv_hiv_df_name,buffer_day):
    """Build the SQL that date-matches codes to visits which have no end date.

    A code row from ``hcv_hiv_df_name`` is paired with a visit of the same mom when
    the visit has a NULL visit_end_date, its concept equals ``visit_concept_id``
    and ``visit_start_date <= code_date < visit_start_date + buffer_day`` days.
    Because the WHERE clause filters on columns of the visit table, rows without a
    matching visit are dropped even though the join is written as a left join.

    Parameters
    ----------
    visit_concept_id : str or int
        Visit concept to match; interpolated into the SQL without quotes.
    hcv_hiv_df_name : str
        Name of the view holding the code rows to match.
    buffer_day : str or int
        Length in days of the window after visit_start_date (exclusive upper bound).

    Returns
    -------
    str
        The SQL text; it is not executed here.
    """
    return f"""
           select  distinct a.person_id,a.mom_person_id,a.baby_person_id,baby_dob,a.code,a.code_date,
           a.description,a.type, b.visit_occurrence_id,b.visit_concept_id,b.visit_start_date,b.visit_end_date
    
           from {hcv_hiv_df_name} a
           left join {visit_table} b 
           on a.mom_person_id = b.person_id and b.visit_end_date is null
           where b.visit_concept_id = {visit_concept_id}
           and b.visit_start_date <= code_date and code_date < date_add(b.visit_start_date, {buffer_day});
        """

def combine_no_visit_cond(hcv_hiv_df_name, normal_view_name=None):
    """Union the visit-linked code rows with rows matched to visits by date.

    Four sources are combined and de-duplicated:

    * normal: rows of the condition's own view in ``global_temp``
      (``icd_prenatal_visit_hcv`` or ``icd_prenatal_visit_hiv``);
    * cond1: codes falling inside a visit that has an end date
      (``visit_start_date <= code_date < visit_end_date``);
    * cond2: codes within 30 days after the start of a '9201' visit without end date;
    * cond3: codes within 3 days after the start of a '9202' visit without end date.

    Parameters
    ----------
    hcv_hiv_df_name : str
        Name of the view holding the code rows to be date-matched.
    normal_view_name : str, optional
        Name of the ``global_temp`` view providing the "normal" rows. When omitted
        it is derived from ``hcv_hiv_df_name``: a name containing "hcv" uses
        ``icd_prenatal_visit_hcv`` and a name containing "hiv" uses
        ``icd_prenatal_visit_hiv``, so HIV results are never mixed with HCV rows.

    Returns
    -------
    DataFrame
        Distinct union of the four sources.

    Raises
    ------
    ValueError
        If ``normal_view_name`` is omitted and ``hcv_hiv_df_name`` does not contain
        exactly one of "hcv" / "hiv".
    """
    if normal_view_name is None:
        lowered_name = hcv_hiv_df_name.lower()
        matched_types = [t for t in ("hcv", "hiv") if t in lowered_name]
        if len(matched_types) != 1:
            raise ValueError(
                "Cannot infer the HCV/HIV visit view from "
                f"'{hcv_hiv_df_name}'; pass normal_view_name explicitly."
            )
        normal_view_name = f"icd_prenatal_visit_{matched_types[0]}"

    #cond1
    # NOTE(review): the upper bound is exclusive, so a visit whose start and end date
    # are the same day can never match here — confirm this is intended.
    sql=f"""
        select distinct a.person_id,a.mom_person_id,a.baby_person_id,baby_dob,a.code,a.code_date,
        a.description,a.type,  b.visit_occurrence_id,b.visit_concept_id,b.visit_start_date,b.visit_end_date 
        from {hcv_hiv_df_name} a
        inner join {visit_table} b 
        on a.mom_person_id = b.person_id and b.visit_end_date is not null
        and b.visit_start_date <= code_date and code_date < b.visit_end_date;
       """

    cond1 = spark.sql(sql)

    #cond2: inpatient visit without end date, 30-day window
    cond2 = spark.sql(format_cond_sql('9201',hcv_hiv_df_name,"30"))

    #cond3: outpatient visit without end date, 3-day window
    cond3 = spark.sql(format_cond_sql('9202',hcv_hiv_df_name,"3"))

    # Rows already linked to a visit, taken from the view of the SAME condition.
    sql=f"""
           select * from global_temp.{normal_view_name};
        """
    normal_cond = spark.sql(sql)
    all_cond_df=union_dataframes([normal_cond,cond1,cond2,cond3]).distinct()

    return all_cond_df

In [0]:
# Combine the visit-linked rows with the date-matched rows for each condition.
icd_prenatal_hcv_update= combine_no_visit_cond("icd_prenatal_hcv_no_visit_occurrence_id")
icd_prenatal_hiv_update= combine_no_visit_cond("icd_prenatal_hiv_no_visit_occurrence_id")

# Global temp views: ip_op_visit_count_df reads its input from the global_temp database.
icd_prenatal_hcv_update.createOrReplaceGlobalTempView("icd_prenatal_hcv_update")  
icd_prenatal_hiv_update.createOrReplaceGlobalTempView("icd_prenatal_hiv_update")

##### After combining all conditions, check again whether the patient has at least one inpatient ICD code
##### After combining all conditions, check again whether the patient has at least two outpatient ICD codes

In [0]:
# Re-apply the rules on the combined tables:
#   ip1 = more than 0 rows with concept '9201', op2 = more than 1 row with concept '9202'.
icd_prenatal_visit_hcv_ip1_update=ip_op_visit_count_df("icd_prenatal_hcv_update","9201","0")
icd_prenatal_visit_hiv_ip1_update=ip_op_visit_count_df("icd_prenatal_hiv_update","9201","0")
icd_prenatal_visit_hcv_op2_update=ip_op_visit_count_df("icd_prenatal_hcv_update","9202","1")
icd_prenatal_visit_hiv_op2_update=ip_op_visit_count_df("icd_prenatal_hiv_update","9202","1")

# Each result is named and handed to register_parquet_global_view (external helper);
# the inspection cell reads them back as global_temp.<name>.
icd_prenatal_visit_hcv_ip1_update.name='icd_prenatal_visit_hcv_ip1_update'
register_parquet_global_view(icd_prenatal_visit_hcv_ip1_update)

icd_prenatal_visit_hiv_ip1_update.name='icd_prenatal_visit_hiv_ip1_update'
register_parquet_global_view(icd_prenatal_visit_hiv_ip1_update)

icd_prenatal_visit_hcv_op2_update.name='icd_prenatal_visit_hcv_op2_update'
register_parquet_global_view(icd_prenatal_visit_hcv_op2_update)

icd_prenatal_visit_hiv_op2_update.name='icd_prenatal_visit_hiv_op2_update'
register_parquet_global_view(icd_prenatal_visit_hiv_op2_update)

In [0]:
# Inspect the HIV inpatient and outpatient groups after the update.
df_inspection("global_temp.icd_prenatal_visit_hiv_ip1_update","all")

df_inspection("global_temp.icd_prenatal_visit_hiv_op2_update","all")

##### Combine ip1 and op2 for HCV and HIV

In [0]:
# Final HIV cohort: pairs meeting either rule; .distinct() removes pairs that meet both.
icd_prenatal_visit_hiv_ip1op2_update=union_dataframes([icd_prenatal_visit_hiv_ip1_update,icd_prenatal_visit_hiv_op2_update]).distinct()
icd_prenatal_visit_hiv_ip1op2_update.name='icd_prenatal_visit_hiv_ip1op2_update'
register_parquet_global_view(icd_prenatal_visit_hiv_ip1op2_update)

# Final HCV cohort, built the same way.
icd_prenatal_visit_hcv_ip1op2_update=union_dataframes([icd_prenatal_visit_hcv_ip1_update,icd_prenatal_visit_hcv_op2_update]).distinct()
icd_prenatal_visit_hcv_ip1op2_update.name='icd_prenatal_visit_hcv_ip1op2_update'
register_parquet_global_view(icd_prenatal_visit_hcv_ip1op2_update)

In [0]:
# Inspect the final HIV cohort.
df_inspection("global_temp.icd_prenatal_visit_hiv_ip1op2_update","all")

### Save Output for future use

In [0]:
# Persist the final cohorts as tables in covariate_output, replacing any existing
# table of the same name. HCV is written first, then HIV.
hcv_cohort_writer = icd_prenatal_visit_hcv_ip1op2_update.write.mode("overwrite")
hcv_cohort_writer.saveAsTable("covariate_output.icd_prenatal_visit_hcv_ip1op2_update")

hiv_cohort_writer = icd_prenatal_visit_hiv_ip1op2_update.write.mode("overwrite")
hiv_cohort_writer.saveAsTable("covariate_output.icd_prenatal_visit_hiv_ip1op2_update")