In [1]:
%pwd

'c:\\Users\\PrinciaFernandes\\Mresult\\Phenomix\\notebooks'

In [3]:
import os

# Move from the notebooks/ folder up to its parent (the project root).
# The guard makes the cell idempotent: an unconditional os.chdir("../") climbs
# one more directory every time the cell is re-run.
# NOTE(review): presumably needed so that `src` is importable - confirm.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\PrinciaFernandes\\Mresult\\Phenomix'

In [1]:
import os
import json

def get_detail_files(dir:str)->list:
    """Collect the records of every ``*_detail.json`` file of the known sources.

    Only the immediate sub-directories of ``dir`` whose name is one of
    ``CPRD``, ``HDRUK``, ``OHDSI``, ``PHEKB`` or ``SENTINEL`` are scanned, and
    only files whose name ends with ``_detail.json`` are read.

    Args:
        dir: Directory that contains one sub-directory per source.

    Returns:
        A single flat list holding the items of all loaded files, ordered by
        source name and then by file name. Each file is assumed to contain a
        JSON array of records (its items are appended with ``list.extend``).

    Raises:
        FileNotFoundError: If ``dir`` does not exist.
        json.JSONDecodeError: If a detail file is not valid JSON.
    """
    websites = ['CPRD','HDRUK','OHDSI','PHEKB','SENTINEL']
    detail_files = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the result
    # (and everything derived from it) reproducible across machines.
    for source in sorted(os.listdir(dir)):
        source_path = os.path.join(dir, source)
        # Skip unknown names, and plain files that merely share a source name.
        if source not in websites or not os.path.isdir(source_path):
            continue

        for file_name in sorted(os.listdir(source_path)):
            # Suffix match, so backups such as "x_detail.json.bak" are ignored.
            if not file_name.endswith('_detail.json'):
                continue
            # Explicit UTF-8: the platform default (e.g. cp1252 on Windows)
            # can fail on, or silently corrupt, non-ASCII text in the JSON.
            with open(os.path.join(source_path, file_name), 'r', encoding='utf-8') as f:
                detail_files.extend(json.load(f))

    return detail_files


def transform_detail(detail_files):
    """Reduce every raw detail record to a ``{'Name', 'PID'}`` entry.

    The name is taken from the first key present among ``Outcome``,
    ``Disease`` and ``Cohort_name``; when none is present the record's own
    ``Name`` key is used (a ``KeyError`` is raised if that is missing too).
    ``PID`` is copied only when the record has one.

    Args:
        detail_files: Iterable of record dictionaries.

    Returns:
        A new list with one reduced dictionary per input record, in input order.
    """
    name_fields = ('Outcome', 'Disease', 'Cohort_name')

    def _reduce(record):
        # First matching alias wins; 'Name' is the fallback field.
        label_field = next((field for field in name_fields if field in record), 'Name')
        entry = {'Name': record[label_field]}
        if 'PID' in record:
            entry['PID'] = record['PID']
        return entry

    return [_reduce(record) for record in detail_files]


def get_pid_key(pid):
    """Return the masterlist column that matches the first character of ``pid``.

    ``H`` -> ``hdruk_PID``, ``P`` -> ``phekb_PID``, ``C`` -> ``cprd_PID``,
    ``S`` -> ``Sentinel_PID``, ``O`` -> ``ohdsi_PID``. Any other first
    character yields ``None``.
    """
    column_by_prefix = {
        'H': 'hdruk_PID',
        'P': 'phekb_PID',
        'C': 'cprd_PID',
        'S': 'Sentinel_PID',
        'O': 'ohdsi_PID',
    }
    return column_by_prefix.get(pid[0])


def create_masterlist(combined_detail):
    """Group name/PID entries into one masterlist row per phenotype name.

    Every row has the keys ``Phenotypes``, ``hdruk_PID``, ``phekb_PID``,
    ``cprd_PID``, ``Sentinel_PID``, ``ohdsi_PID`` and ``id``. A PID is stored
    in the column selected by ``get_pid_key``; several PIDs for the same
    name and column are joined with ``', '`` in input order. Columns without
    a PID stay ``None``.

    Rows are sorted by ``Phenotypes``. ``id`` is five flag characters
    (``H``, ``P``, ``C``, ``S``, ``O``, each replaced by ``X`` when that
    column is empty) followed by the 1-based row position zero-padded to
    four digits, e.g. ``HXCXX0001``.

    Args:
        combined_detail: Iterable of dictionaries with a ``Name`` key and an
            optional ``PID`` key. Names must be hashable.

    Returns:
        The sorted list of row dictionaries.

    Notes:
        Entries without a ``PID``, or whose PID prefix ``get_pid_key`` does
        not recognise, still create the phenotype row but fill no column.
    """
    keys = ['Phenotypes','hdruk_PID','phekb_PID','cprd_PID','Sentinel_PID','ohdsi_PID']

    # Index rows by name: one dict lookup per entry instead of scanning a list
    # of names and then the whole masterlist.
    rows_by_name = {}
    for detail in combined_detail:
        name = detail['Name']
        row = rows_by_name.get(name)
        if row is None:
            row = {k: None for k in keys}
            row[keys[0]] = name
            rows_by_name[name] = row

        pid = detail.get('PID')
        if not pid:
            # No PID for this entry: keep the row, nothing to record.
            continue
        pid_key = get_pid_key(pid)
        if pid_key is None:
            # Unknown source prefix: do not write a bogus `None` column.
            continue

        if row[pid_key] is None:
            row[pid_key] = pid
        else:
            # Different quote styles keep this valid before Python 3.12.
            row[pid_key] = f"{row[pid_key]}, {pid}"

    sorted_masterlist = sorted(rows_by_name.values(), key=lambda row: row['Phenotypes'])

    source_flags = (
        ('H', 'hdruk_PID'),
        ('P', 'phekb_PID'),
        ('C', 'cprd_PID'),
        ('S', 'Sentinel_PID'),
        ('O', 'ohdsi_PID'),
    )
    for position, row in enumerate(sorted_masterlist, start=1):
        flags = ''.join(
            letter if row[column] is not None else 'X'
            for letter, column in source_flags
        )
        row['id'] = f'{flags}{position:04d}'

    return sorted_masterlist


In [6]:
from src.config import PROCESSED_DIR


In [7]:

def main():
    """Load and return the raw detail records found under ``PROCESSED_DIR``.

    Only the loading step is active. The follow-up steps that were sketched
    here (``transform_detail`` -> ``create_masterlist`` -> dump to
    ``masterlist.json``) are disabled, so the caller receives the untransformed
    output of ``get_detail_files``.
    """
    return get_detail_files(PROCESSED_DIR)


f = main()

In [8]:
# Show the list returned by main(): the raw records read from the *_detail.json files.
f

[{'Disease': 'Abdominal Aortic Aneurysm', 'Disease_num': 1, 'PID': 'CP000001'},
 {'Disease': 'Abdominal Hernia', 'Disease_num': 2, 'PID': 'CP000002'},
 {'Phenotype_id': 'PH1',
  'Phenotype_version_id': 2,
  'Name': 'COVID-19 infection',
  'Defination': 'We ascertained people with a confirmed or suspected Covid-19 diagnosis as follows:\n\n\n1) a positive PCR or antigen test from the Covid-19 laboratory test data, with specimen date on or before 31 October 2020; or\n\n\n2) a Covid-19 diagnosis SNOMED-CT concept code appearing in the primary care data, with event date on or before 31 October; or\n\n\n3) a diagnosis ICD-10 code appearing in the hospital episodes (main or secondary diagnostic code position in the admitted patient care component of the hospital episode statistics), with admission date on or before 31 October or\n\n\n4) death registration with a mention (as underlying on contributing cause) of a diagnosis ICD-10 code , with death registration date on or before 31 October 2020

In [9]:
# Reduce every raw record to a {'Name', 'PID'} entry (PID only when the record has one).
combined_detail = transform_detail(f)


In [10]:
# Display the reduced name/PID entries produced by transform_detail().
combined_detail

[{'Name': 'Abdominal Aortic Aneurysm', 'PID': 'CP000001'},
 {'Name': 'Abdominal Hernia', 'PID': 'CP000002'},
 {'Name': 'COVID-19 infection', 'PID': 'HP000001'},
 {'Name': 'Heart Rate', 'PID': 'HP000002'},
 {'Name': 'COPD', 'PID': 'HP000003'},
 {'Name': 'Asthma', 'PID': 'HP000004'},
 {'Name': 'Bronchiestasis', 'PID': 'HP000005'},
 {'Name': 'Cystic Fibrosis', 'PID': 'HP000006'},
 {'Name': 'Pneumonia', 'PID': 'HP000007'},
 {'Name': 'Blood Pressure', 'PID': 'HP000008'},
 {'Name': 'Body Mass Index', 'PID': 'HP000009'},
 {'Name': 'Marital status', 'PID': 'HP000010'},
 {'Name': 'Anaphylaxis', 'PID': 'HP000011'},
 {'Name': 'Anxiety', 'PID': 'HP000012'},
 {'Name': 'Hearing loss', 'PID': 'HP000013'},
 {'Name': 'Depression', 'PID': 'HP000014'},
 {'Name': 'Self Harm', 'PID': 'HP000015'},
 {'Name': 'Cardiovascular Disease', 'PID': 'HP000016'},
 {'Name': 'Cardiovascular Disease (Psoriasis Association Study with CVD)',
  'PID': 'HP000017'},
 {'Name': 'Psoriasis', 'PID': 'HP000018'},
 {'Name': 'Diabet