In [1]:
# imports
import sys
import os
import socket
import pandas as pd
import glob

In [2]:
# Date parsing for the hospital diagnosis files.
def formatdatehosp(data):
    """Parse the ``epistart``/``epiend`` columns of a hospital diagnosis frame.

    Both columns are expected to hold day-first date strings. Any ``/`` is
    stripped before parsing with ``%d%m%Y``, so ``dd/mm/yyyy`` and
    ``ddmmyyyy`` are both accepted. Day and month must be zero-padded:
    once the slashes are removed an unpadded date is ambiguous.

    After conversion, a missing ``epistart`` is filled with the ``epiend``
    of the same row.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with string-valued ``epistart`` and ``epiend`` columns.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.

    Raises
    ------
    KeyError
        If ``epistart`` or ``epiend`` is not a column of ``data``.
    ValueError
        If a non-missing value does not match the ``%d%m%Y`` format.
    """
    for column in ('epiend', 'epistart'):
        # '/' is a literal separator, not a pattern, so say so explicitly.
        without_slashes = data[column].str.replace('/', '', regex=False)
        data[column] = pd.to_datetime(without_slashes, format='%d%m%Y')

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selected column: the in-place form acts on a temporary object and
    # leaves the frame unchanged when pandas Copy-on-Write is active.
    data['epistart'] = data['epistart'].fillna(data['epiend'])
    return data

In [3]:
# Load the spreadsheet of HES dementia ICD codes, then preview its first rows.
dementia_codes = pd.read_excel(io='/mnt/maths/tws21/Dementia ICDs.xlsx')
dementia_codes.head()

Unnamed: 0,icd,dx
0,F00,Alzheimer's
1,F00.0,Alzheimer's
2,F00.1,Alzheimer's
3,F00.2,Alzheimer's
4,F00.9,Alzheimer's


In [4]:
# Plain Python list of the ICD codes, used later to filter the diagnosis rows.
dementia_icds = dementia_codes['icd'].tolist()

In [7]:
# Directory containing the linked data extracts. The trailing '/' is required:
# the diagnosis-file glob pattern is built by plain string concatenation.
path = '/mnt/lustre/users/maths/tws21/seb_tmp/bsms2730/bsms2730/Linked Data/'

In [8]:
# Read every diagnosis file, flag rows from outpatient ('hesop') files, stack
# them, and keep only the rows whose ICD code is in the dementia code list.

# glob returns files in filesystem order; sort so the row order of the merged
# table is the same on every run.
diag_files = sorted(glob.glob(path + "*diag*.txt"))
if not diag_files:
    raise FileNotFoundError("No '*diag*.txt' files found under " + repr(path))

diag_list = []
for filename in diag_files:
    df = pd.read_csv(filename, sep="\t")
    # Look at the file name only, so a directory whose name happens to contain
    # 'hesop' cannot flag every file. A scalar is broadcast to all rows.
    df['is_op'] = 1.0 if 'hesop' in os.path.basename(filename) else 0.0
    diag_list.append(df)

diag_merged = pd.concat(diag_list, axis=0)
# .copy() gives formatdatehosp an independent frame to modify, rather than a
# slice of the concatenated table (avoids SettingWithCopyWarning).
diag_merged = diag_merged[diag_merged['icd'].isin(dementia_icds)].copy()
diag_merged = formatdatehosp(diag_merged)

In [9]:
# Preview the first rows of the dementia-filtered diagnoses after date parsing.
diag_merged.head()

Unnamed: 0,e_patid,spno,epikey,epistart,epiend,icd,icdx,d_order,is_op,aekey,diag,diag2,diag3,diaga,diags,diag_order,admidate,discharged,icd_primary
178,100007617,21656866.0,110514700000.0,2010-10-20,2010-10-21,F00.9,-A,8.0,0.0,,,,,,,,,,
186,100007617,21656867.0,111248100000.0,2011-06-19,2011-06-19,F00.9,-A,9.0,0.0,,,,,,,,,,
189,100007617,21656866.0,110514700000.0,2010-10-20,2010-10-21,G30.9,-D,7.0,0.0,,,,,,,,,,
207,100007617,21656867.0,111248100000.0,2011-06-19,2011-06-19,G30.9,-D,8.0,0.0,,,,,,,,,,
222,100007617,21656868.0,502390700000.0,2014-03-27,2014-04-07,F01.9,-,4.0,0.0,,,,,,,,,,


In [10]:
# Number of dementia diagnosis rows kept after filtering.
len(diag_merged)

1291481

In [11]:
# Narrow the table to the five columns that are written to the output file.
dementia_diag_hes = diag_merged.loc[:, ['e_patid', 'icd', 'epistart', 'epiend', 'is_op']]

In [12]:
# Row count should be unchanged by the column selection.
len(dementia_diag_hes)

1291481

In [13]:
# Preview the first rows of the narrowed table before it is saved.
dementia_diag_hes.head()

Unnamed: 0,e_patid,icd,epistart,epiend,is_op
178,100007617,F00.9,2010-10-20,2010-10-21,0.0
186,100007617,F00.9,2011-06-19,2011-06-19,0.0
189,100007617,G30.9,2010-10-20,2010-10-21,0.0
207,100007617,G30.9,2011-06-19,2011-06-19,0.0
222,100007617,F01.9,2014-03-27,2014-04-07,0.0


In [14]:
# Save the narrowed dementia diagnosis table as parquet in the current working directory.
dementia_diag_hes.to_parquet(path=os.getcwd() + '/dementia_hes_diag_data.parquet')

In [15]:
# Sanity check: count the rows that came from outpatient ('hesop') files.
int((dementia_diag_hes['is_op'] == 1.0).sum())

0