In [12]:
# imports
import sys
import os
import socket
import pandas as pd
import glob
import datetime

In [13]:
# Load the HES diagnosis file that sits in the current working directory.
hes_dx_path = os.getcwd() + '/first_dementia_records_in_hes.parquet'
dementia_dx_hes = pd.read_parquet(hes_dx_path)

In [14]:
# Load the three primary-care diagnosis files (primary care records, test and
# referral) from the current working directory.
primary_dx_dir = os.getcwd()
dementia_dx_primary_clin = pd.read_parquet(
    primary_dx_dir + '/first_dementia_records_in_primary_care.parquet'
)
dementia_dx_primary_test = pd.read_parquet(
    primary_dx_dir + '/first_dementia_dx_in_primary_test.parquet'
)
dementia_dx_primary_ref = pd.read_parquet(
    primary_dx_dir + '/first_dementia_dx_in_primary_referral.parquet'
)

In [15]:
# Distinct patient identifiers (e_patid) present in each source frame,
# in order of first appearance.
e_patids_hes = pd.unique(dementia_dx_hes['e_patid'])
e_patids_clin = pd.unique(dementia_dx_primary_clin['e_patid'])
e_patids_test = pd.unique(dementia_dx_primary_test['e_patid'])
e_patids_ref = pd.unique(dementia_dx_primary_ref['e_patid'])

In [16]:
# Union of the patient ids seen in any of the three primary-care frames,
# materialised as a list (one entry per patient).
e_patids_primary = list(
    set(e_patids_clin) | set(e_patids_test) | set(e_patids_ref)
)

In [17]:
# get first dx in any primary file for primary dx e_patids
#
# Stack the e_patid/eventdate columns of the three primary-care frames and take
# the earliest eventdate per patient in a single grouped pass. The previous
# per-patient loop re-filtered every frame once per patient and used `in` on
# numpy arrays (a linear scan), which is quadratic in the number of patients.
#
# NOTE: groupby(...).min() skips missing (NaT) eventdates, whereas the builtin
# min() used before could return an order-dependent result when NaT was present.
primary_eventdates = pd.concat(
    [
        dementia_dx_primary_clin[['e_patid', 'eventdate']],
        dementia_dx_primary_test[['e_patid', 'eventdate']],
        dementia_dx_primary_ref[['e_patid', 'eventdate']],
    ],
    ignore_index=True,
)
earliest_primary_by_patid = primary_eventdates.groupby('e_patid')['eventdate'].min()

# Re-align to e_patids_primary so this list stays positionally matched with it
# (the next cell zips the two together into a DataFrame).
first_dementia_code_dates_primary = (
    earliest_primary_by_patid.reindex(e_patids_primary).tolist()
)

In [18]:
# Pair each primary-care patient id with its earliest primary-care event date.
first_dx_dates_primary = pd.DataFrame(
    {
        "e_patid": e_patids_primary,
        "eventdate": first_dementia_code_dates_primary,
    }
)

In [19]:
# get first hes dx
#
# Earliest epistart per patient, computed in one grouped pass instead of
# filtering the whole HES frame once per patient (quadratic in patient count).
# groupby(...).min() also skips missing (NaT) dates, unlike the builtin min().
earliest_hes_by_patid = dementia_dx_hes.groupby('e_patid')['epistart'].min()

# Re-align to e_patids_hes so the list stays positionally matched with it.
first_dementia_code_dates_hes = earliest_hes_by_patid.reindex(e_patids_hes).tolist()

In [20]:
# Pair each HES patient id with its earliest HES episode start date.
first_dx_dates_hes = pd.DataFrame(
    {
        "e_patid": e_patids_hes,
        "epistart": first_dementia_code_dates_hes,
    }
)

In [21]:
# Every patient id that has a dementia dx in the primary files, the HES file,
# or both.
e_patids_all = list(set(e_patids_primary) | set(e_patids_hes))

In [22]:
# get first dx date in all files and whether it was in primary care
#
# Both per-source tables are looked up by e_patid in one vectorised pass.
# A patient absent from a source gets NaT for that source. The previous version
# substituted today's date as a "missing" sentinel, which made the result depend
# on the day the notebook was run and misclassified any record dated today or
# later. It also did a list-membership scan plus a full frame filter per patient.
primary_by_patid = first_dx_dates_primary.groupby('e_patid')['eventdate'].min()
hes_by_patid = first_dx_dates_hes.groupby('e_patid')['epistart'].min()

# Align both lookups to e_patids_all so everything stays positionally matched.
primary_date_for_all = primary_by_patid.reindex(e_patids_all)
hes_date_for_all = hes_by_patid.reindex(e_patids_all)

# Primary care counts as "first" when its date is strictly earlier than the HES
# date (a same-day tie is NOT primary-first, as before), or when the patient has
# no HES date at all. Comparisons involving NaT evaluate to False.
primary_strictly_earlier = primary_date_for_all < hes_date_for_all
primary_only = primary_date_for_all.notna() & hes_date_for_all.isna()
primary_is_first = primary_strictly_earlier | primary_only

# Where primary is first take its date; otherwise the HES date is the minimum
# (or the only date available).
earliest_date_for_all = primary_date_for_all.where(primary_is_first, hes_date_for_all)

first_dx_dates_all = earliest_date_for_all.tolist()
is_dx_in_primary = primary_is_first.astype(float).tolist()

In [23]:
# Assemble the combined table: one row per patient with the earliest diagnosis
# date and a 1.0/0.0 flag for whether primary care came first.
first_dx_all_df = pd.DataFrame(
    {
        'e_patid': e_patids_all,
        'first_diagnosis_date': first_dx_dates_all,
        'first_diagnosed_in_primary_care': is_dx_in_primary,
    }
)

In [24]:
# Preview the first five rows of the combined table.
first_dx_all_df.head(n=5)

Unnamed: 0,e_patid,first_diagnosis_date,first_diagnosed_in_primary_care
0,792199168,2010-11-25,1.0
1,24117254,2005-10-04,1.0
2,519045131,2004-02-11,0.0
3,63438860,1996-09-26,1.0
4,734003216,2009-08-26,1.0


In [25]:
# Summary statistics for the combined table (sanity check on row count and on
# the share of patients flagged as first diagnosed in primary care).
first_dx_all_df.describe()

Unnamed: 0,e_patid,first_diagnosed_in_primary_care
count,206753.0,206753.0
mean,310912100.0,0.247551
std,254960700.0,0.431591
min,1732.0,0.0
25%,60673190.0,0.0
50%,270672500.0,0.0
75%,535296300.0,0.0
max,800003100.0,1.0


In [26]:
# Write the combined table to a parquet file in the current working directory.
first_dx_all_path = os.getcwd() + '/first_dementia_dx_all.parquet'
first_dx_all_df.to_parquet(first_dx_all_path)

In [27]:
# Total number of rows (patients) in the combined table.
len(first_dx_all_df)

206753

In [28]:
# Number of rows whose primary-care-first flag equals 1.0.
int((first_dx_all_df['first_diagnosed_in_primary_care'] == 1.0).sum())

51182

In [29]:
# Number of rows whose primary-care-first flag is anything other than 1.0.
int((first_dx_all_df['first_diagnosed_in_primary_care'] != 1.0).sum())

155571

In [30]:
# Spot check: primary-care date(s) recorded for patient 519045131.
first_dx_dates_primary.loc[first_dx_dates_primary['e_patid'] == 519045131, 'eventdate']

Series([], Name: eventdate, dtype: datetime64[ns])

In [31]:
# Spot check: HES date(s) recorded for patient 519045131.
first_dx_dates_hes.loc[first_dx_dates_hes['e_patid'] == 519045131, 'epistart']

148632   2004-02-11
Name: epistart, dtype: datetime64[ns]

In [32]:
# Spot check: primary-care date(s) recorded for patient 792199168.
first_dx_dates_primary.loc[first_dx_dates_primary['e_patid'] == 792199168, 'eventdate']

0   2010-11-25
Name: eventdate, dtype: datetime64[ns]

In [33]:
# Spot check: HES date(s) recorded for patient 792199168.
first_dx_dates_hes.loc[first_dx_dates_hes['e_patid'] == 792199168, 'epistart']

185262   2011-02-03
Name: epistart, dtype: datetime64[ns]