In [1]:
# imports
import sys
import os
import socket
import pandas as pd
import glob
import datetime

In [2]:
# read in clean cohort file
cohort = pd.read_parquet(os.getcwd() + '/cleaned_cohort_file.parquet')

In [3]:
# read in symptoms files
symptoms_clin = pd.read_parquet(os.getcwd() + '/first_dementia_symptoms_in_primary_care.parquet')
symptoms_test = pd.read_parquet(os.getcwd() + '/first_dementia_symptoms_in_primary_test.parquet')
symptoms_ref = pd.read_parquet(os.getcwd() + '/first_dementia_symptoms_in_primary_referral.parquet')

In [4]:
cohort.head()

Unnamed: 0,e_patid,first_diagnosis_date,first_diagnosed_in_primary_care,gender,age_at_dx,e_pracid,region,uts,e2015_imd_5,dx_minus6m,dx_minus12m,dx_minus18m,dx_minus24m,dx_minus3y,dx_minus5y
0,24117254,2005-10-04,1.0,1.0,70.0,54.0,10.0,2000-01-12,4.0,2005-04-04,2004-10-04,2004-04-04,2003-10-05,2002-10-05,2000-10-04
1,519045131,2004-02-11,0.0,1.0,74.0,131.0,2.0,1992-12-21,5.0,2003-08-12,2003-02-11,2002-08-12,2002-02-11,2001-02-11,1999-02-11
3,734003216,2009-08-26,1.0,2.0,93.0,216.0,10.0,2003-02-14,4.0,2009-02-24,2008-08-26,2008-02-25,2007-08-27,2006-08-27,2004-08-26
7,10485798,2014-01-02,0.0,1.0,89.0,98.0,6.0,1990-01-03,4.0,2013-07-03,2013-01-02,2012-07-03,2012-01-03,2011-01-03,2009-01-02
8,630194220,2003-05-15,0.0,2.0,74.0,220.0,7.0,1997-09-23,5.0,2002-11-13,2002-05-15,2001-11-13,2001-05-15,2000-05-15,1998-05-15


In [5]:
# limit to files where first dx was in primary care
primary_dx = cohort[cohort['first_diagnosed_in_primary_care'] == 1.0]

In [6]:
# get unique e_patids
primary_e_patids = list(primary_dx['e_patid'])

In [7]:
# limit symptom files to relevant e_patids
prim_sym_clin = symptoms_clin[symptoms_clin['e_patid'].isin(primary_e_patids)]
prim_sym_test = symptoms_test[symptoms_test['e_patid'].isin(primary_e_patids)]
prim_sym_ref = symptoms_ref[symptoms_ref['e_patid'].isin(primary_e_patids)]

In [8]:
prim_sym_clin.shape[0]

10474

In [9]:
prim_sym_test.shape[0]

0

In [10]:
prim_sym_ref.shape[0]

5488

In [11]:
primary_dx.shape[0]

11966

In [12]:
# get unique e_patids for each file
e_patids_clin = prim_sym_clin['e_patid'].unique()
e_patids_test = prim_sym_test['e_patid'].unique()
e_patids_ref = prim_sym_ref['e_patid'].unique()

In [13]:
# get first dementia symptom for each unique patient selected
first_dementia_symptoms_primary = []
for i in primary_e_patids:
    dates = list(cohort[cohort['e_patid'] == i]['first_diagnosis_date'])
    if i in e_patids_clin:
        dates.append(min(prim_sym_clin[prim_sym_clin['e_patid'] == i]['eventdate']))
    if i in e_patids_test:
        dates.append(min(prim_sym_test[prim_sym_test['e_patid'] == i]['eventdate']))
    if i in e_patids_ref:
        dates.append(min(prim_sym_ref[prim_sym_ref['e_patid'] == i]['eventdate']))
    first_dementia_symptoms_primary.append(min(dates))

In [14]:
# create first symptom date column
primary_dx['first_symptom_date'] = first_dementia_symptoms_primary

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [15]:
primary_dx.head()

Unnamed: 0,e_patid,first_diagnosis_date,first_diagnosed_in_primary_care,gender,age_at_dx,e_pracid,region,uts,e2015_imd_5,dx_minus6m,dx_minus12m,dx_minus18m,dx_minus24m,dx_minus3y,dx_minus5y,first_symptom_date
0,24117254,2005-10-04,1.0,1.0,70.0,54.0,10.0,2000-01-12,4.0,2005-04-04,2004-10-04,2004-04-04,2003-10-05,2002-10-05,2000-10-04,2005-08-08
3,734003216,2009-08-26,1.0,2.0,93.0,216.0,10.0,2003-02-14,4.0,2009-02-24,2008-08-26,2008-02-25,2007-08-27,2006-08-27,2004-08-26,2009-03-23
9,76546093,1994-12-15,1.0,1.0,66.0,93.0,3.0,1989-05-01,5.0,1994-06-15,1993-12-15,1993-06-15,1992-12-15,1991-12-16,1989-12-15,1994-12-15
11,395313206,2011-11-14,1.0,2.0,81.0,206.0,8.0,1998-12-22,3.0,2011-05-15,2010-11-14,2010-05-15,2009-11-14,2008-11-14,2006-11-14,2011-11-14
19,728236135,2002-03-02,1.0,1.0,67.0,135.0,8.0,1996-01-04,4.0,2001-08-31,2001-03-02,2000-08-31,2000-03-02,1999-03-03,1997-03-02,2002-01-08


In [16]:
# create time from symptom to diagnosis column
primary_dx['time_from_first_symptom_to_dx'] = primary_dx['first_diagnosis_date'] - primary_dx['first_symptom_date']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [17]:
primary_dx.head()

Unnamed: 0,e_patid,first_diagnosis_date,first_diagnosed_in_primary_care,gender,age_at_dx,e_pracid,region,uts,e2015_imd_5,dx_minus6m,dx_minus12m,dx_minus18m,dx_minus24m,dx_minus3y,dx_minus5y,first_symptom_date,time_from_first_symptom_to_dx
0,24117254,2005-10-04,1.0,1.0,70.0,54.0,10.0,2000-01-12,4.0,2005-04-04,2004-10-04,2004-04-04,2003-10-05,2002-10-05,2000-10-04,2005-08-08,57 days
3,734003216,2009-08-26,1.0,2.0,93.0,216.0,10.0,2003-02-14,4.0,2009-02-24,2008-08-26,2008-02-25,2007-08-27,2006-08-27,2004-08-26,2009-03-23,156 days
9,76546093,1994-12-15,1.0,1.0,66.0,93.0,3.0,1989-05-01,5.0,1994-06-15,1993-12-15,1993-06-15,1992-12-15,1991-12-16,1989-12-15,1994-12-15,0 days
11,395313206,2011-11-14,1.0,2.0,81.0,206.0,8.0,1998-12-22,3.0,2011-05-15,2010-11-14,2010-05-15,2009-11-14,2008-11-14,2006-11-14,2011-11-14,0 days
19,728236135,2002-03-02,1.0,1.0,67.0,135.0,8.0,1996-01-04,4.0,2001-08-31,2001-03-02,2000-08-31,2000-03-02,1999-03-03,1997-03-02,2002-01-08,53 days


In [18]:
primary_dx.describe()

Unnamed: 0,e_patid,first_diagnosed_in_primary_care,gender,age_at_dx,e_pracid,region,e2015_imd_5,time_from_first_symptom_to_dx
count,11966.0,11966.0,11966.0,11966.0,11966.0,11966.0,11693.0,11966
mean,277352400.0,1.0,1.633211,78.418519,135.799682,6.33637,3.267168,506 days 16:15:29.032258064
std,255586500.0,0.0,0.481948,8.668188,75.1609,2.536472,1.384737,1250 days 02:39:41.099861328
min,8370.0,1.0,1.0,42.0,1.0,1.0,1.0,0 days 00:00:00
25%,47418960.0,1.0,1.0,73.0,69.0,5.0,2.0,0 days 00:00:00
50%,191617200.0,1.0,2.0,79.0,138.0,7.0,3.0,39 days 00:00:00
75%,500206000.0,1.0,2.0,85.0,206.0,8.0,4.0,463 days 00:00:00
max,799767200.0,1.0,2.0,108.0,257.0,10.0,5.0,29421 days 00:00:00


In [19]:
# check quantiles for time to diagnosis
primary_dx['time_from_first_symptom_to_dx'].quantile(q = [0.1 * i for i in range(1,11)])

0.1       0 days
0.2       0 days
0.3       0 days
0.4       0 days
0.5      39 days
0.6     124 days
0.7     308 days
0.8     672 days
0.9    1450 days
1.0   29421 days
Name: time_from_first_symptom_to_dx, dtype: timedelta64[ns]

In [20]:
primary_dx['time_from_first_symptom_to_dx'].quantile(q = 2/3)

Timedelta('220 days 15:59:59.999999868')

In [21]:
late_threshold = primary_dx['time_from_first_symptom_to_dx'].quantile(q = 2/3)

In [22]:
timely_threshold = primary_dx['time_from_first_symptom_to_dx'].quantile(q = 1/3)
timely_threshold

Timedelta('0 days 00:00:00')

In [23]:
# set late diagnosis by time after first symptom to in highest tertile
late_by_symptoms = primary_dx[primary_dx['time_from_first_symptom_to_dx'] > late_threshold]

In [24]:
late_e_patids = list(late_by_symptoms['e_patid'])

In [25]:
late_by_symptoms.shape[0]

3989

In [26]:
cohort_e_patids = cohort['e_patid']

In [27]:
# add late dx column including all patients specified as late above plus any first diagnosed in HES
cohort_out = cohort.copy(deep = True)
late_dx = []
for i in cohort_e_patids:
    if list(cohort[cohort['e_patid'] == i]['first_diagnosed_in_primary_care']) != [1.0]:
        late_dx.append(1.0)
    elif i in late_e_patids:
        late_dx.append(1.0)
    else:
        late_dx.append(0.0)
cohort_out['late_dx'] = late_dx

In [28]:
cohort_out.head()

Unnamed: 0,e_patid,first_diagnosis_date,first_diagnosed_in_primary_care,gender,age_at_dx,e_pracid,region,uts,e2015_imd_5,dx_minus6m,dx_minus12m,dx_minus18m,dx_minus24m,dx_minus3y,dx_minus5y,late_dx
0,24117254,2005-10-04,1.0,1.0,70.0,54.0,10.0,2000-01-12,4.0,2005-04-04,2004-10-04,2004-04-04,2003-10-05,2002-10-05,2000-10-04,0.0
1,519045131,2004-02-11,0.0,1.0,74.0,131.0,2.0,1992-12-21,5.0,2003-08-12,2003-02-11,2002-08-12,2002-02-11,2001-02-11,1999-02-11,1.0
3,734003216,2009-08-26,1.0,2.0,93.0,216.0,10.0,2003-02-14,4.0,2009-02-24,2008-08-26,2008-02-25,2007-08-27,2006-08-27,2004-08-26,0.0
7,10485798,2014-01-02,0.0,1.0,89.0,98.0,6.0,1990-01-03,4.0,2013-07-03,2013-01-02,2012-07-03,2012-01-03,2011-01-03,2009-01-02,1.0
8,630194220,2003-05-15,0.0,2.0,74.0,220.0,7.0,1997-09-23,5.0,2002-11-13,2002-05-15,2001-11-13,2001-05-15,2000-05-15,1998-05-15,1.0


In [29]:
cohort_out[cohort_out['first_diagnosed_in_primary_care'] == 1.0]['late_dx'].describe()

count    11966.000000
mean         0.333361
std          0.471434
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          1.000000
Name: late_dx, dtype: float64

In [30]:
# save to parquet
cohort_out.to_parquet(os.getcwd() + '/late_by_symptom_date_cohort_file.parquet')