In [1]:
import os
from pathlib import Path

import pandas as pd

In [2]:
# Input and output locations.
# The defaults are the original local folders; set PC_DATA_DIR and/or
# PC_OUTPUT_DIR in the environment to run the notebook on another machine
# without editing this cell.
data_path = Path(os.environ.get('PC_DATA_DIR', '/Users/sumanth/personal_project/data/'))
output_path = Path(os.environ.get('PC_OUTPUT_DIR', '/Users/sumanth/personal_project/outputs/pc_cancer_post_20'))
# Create the output folder (and any missing parents) up front; no error if it already exists.
output_path.mkdir(parents=True, exist_ok=True)

In [3]:
# Load the patient and condition tables from data_path.
# Both reads use the python parser engine with on_bad_lines='skip', so malformed
# rows are dropped rather than raising.
# START is converted to datetime; values that cannot be parsed become NaT
# (errors='coerce').

patients = pd.read_csv(
    data_path / 'patients.csv',
    engine='python',
    on_bad_lines='skip',
)
conditions = pd.read_csv(
    data_path / 'conditions.csv',
    engine='python',
    on_bad_lines='skip',
).assign(START=lambda frame: pd.to_datetime(frame['START'], errors='coerce'))

In [4]:
# Substrings used to flag a condition DESCRIPTION as cancer-related.

cancer_terms = [
    "malignant neoplasm",
    "carcinoma",
    "cancer",
    "sarcoma",
    "melanoma",
    "lymphoma",
    "leukemia",
    "myeloma",
    "metastasis",
    "in situ",
]

In [5]:
# Cohort filters:
#   1. keep condition rows whose DESCRIPTION contains any cancer term,
#   2. take each patient's earliest START among those rows,
#   3. keep patients whose earliest START is on or after 2020-01-01.

filter_cancer_tf = conditions['DESCRIPTION'].str.contains(
    "|".join(cancer_terms),  # single alternation pattern over all terms
    case=False,              # case-insensitive match
    na=False,                # missing descriptions count as non-matches
)
cancer_conditions = conditions.loc[filter_cancer_tf]

first_dx = (
    cancer_conditions
    .groupby('PATIENT')['START']
    .min()
    .rename('FIRST_DIAGNOSIS')
    .reset_index()
)
first_dx_after_2020 = first_dx.loc[first_dx['FIRST_DIAGNOSIS'] >= '2020-01-01']

In [6]:
# Final cohort: patient rows joined to their first-diagnosis rows.
# The inner join keeps only patients whose Id appears as PATIENT in
# first_dx_after_2020; the row count is printed as a quick check.

final_pat_cohort = pd.merge(
    patients,
    first_dx_after_2020,
    how='inner',
    left_on='Id',
    right_on='PATIENT',
)
print("Cancer patients after 2020:", len(final_pat_cohort))

Cancer patients after 2020: 275


In [7]:
# Write the cohort's distinct patient IDs to the output folder as a
# single-column CSV (column name: pat_id), without the index.

final_pat_cohort_ids = (
    final_pat_cohort
    .loc[:, ["PATIENT"]]
    .rename(columns={"PATIENT": "pat_id"})
    .drop_duplicates()
)
final_pat_cohort_ids.to_csv(output_path / "pc_ids_cancer_post_2020.csv", index=False)

In [8]:
# Show the cohort ID table as this cell's output.
final_pat_cohort_ids

Unnamed: 0,pat_id
0,b1dae81e-faef-19a6-7699-559279927773
1,553b64e6-a7bc-a9cd-1650-2f599d9a7528
2,6c8e53d0-e87f-fabf-e294-017bfa736121
3,d5493a8b-548c-70bd-8805-baeb010baedf
4,431ecf25-cf87-33ce-a29a-14fb0a3c320f
...,...
270,2925ff42-8bcd-555d-f745-ef79e06438b7
271,5a0235bb-4053-0fd8-6bfb-a9ba93ebacaf
272,449cb9c4-0627-5c4f-6486-3ab4186c8c6e
273,ab4b974f-9b2d-2354-fcf0-f6b22f92c99d
