In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import TrialPathfinder as tp

# Trial PathFinder

## Load Data Tables

TrialPathfinder reads tables as Pandas DataFrames (`pd.DataFrame`) by default. Date columns should be stored as datetime (use `pd.to_datetime` to convert them if they are not).

**1. Features**:
- <font color=darkblue>*Patient ID*</font>
- Treatment Information
    - <font color=darkblue>*Drug name*</font>.
    - <font color=darkblue>*Start date*</font>.
    - <font color=darkblue>*Date of outcome*</font>. For example, if overall survival (OS) is used as metric, the date of outcome is the date of death. If progression-free survival (PFS) is used as metric, the date of outcome is the date of progression.
    - <font color=darkblue>*Date of last visit*</font>. The patient's last record date of visit, used for censoring.
- <font color=darkblue>*Covariates (optional)*</font>: adjusted to emulate the blind assignment, used by Inverse probability of treatment weighting (IPTW) or propensity score matching (PSM). Some examples: age, gender, composite race/ethnicity, histology, smoking status, staging, ECOG, and biomarkers status.

**2. Tables used by eligibility criteria.**
- Use the same Patient ID as the features table.

In [2]:
# 1. Features table.
features = pd.read_csv('data/features.csv')

# 2. Tables referenced by the eligibility criteria.
demographics = pd.read_csv('data/demographics.csv')
lab = pd.read_csv('data/lab.csv')

# Convert every column whose name contains 'Date' to datetime, in place.
for frame in (lab, features, demographics):
    date_columns = [column for column in frame.columns if 'Date' in column]
    for column in date_columns:
        frame[column] = pd.to_datetime(frame[column])

In [3]:
# Preview the first five rows of the features table.
features.head(n=5)

Unnamed: 0,PatientID,DrugName,StartDate,OutcomeDate,LastVisitDate,Gender,Race,ECOG
0,679436,drug A,2017-01-25,2018-02-18,2018-05-29,F,Black,2
1,607096,drug B,2017-01-30,2018-12-17,2019-03-01,F,Black,1
2,690751,drug B,2017-04-22,2017-10-03,2017-10-12,M,White,1
3,887951,drug B,2015-04-20,2016-01-08,2016-06-28,M,White,2
4,667297,drug A,2018-08-06,NaT,2020-09-11,M,White,1


In [4]:
# Preview the first five rows of the demographics table.
demographics.head(n=5)

Unnamed: 0,PatientID,ECOG,BirthDate,Histology
0,679436,2,1984-02-03,Squamous cell carcinoma
1,607096,1,1999-02-04,Non-squamous cell carcinoma
2,690751,1,1979-05-02,Squamous cell carcinoma
3,887951,2,1955-05-05,Non-squamous cell carcinoma
4,667297,1,1953-08-22,Squamous cell carcinoma


In [5]:
# Preview the first five rows of the lab table.
lab.head(n=5)

Unnamed: 0,PatientID,LabName,LabValue,TestDate
0,57562,Platelet count,88.0,2017-06-03
1,930338,Total bilirubin,0.86,2018-08-11
2,529505,Total bilirubin,1.97,2016-11-08
3,933296,Total bilirubin,0.91,2017-11-12
4,482470,Platelet count,132.0,2017-09-15


## Standards of encoding eligibility criteria

We built a computational workflow to encode the description of eligibility criteria in the protocols into standardized instructions which can be parsed by Trial Pathfinder for cohort selection use. 

**1. Basic logic.**

- Name of the criteria is written in the first row.
- A new statement starts with “#inclusion” or “#exclusion” to indicate the criterion’s type. Use “(missing include)” or “(missing exclude)” to specify whether patients with missing entries for the criterion are included; by default, patients with missing entries are included.
- Data name format: `Table['featurename']`. For example, `demographics['BirthDate']` denotes the date-of-birth column in the demographics table.
- Equation: ==, !=, <, <=, >, >=. 
- Logic: AND, OR.
- Other operations: MIN, MAX, ABS.
- Time is encoded as “DAYS(80)”: 80 days; “MONTHS(4)”: 4 months; “YEARS(3)”: 3 years.

---
*Example: criteria "Age" - include patients more than 18 years old when they received the treatment.*

> Age \
\#Inclusion \
features['StartDate'] >= demographics['BirthDate'] + @YEARS(18)

---

**2. Complex rule with hierarchy.**
- Each row is operated in sequential order
    - The tables are prepared before the last row. 
    - The patients are selected at the last row. 

---
*Example: criteria "Platelets" - include patients whose platelet count ≥ 100 x 10^3/μL*. \
To encode this criterion, we follow the procedure: 
1. Prepare the lab table: 
    1. Pick the lab tests for platelet count
    2. The lab test date should be within a -28 to +0 day window around the treatment start date
    3. Use the record closest to the treatment start date to do selection.
2. Select patients: lab value of at least 100 x 10^3/μL.
> Platelets \
\#Inclusion \
(lab['LabName'] == 'Platelet count') \
(lab['TestDate'] >= features['StartDate'] - @DAYS(28) ) AND (lab['TestDate'] <= features['StartDate']) \
MIN(ABS(lab['TestDate'] - features['StartDate'])) \
lab['LabValue'] >= 100 
---

In [6]:
# Flatten the header-less CSV into a 1-D array of encoded rules.
criteria_table = pd.read_csv('data/criteria.csv', header=None)
criteria = criteria_table.to_numpy().ravel()

# Show each encoded rule, separated by a blank line.
for encoded_rule in criteria:
    print(encoded_rule, '\n')

Age
#Inclusion
features['StartDate'] >= demographics['BirthDate'] + @YEARS(18) 

Histology_Squamous
#Inclusion
(demographics['Histology'] == 'Squamous cell carcinoma') 

ECOG
#Inclusion
(features['ECOG'] == 0) OR (features['ECOG'] == 1) 

Platelets
#Inclusion
(lab['LabName'] == 'Platelet count')
(lab['TestDate'] >= features['StartDate'] - @DAYS(28) ) AND (lab['TestDate'] <= features['StartDate'])
MIN(ABS(lab['TestDate'] - features['StartDate']))
lab['LabValue'] >= 100 

Bilirubin
#Inclusion
(lab['LabName'] == 'Total bilirubin')
(lab['TestDate'] >= features['StartDate'] - @DAYS(28) ) AND (lab['TestDate'] <= features['StartDate'])
MIN(ABS(lab['TestDate'] - features['StartDate']))
lab['LabValue'] <= 1 



## Preparation

1. Create an empty cohort object

In [7]:
# Build the cohort object from the patient IDs listed in the features table.
patientids = features.loc[:, 'PatientID']
cohort = tp.cohort_selection(
    patientids,
    name_PatientID='PatientID',
)

2. Add the tables needed in the eligibility criterion.

In [8]:
# Register each table under the name the encoded criteria use to refer to it.
for table_name, table_frame in (
    ('demographics', demographics),
    ('lab', lab),
    ('features', features),
):
    cohort.add_table(table_name, table_frame)

3. Add individual eligibility criterion

In [9]:
# Option 1: register the rules one at a time and report how many patients each excludes.
for rule in criteria[:]:
    name_rule, select, missing = cohort.add_rule(rule)
    n_patients = select.shape[0]
    n_excluded = n_patients - np.sum(select)
    print('Rule %s: exclude patients %d/%d' % (name_rule, n_excluded, n_patients))

# Option 2 (alternative): add the whole list of criteria in one call.
# cohort.add_rules(criteria)

Rule Age: exclude patients 0/4000
Rule Histology_Squamous: exclude patients 2020/4000
Rule ECOG: exclude patients 1797/4000
Rule Platelets: exclude patients 1943/4000
Rule Bilirubin: exclude patients 2316/4000


# Analysis

- Treatment drug: A
- Control drug: B
- Criteria used: Age, ECOG, Histology_Squamous, Platelets, Bilirubin

In [10]:
# Treatment and control arms of the emulated trial.
drug_treatment = "drug A"
drug_control = "drug B"

# Names of the eligibility rules used in the analysis.
name_rules = [
    "Age",
    "Histology_Squamous",
    "ECOG",
    "Platelets",
    "Bilirubin",
]

# Covariate columns of the features table.
covariates = ["Gender", "Race", "ECOG"]

1. Original trial criteria

In [None]:
# Emulate the trial using every rule listed in `name_rules`.
HR, confidence_interval = tp.emulate_trials(
    cohort,
    features,
    drug_treatment,
    drug_control,
    name_rules,
)

2. Fully-relaxed criteria

In [None]:
# Emulate the trial with an empty rule list, i.e. no eligibility criteria applied.
HR, confidence_interval = tp.emulate_trials(
    cohort,
    features,
    drug_treatment,
    drug_control,
    [],
)

3. Compute shapley values

In [None]:
# Compute the Shapley values for the rules in `name_rules`.
# Fix: the rule list is defined as `name_rules`; the previous `names_rules` was undefined (NameError).
shapley_values = tp.shapley_computation(cohort, features, drug_treatment, drug_control, name_rules)

4. Data-driven criteria

In [None]:
# Keep only the rules whose Shapley value is negative, then emulate the trial with them.
# Fixes: (1) the rule list is defined as `name_rules`, not `names_rules`;
#        (2) a plain Python list cannot be indexed with a boolean mask, so the mask is
#            applied through a NumPy array and the result is converted back to a list.
# NOTE(review): assumes `shapley_values` holds one value per entry of `name_rules`,
# in the same order - confirm against tp.shapley_computation.
negative_shapley_mask = np.asarray(shapley_values) < 0
names_rules_relax = np.array(name_rules)[negative_shapley_mask].tolist()
HR, confidence_interval = tp.emulate_trials(cohort, features, drug_treatment, drug_control, names_rules_relax)