In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

%matplotlib inline
%load_ext autoreload
%autoreload 2

sns.set()

In [2]:
import sys
# Add the directory two levels above the current working directory to the module
# search path; presumably this is where the project-local `data_utils` module
# imported below lives (relative path - depends on where the kernel was started).
sys.path.append('../../')

import data_utils as util

## Load demographics and BASMI data

In [5]:
# Read the demographics workbook (first spreadsheet column becomes the index)
# and give the two verbose patient_* columns shorter names. No column
# sub-selection is applied: every column of the sheet is kept.
demo_df = pd.read_excel(
    '../../data/demographics and Biologics data.xlsx',
    index_col=0,
).rename(columns={'patient_gender_id': 'gender', 'patient_date_of_diagnosis': 'diagnosis_date'})

# Keep the year of birth as its own single-column frame (for estimating age)
year_of_birth = demo_df['year of Birth'].to_frame()

demo_df.head()

Unnamed: 0_level_0,gender,diagnosis_date,Age at diagnosis,patient_hla_bUnknown7_id,EIBP,patient_condition_subtype,Current biologic,Number of biologics used (not including brand/biosimilar switch),year of Birth
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
40,Female,1994-01-01,46.255989,Positive,False,AS,Infliximab,1,1947
41,Male,2003-01-01,46.644764,Positive,False,AS,Adalimumab,1,1956
43,Male,1978-01-01,20.410678,Positive,False,AS,Adalimumab,1,1957
44,Male,1994-01-01,22.047912,Unknown,False,AS,Adalimumab,1,1971
45,Male,1998-01-01,37.048597,Positive,False,AS,Etanercept,1,1960


In [15]:
# Load the cleaned BASMI sheet using its first two columns as a MultiIndex,
# move the second index level back into an ordinary column, and make sure the
# 'Date' column holds datetimes.
basmi_df = (
    pd.read_excel('../../data/clean_basmi.xls', index_col=(0,1))
    .reset_index(level=1, drop=False)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

basmi_df.head()

Unnamed: 0_level_0,Date,CRS,TWS,LSFS,LFS,IMS,BS,Drug
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
40,1995-05-09,3,1,6,5,3,3.6,
40,1995-06-01,3,1,8,5,3,4.0,
40,1995-06-12,2,1,5,3,2,2.6,
40,1995-11-02,1,1,3,4,2,2.2,
40,1996-05-02,2,1,4,3,2,2.4,


### Merge demographics and BASMI data

In [16]:
# Attach the per-patient demographics to every BASMI measurement by joining
# the two frames on their indexes (default inner join: only index values
# present in both frames are kept).
full_df = basmi_df.merge(demo_df, left_index=True, right_index=True)
full_df.head()

Unnamed: 0_level_0,Date,CRS,TWS,LSFS,LFS,IMS,BS,Drug,gender,diagnosis_date,Age at diagnosis,patient_hla_bUnknown7_id,EIBP,patient_condition_subtype,Current biologic,Number of biologics used (not including brand/biosimilar switch),year of Birth
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
40,1995-05-09,3,1,6,5,3,3.6,,Female,1994-01-01,46.255989,Positive,False,AS,Infliximab,1,1947
40,1995-06-01,3,1,8,5,3,4.0,,Female,1994-01-01,46.255989,Positive,False,AS,Infliximab,1,1947
40,1995-06-12,2,1,5,3,2,2.6,,Female,1994-01-01,46.255989,Positive,False,AS,Infliximab,1,1947
40,1995-11-02,1,1,3,4,2,2.2,,Female,1994-01-01,46.255989,Positive,False,AS,Infliximab,1,1947
40,1996-05-02,2,1,4,3,2,2.4,,Female,1994-01-01,46.255989,Positive,False,AS,Infliximab,1,1947


## Data Pre-processing & Setup

In [17]:
# NOTE: this cell mutates `full_df` in place and is not re-runnable on its own:
# a second run would turn every 'Drug' value into True (booleans are never
# null) and the drop of 'year of Birth' would raise KeyError. Re-run the merge
# cell first if this cell needs to be executed again.

# Convert Drug to binary: True where a drug is recorded, False where the value is missing
full_df['Drug'] = full_df['Drug'].notnull()

# Add patient age (in whole calendar years) at each measurement using year of birth
full_df['Age'] = full_df['Date'].dt.year - full_df['year of Birth']
full_df.drop('year of Birth', axis=1, inplace=True)

# Bin age into 10 bins whose (floored) edges span the observed min..max age
bins = [np.floor(x) for x in np.linspace(full_df['Age'].values.min(), full_df['Age'].values.max(), 11)]
labels = range(1,11)
# pd.cut builds right-closed intervals (a, b]; without include_lowest=True the
# first interval excludes its left edge, so every row at the minimum age would
# silently get Age_cat = NaN instead of category 1.
full_df['Age_cat'] = pd.cut(full_df['Age'], bins=bins, labels=labels, include_lowest=True)

# Drop some columns we don't really need right now
to_drop = ['diagnosis_date','Current biologic','Number of biologics used (not including brand/biosimilar switch)']
full_df.drop(to_drop, axis=1, inplace=True)
full_df.head()

Unnamed: 0_level_0,Date,CRS,TWS,LSFS,LFS,IMS,BS,Drug,gender,Age at diagnosis,patient_hla_bUnknown7_id,EIBP,patient_condition_subtype,Age,Age_cat
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
40,1995-05-09,3,1,6,5,3,3.6,False,Female,46.255989,Positive,False,AS,48,5
40,1995-06-01,3,1,8,5,3,4.0,False,Female,46.255989,Positive,False,AS,48,5
40,1995-06-12,2,1,5,3,2,2.6,False,Female,46.255989,Positive,False,AS,48,5
40,1995-11-02,1,1,3,4,2,2.2,False,Female,46.255989,Positive,False,AS,48,5
40,1996-05-02,2,1,4,3,2,2.4,False,Female,46.255989,Positive,False,AS,49,5


### Save full combined dataset to disk

In [18]:
# Write the merged and pre-processed dataset (index included) to a CSV file
combined_csv_path = '../../data/combined_full.csv'
full_df.to_csv(combined_csv_path)

## Split BASMI data into different cohorts:

* Treatment: Measurements of patients taken while and after they underwent **any** treatment (biologic drugs or rehab)
* No Treatment: Measurements of patients taken while they **had not** yet received any treatment

To be clear, consider the following example: Patient 40 entered the study and, for the first 12 years, neither took biologics nor underwent rehab. After 12 years, however, the patient started using a biologic drug. This means that the first 12 years of measurements of patient 40 will be added to the "no treatment" cohort, and all later measurements will be added to the "treatment" cohort.



In [35]:
# Split the measurements by whether the patient was on a drug at the time
no_drugs_df = full_df[full_df['Drug'] == False]
drugs_df = full_df[full_df['Drug'] == True]

# Two consecutive drug-free measurements exactly this many days apart are
# taken to mark a rehab stay (the later of the two being the end of rehab).
rehab_gap_days = 14

# Get the data for no-treatment cohort
no_treatment_dfs = []
rehab_dfs = []
for patient_id, patient_df in no_drugs_df.groupby('patient_id'):
    # diff() compares each row with the one before it, so the measurements
    # must be in chronological order for the gap test to be meaningful.
    # A stable sort leaves already-ordered data (and same-day ties) untouched.
    patient_df = patient_df.sort_values('Date', kind='mergesort')

    # Rows that close a rehab-length gap, i.e. the date of rehab ending
    rehab_date = patient_df[patient_df['Date'].diff().dt.days == rehab_gap_days]

    # If patient did rehab - clip the data at the first rehab end date
    if not rehab_date.empty:
        # Date of the first detected rehab
        idx = rehab_date.Date.values[0]

        # Data of patient before having any treatment
        no_treatment = patient_df[patient_df['Date'] < idx]

        # Data of patient after getting rehab - must go into treatment cohort
        treatment = patient_df[patient_df['Date'] >= idx]

        rehab_dfs.append(treatment)
        no_treatment_dfs.append(no_treatment)

    # Else just add all the data for the patient
    else:
        no_treatment_dfs.append(patient_df)

print('Full dataset shape: {}'.format(full_df.shape))

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame with the same columns when a cohort has no contributing patients.
empty_cohort = no_drugs_df.iloc[0:0]

no_treatment_df = pd.concat(no_treatment_dfs) if no_treatment_dfs else empty_cohort
print('No-Treatment Dataset shape: {}'.format(no_treatment_df.shape))

# Merge rehab data and drugs data
rehab_df = pd.concat(rehab_dfs) if rehab_dfs else empty_cohort
treatment_df = pd.concat([rehab_df, drugs_df])

print('Treatment Dataset shape: {}'.format(treatment_df.shape))

no_treatment_df.to_csv('../../data/no_treatment_cohort.csv')
treatment_df.to_csv('../../data/treatment_cohort.csv')

Full dataset shape: (14436, 15)
No-Treatment Dataset shape: (11228, 15)
Treatment Dataset shape: (3208, 15)


### Normalize timeline for different cohorts and save to disk

In [41]:
# Run both cohorts through the project helper `util.normalize_timeline`
# (no-treatment cohort first, then the treatment cohort).
nt_cohort_norm, t_cohort_norm = [
    util.normalize_timeline(cohort_df)
    for cohort_df in (no_treatment_df, treatment_df)
]

# Write each normalized cohort to its own CSV file
for normed_df, csv_path in (
    (nt_cohort_norm, '../../data/no_treatment_cohort_normed.csv'),
    (t_cohort_norm, '../../data/treatment_cohort_normed.csv'),
):
    normed_df.to_csv(csv_path)