In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

%matplotlib inline
%load_ext autoreload
%autoreload 2

sns.set()

In [2]:
import sys
sys.path.append('../')

import data_utils as util

## Load Demographics  and BASMI data

In [3]:
demo_df = pd.read_excel('../data/demographics and Biologics data.xlsx', index_col=0)

# Get year of birth for estimating age
year_of_birth = pd.DataFrame(demo_df['year of Birth'])

# Subselect and rename some columns
demo_df = demo_df[['patient_gender_id','patient_date_of_diagnosis']]
demo_df.rename(columns={'patient_gender_id': 'gender', 'patient_date_of_diagnosis': 'diagnosis_date'}, inplace=True)

basmi_df = pd.read_excel('../data/clean_basmi.xls', index_col=(0,1)).reset_index(level=1, drop=False)
basmi_df.head()

Unnamed: 0_level_0,Date,CRS,TWS,LSFS,LFS,IMS,BS,Drug
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
40,1995-05-09,3,1,6,5,3,3.6,
40,1995-06-01,3,1,8,5,3,4.0,
40,1995-06-12,2,1,5,3,2,2.6,
40,1995-11-02,1,1,3,4,2,2.2,
40,1996-05-02,2,1,4,3,2,2.4,


## Data Pre-processing & Setup

In [4]:
# Add year of birth to BASMI data
df = pd.merge(basmi_df, year_of_birth, left_index=True, right_index=True)

# Convert Drug to binary
df['Drug'] = df['Drug'].notnull()

# Add patient age using year of birth
df['Age'] = df['Date'].dt.year - df['year of Birth']
df.drop('year of Birth', axis=1, inplace=True)

# Bin age into 10 bins
bins = [np.floor(x) for x in np.linspace(df['Age'].values.min(), df['Age'].values.max(), 11)]
labels = range(1,11)
df['Age_cat'] = pd.cut(df['Age'], bins=bins, labels=labels)

df.head()

Unnamed: 0_level_0,Date,CRS,TWS,LSFS,LFS,IMS,BS,Drug,Age,Age_cat
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
40,1995-05-09,3,1,6,5,3,3.6,False,48,5
40,1995-06-01,3,1,8,5,3,4.0,False,48,5
40,1995-06-12,2,1,5,3,2,2.6,False,48,5
40,1995-11-02,1,1,3,4,2,2.2,False,48,5
40,1996-05-02,2,1,4,3,2,2.4,False,49,5


### Normalize patient timeline

In [5]:
def get_norm_years(dates):
    """
    Different from get_norm_years in data_utils
    
    ->  Calculates normlized timeline using start year as 0 
        and increments for each subsequent year
    
    ->  Originally increments for every year included in study
    
    For example, if patient joined in May 1995, next year increment would
    have been June 1996. Now it will increment on Jan 1996. 
    """
    
    years = [d.year for d in dates]
    start_year = min(years)
    norm_years = [year - start_year for year in years]
    
    return norm_years


df['norm_years'] = df.groupby(['patient_id'])['Date'].transform(get_norm_years)

df.head()

Unnamed: 0_level_0,Date,CRS,TWS,LSFS,LFS,IMS,BS,Drug,Age,Age_cat,norm_years
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
40,1995-05-09,3,1,6,5,3,3.6,False,48,5,0
40,1995-06-01,3,1,8,5,3,4.0,False,48,5,0
40,1995-06-12,2,1,5,3,2,2.6,False,48,5,0
40,1995-11-02,1,1,3,4,2,2.2,False,48,5,0
40,1996-05-02,2,1,4,3,2,2.4,False,49,5,1


### Aggregate by year in study

For each patient, calculate aggregated scores for every year in the study

* TODO - generalize this to be able to aggregate for other periods

In [6]:
def agg_by_norm_year(df, agg_dict={}):
    return df.groupby(['patient_id','norm_years']).agg(agg_dict)

# Aggregate the data for each patient by normalized year
# Keep the actual year of each norm_year
agg_dict={'BS': 'mean', 'Age': min, 'Drug': np.any, 'Date': lambda x: x.iloc[0].year}
agg_df = agg_by_norm_year(df,agg_dict).rename(columns={'Date': 'year'})

# Add Gender and Diagnosis Date to aggregated, normalized data
agg_df = demo_df.join(agg_df).reset_index().set_index('patient_id')


# Add time since diagnosis variable
agg_df['time_since_diagnosis'] = agg_df['year'] - agg_df['diagnosis_date'].dt.year
agg_df.drop('diagnosis_date', axis=1, inplace=True)
agg_df.head()

Unnamed: 0_level_0,norm_years,gender,BS,Age,Drug,year,time_since_diagnosis
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
40,0,Female,3.1,48,False,1995,1.0
40,1,Female,2.4,49,False,1996,2.0
40,2,Female,3.0,50,False,1997,3.0
40,3,Female,3.4,51,False,1998,4.0
40,4,Female,3.3,52,False,1999,5.0


# Implement LMM 

Start with using `Drug` and `norm_years` as the predictors and using `BS` as the outcome. 

Thus, we have:

\begin{equation}
\mathbf{y} = \mathbf{X}\mathbf{\beta} + \mathbf{Z}\mathbf{u} + \mathbf{\epsilon}
\end{equation}

In [35]:
ids_list = agg_df.index 
patient_ids = ids_list.unique()

n_patients = len(patient_ids)
n_samples = agg_df.shape[0]

ids_to_idx = {p_id:idx for idx,p_id in enumerate(patient_ids)}
samples_idx = list(range(n_samples))

y = agg_df['BS'].values

X = agg_df[['norm_years', 'Drug']]
X['Drug'] = X["Drug"].astype(int)



# Z = np.zeros(shape=(n_samples, n_patients))

# # populate the Z array with 1's for each sample in the column
# # corresponding to patient 
# for idx, row in enumerate(Z):
#     # patient ID that this sample belongs to
#     p_id = ids_list[idx]
    
#     # The column index of this ID
#     p_id_idx = ids_to_idx[p_id]
    
#     row[p_id_idx] = 1
    
# Z.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


(8472, 688)

In [43]:
X_fe = X['Drug']
X_re = X['norm_years']

In [45]:
from statsmodels.regression.mixed_linear_model import MixedLM
lm = MixedLM(endog=y, exog=X_fe.values, groups=ids_list, exog_re = X_re.values)

result = lm.fit()

result

<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x108577400>

In [53]:
result.bse

array([ 0.14978339,  0.00317433])

In [47]:
result.random_effects

{40: x_re1    0.298459
 dtype: float64, 41: x_re1    1.093174
 dtype: float64, 43: x_re1    0.115722
 dtype: float64, 44: x_re1    0.46347
 dtype: float64, 45: x_re1    0.350435
 dtype: float64, 46: x_re1    0.195203
 dtype: float64, 47: x_re1    0.431782
 dtype: float64, 50: x_re1    0.026166
 dtype: float64, 51: x_re1    0.303885
 dtype: float64, 52: x_re1    0.152704
 dtype: float64, 53: x_re1    0.533961
 dtype: float64, 54: x_re1    0.586661
 dtype: float64, 55: x_re1    0.537485
 dtype: float64, 57: x_re1    0.188981
 dtype: float64, 59: x_re1    0.532256
 dtype: float64, 60: x_re1    0.463901
 dtype: float64, 61: x_re1    0.421323
 dtype: float64, 63: x_re1    0.297384
 dtype: float64, 65: x_re1    0.112247
 dtype: float64, 67: x_re1    0.207302
 dtype: float64, 68: x_re1    0.097625
 dtype: float64, 69: x_re1    0.22988
 dtype: float64, 70: x_re1    0.387373
 dtype: float64, 71: x_re1    0.339904
 dtype: float64, 72: x_re1    0.251682
 dtype: float64, 73: x_re1    0.089757
 dty