# BS Score over time

The primary goal of this notebook is to visualize the BS score over time under various groupings and conditions.

Additionally, some of the processed datasets will be saved to disk for later use.

In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

%matplotlib inline

In [8]:
# Load the cleaned BASMI sheet; its first two columns form the row index.
basmi_raw_df = pd.read_excel('data/clean_basmi.xls', index_col=(0,1))

# Collapse the Drug column into a 0/1 flag (1 when a value is recorded, 0 when
# it is missing), append it as Drug_Indicator and drop the original column.
drug_recorded = basmi_raw_df['Drug'].notnull()
basmi_df = (
    basmi_raw_df
    .assign(Drug_Indicator=drug_recorded.map({False: 0, True: 1}))
    .drop('Drug', axis=1)
)

# Distinct patient ids found in the index
patient_level = basmi_df.index.get_level_values('patient_id')
patients = patient_level.unique()

print('Number of patients in basmi dataset:', len(patients))

basmi_df.head()

Number of patients in basmi dataset: 910


Unnamed: 0_level_0,Unnamed: 1_level_0,CRS,TWS,LSFS,LFS,IMS,BS,Drug_Indicator
patient_id,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
40,1995-05-09,3,1,6,5,3,3.6,0
40,1995-06-01,3,1,8,5,3,4.0,0
40,1995-06-12,2,1,5,3,2,2.6,0
40,1995-11-02,1,1,3,4,2,2.2,0
40,1996-05-02,2,1,4,3,2,2.4,0


## 1.  Normalize the patient timeline

Normalize each patient's timeline to be 0 at the time they joined the study. 

In [25]:
# Show every measurement recorded for patient 40
is_patient_40 = basmi_df.index.get_level_values('patient_id') == 40
basmi_df.loc[is_patient_40]

Unnamed: 0_level_0,Unnamed: 1_level_0,CRS,TWS,LSFS,LFS,IMS,BS,Drug_Indicator
patient_id,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
40,1995-05-09,3,1,6,5,3,3.6,0
40,1995-06-01,3,1,8,5,3,4.0,0
40,1995-06-12,2,1,5,3,2,2.6,0
40,1995-11-02,1,1,3,4,2,2.2,0
40,1996-05-02,2,1,4,3,2,2.4,0
40,1997-02-06,3,1,5,4,2,3.0,0
40,1997-08-07,3,1,5,4,2,3.0,0
40,1998-05-06,3,1,7,4,2,3.4,0
40,1999-03-18,2,1,6,5,2,3.2,0
40,1999-03-31,2,1,6,5,1,3.0,0


In [39]:
def get_norm_years(df):
    """Return the whole number of years elapsed since the earliest date, per row.

    Parameters
    ----------
    df : pandas object whose index has a level named 'Date'.

    Returns
    -------
    list of int
        One entry per row, in row order: the days between that row's date and
        the earliest date in ``df``, divided by 365 and truncated to an integer.
    """
    visit_dates = df.index.get_level_values('Date')
    first_visit = min(visit_dates)

    elapsed_years = []
    for visit in visit_dates:
        days_since_first = pd.Timedelta(visit - first_visit).days
        # A year is approximated as 365 days; leap days are not accounted for.
        elapsed_years.append(int(days_since_first / 365))
    return elapsed_years

# Years-in-study depends only on the (patient_id, Date) index, so it is enough to
# transform a single column. Transforming the whole frame calls get_norm_years
# once per column and returns a multi-column frame, which cannot be assigned to
# the single 'norm_years' column (and breaks on re-run once 'norm_years' exists).
basmi_df['norm_years'] = (
    basmi_df
    .groupby(level=0)['BS']
    .transform(get_norm_years)
    .astype(int)  # keep whole years as integers regardless of the BS dtype
)
basmi_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,CRS,TWS,LSFS,LFS,IMS,BS,Drug_Indicator,norm_years
patient_id,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
40,1995-05-09,3,1,6,5,3,3.6,0,0
40,1995-06-01,3,1,8,5,3,4.0,0,0
40,1995-06-12,2,1,5,3,2,2.6,0,0
40,1995-11-02,1,1,3,4,2,2.2,0,0
40,1996-05-02,2,1,4,3,2,2.4,0,0


## 2. Bin data per year for each patient
If a patient has multiple measurements within a year, take the mean of those measurements.

Now each patient should have one score per year in study.

In [48]:
# BS data aggregated by patient for each year in study
agg_bs_df = basmi_df.groupby(['patient_id','norm_years']).agg({'BS': 'mean'}).reset_index(level=1)
agg_bs_df = agg_bs_df.round(2)
agg_bs_df

Unnamed: 0_level_0,norm_years,BS
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1
40,0,2.96
40,1,3.00
40,2,3.20
40,3,3.10
40,4,3.40
40,5,3.10
40,6,3.60
40,7,3.50
40,8,3.00
40,9,2.80


## 3. Impute the missing values
If a patient missed a year, impute the missing value by assuming a linear progression between the observed years on either side of the gap.

To get the progression rate, take the difference between the BS scores observed before and after the gap and divide it by the number of years between those two observations.


In [65]:
# Build, for every patient, a gap-free yearly BS series running from year 0 to
# the last observed year. Missing years are filled by moving linearly from the
# previous observation towards the next one.
fixed_dfs = []
for patient_id, patient_df in agg_bs_df.groupby('patient_id'):

    years = patient_df['norm_years']
    bs_scores = patient_df['BS']

    if patient_df.shape[0] <= 1:
        # A single observation leaves nothing to interpolate; keep it unchanged.
        fixed_df = pd.DataFrame({'BS': bs_scores, 'norm_years': years, 'patient_id': patient_id})
    else:
        # Yearly change from each observation to the next observed one
        # (NaN for the final observation, which is never followed by a gap).
        rate_of_change = (bs_scores.shift(-1) - bs_scores) / (years.shift(-1) - years)

        # Key both series by study year on fresh objects instead of overwriting
        # the index of columns that belong to the group frame.
        bs_by_year = pd.Series(bs_scores.values, index=years.values)
        rate_by_year = pd.Series(rate_of_change.values, index=years.values)

        # Full range of years the patient was in the study. RangeIndex excludes
        # `stop`, so add 1 to keep the last observed year in the output.
        years_range = pd.RangeIndex(0, stop=int(years.max()) + 1)

        fixed_data = []
        # Reset per patient so values can never leak in from the previous patient.
        last_obs = None
        for year in years_range:
            # If we have data for this year, record it as the last observation
            # (score and rate of change) and add the score to the fixed data.
            if year in bs_by_year.index:
                last_obs = (bs_by_year.loc[year], rate_by_year.loc[year])
                fixed_data.append(last_obs[0])

            elif last_obs is None:
                # Nothing observed yet to extrapolate from; fail loudly rather
                # than produce a value with no basis.
                raise ValueError(
                    'patient {} has no observation at or before year {}'.format(patient_id, year)
                )

            # Else, make a new observation by adding the rate of change to the last BS score
            # and carry it forward as the last observation, keeping the rate of change the same.
            else:
                new_obs = last_obs[0] + last_obs[1]
                fixed_data.append(new_obs)
                last_obs = (new_obs, last_obs[1])

        fixed_df = pd.DataFrame({'BS': fixed_data, 'norm_years': years_range})
        fixed_df['patient_id'] = patient_id

    fixed_dfs.append(fixed_df)

fixed_bs_df = pd.concat(fixed_dfs)

# NOTE(review): writing the legacy .xls format requires a matching Excel writer
# engine in the environment — confirm it is available where this notebook runs.
fixed_bs_df.to_excel('data/bs_normed.xls', index=False)