In [1]:
import pandas as pd
import os
import math

In [2]:
def generate_empty_dataframe(patients,bin_width):
    bins = []
    for i in range(0,360,bin_width):
        bins.append("{} - {}".format(i,i+bin_width))
    bins.append("{}+".format(i+bin_width))
    return pd.DataFrame(index=patients,columns=bins,data=0)

def populate_df(raw_data, bin_width):
    data = raw_data.dropna(subset=['ANON_ID','time_since_last_TX'])
    df = generate_empty_dataframe(sorted(data['ANON_ID'].unique()),BIN_WIDTH)
    for i,row in data.iterrows():
        if row['time_since_last_TX'] < 0:
            icol = 0
        else:
            icol = math.floor(row['time_since_last_TX'] / bin_width)
        icol = min(icol,len(df.columns) - 1)
        col = df.columns[icol]
        df.at[row['ANON_ID'],col] += 1
    mean_responses = pd.Series(df.mean().round(0).astype(int),name='mean')
    total_responses = pd.Series(df.sum(),name='total')
    df = df.append([mean_responses,total_responses])
    return df

def check_data(data,early=1,late=1):
    print("Checking which patients have survey results to cover early xerostomia, as defined by the first {} bin(s).".format(early))
    void = len(data[data.iloc[:,:early].sum(axis=1)==0])
    total = len(data)
    print("{}/{} patients have data for early xerostomia.\n".format(total - void, total))
    
    print("Checking which patients have survey results to cover late xerostomia, as defined by the last {} bin(s).".format(late))
    void2 = len(data[data.iloc[:,-late:].sum(axis=1)==0])
    print("{}/{} patients have data for late xerostomia.\n".format(total - void2, total))
    
    print("Checking which patients have survey results to cover both early and late.")
    void3 = len(data[(data.iloc[:,:early].sum(axis=1)==0)|(data.iloc[:,-late:].sum(axis=1)==0)])
    print("{}/{} patients have data for both.".format(total - void3, total))

In [4]:
general = pd.read_csv("HeadAndNeckQOLSurvey-General_anon_with_time_since.csv")
xero = pd.read_csv("HeadAndNeckXeQOLSurv_anon_with_time_since.csv")

# Assessing distribution of time since treatment
Using the date of last treatment as treatment completion date, we can determine the time elapsed since treatment completion for each of the survey responses.

We'll check the distributions for each survey independently (general QOL and xerostomia QOL).

## Methodology

Prior to this workbook, I used a function to map each survey date to the date of last treatment and assign a "time since last treatment" value to every survey. This workbook is only for visualization: we'll break time into 30 day blocks and quantify the number of surveys for each patient that fall within each bin, from 0-30 all the way to 330-360 and finally 360+.

## General QOL Survey

In [10]:
BIN_WIDTH = 30

anonID = sorted(general['ANON_ID'].unique())
empty_df = generate_empty_dataframe(anonID,BIN_WIDTH)
gen_df = populate_df(general,BIN_WIDTH)

display(gen_df)

Unnamed: 0,0 - 30,30 - 60,60 - 90,90 - 120,120 - 150,150 - 180,180 - 210,210 - 240,240 - 270,270 - 300,300 - 330,330 - 360,360+
017_051,2,1,0,0,0,1,0,0,1,0,1,0,4
017_052,2,1,0,1,0,0,0,0,0,1,0,0,0
017_054,0,0,0,0,0,0,0,0,0,0,0,0,2
017_055,2,1,0,1,0,0,1,0,0,1,0,0,5
017_056,3,0,0,1,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
018_130,3,0,0,1,0,0,0,0,0,1,0,0,3
018_131,3,0,1,0,0,0,0,0,0,1,0,0,1
018_132,3,0,0,1,0,1,0,0,0,1,0,0,2
mean,2,0,0,1,0,0,0,0,0,0,0,0,2


In [11]:
check_data(gen_df)

Checking which patients have survey results to cover early xerostomia, as defined by the first 1 bin(s).
214/221 patients have data for early xerostomia.

Checking which patients have survey results to cover late xerostomia, as defined by the last 1 bin(s).
129/221 patients have data for late xerostomia.

Checking which patients have survey results to cover both early and late.
125/221 patients have data for both.


In [12]:
check_data(gen_df,early=1,late=2)

Checking which patients have survey results to cover early xerostomia, as defined by the first 1 bin(s).
214/221 patients have data for early xerostomia.

Checking which patients have survey results to cover late xerostomia, as defined by the last 2 bin(s).
131/221 patients have data for late xerostomia.

Checking which patients have survey results to cover both early and late.
127/221 patients have data for both.


## Xerostomia-specific survey

In [13]:
BIN_WIDTH = 30

anonIDx = sorted(xero['ANON_ID'].unique())
empty_dfx = generate_empty_dataframe(anonID,BIN_WIDTH)
xer_df = populate_df(xero,BIN_WIDTH)

display(xer_df)

Unnamed: 0,0 - 30,30 - 60,60 - 90,90 - 120,120 - 150,150 - 180,180 - 210,210 - 240,240 - 270,270 - 300,300 - 330,330 - 360,360+
017_057,0,0,0,0,0,0,0,0,0,0,0,0,3
017_067,2,0,0,0,0,0,0,0,0,1,0,0,0
017_068,0,0,0,0,2,0,0,0,0,1,0,0,4
017_069,1,0,0,0,1,0,0,0,1,1,0,0,3
017_075,1,1,0,0,0,0,1,0,1,0,0,0,4
017_076,0,1,0,1,0,0,0,1,1,0,0,0,1
017_080,0,0,0,0,0,0,0,0,0,0,0,0,2
017_081,1,1,1,0,0,1,0,0,0,0,0,0,2
017_086,0,0,0,0,0,0,0,0,0,0,0,0,1
017_096,0,0,0,0,0,0,1,1,0,0,0,0,0


In [14]:
check_data(xer_df)

Checking which patients have survey results to cover early xerostomia, as defined by the first 1 bin(s).
13/22 patients have data for early xerostomia.

Checking which patients have survey results to cover late xerostomia, as defined by the last 1 bin(s).
19/22 patients have data for late xerostomia.

Checking which patients have survey results to cover both early and late.
11/22 patients have data for both.


In [15]:
check_data(xer_df,early=4,late=1)

Checking which patients have survey results to cover early xerostomia, as defined by the first 4 bin(s).
14/22 patients have data for early xerostomia.

Checking which patients have survey results to cover late xerostomia, as defined by the last 1 bin(s).
19/22 patients have data for late xerostomia.

Checking which patients have survey results to cover both early and late.
12/22 patients have data for both.


# Assess representation of xerostomia in survey responses

In [16]:
xero_cols = [col for col in general.columns if any((("sticky_saliva" in col),("dry_mouth" in col)))]

In [17]:
early_responses = general[general['time_since_last_TX'] < 30]
late_responses = general[general['time_since_last_TX'] > 360]

In [19]:
patient_early = early_responses.groupby('ANON_ID').max()[['dry_mouth','sticky_saliva']]
patient_late = late_responses.groupby('ANON_ID').max()[['dry_mouth','sticky_saliva']]
labels = pd.DataFrame(index=anonID,columns=['early','late'])

In [26]:
data = late_responses

for patient in anonID:
    dm_score = data[data['ANON_ID']==patient]['dry_mouth'].max()
    ss_score = data[data['ANON_ID']==patient]['sticky_saliva'].max()
    overall_score = (dm_score + ss_score) / 2
    if overall_score > 2.5:
        labels.at[patient,'late'] = 1
    else:
        labels.at[patient,'late'] = 0

In [43]:
labels[(labels['early']==0)&(labels['late']==1)].shape

(17, 2)

In [39]:
127 - 43

84