In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## all features of each dataset

This table lists every column in each dataset. There are a lot of them, so it is useful to see them all in one place.

In [3]:
# Build one wide table with a column per data-dictionary sheet; each column
# holds that sheet's 'Field name' entries.
datadict = pd.ExcelFile('./Simulacrum-data-dictionary.xlsx')
sheets = datadict.sheet_names[1:]  # the first sheet is skipped

# Collect one named Series per sheet and concatenate once at the end, rather
# than re-concatenating a growing DataFrame on every iteration.
field_names = []
for sheet in sheets:
    df = pd.read_excel(datadict, sheet_name=sheet)
    field_names.append(df['Field name'].rename(sheet))

# Sheets have different numbers of fields; shorter columns are padded with NaN.
DF = pd.concat(field_names, axis=1) if field_names else pd.DataFrame()

DF

Unnamed: 0,AV_PATIENT,AV_TUMOUR,SACT PATIENT,SACT TUMOUR,SACT REGIMEN,SACT OUTCOME,SACT CYCLE,SACT DRUG_DETAIL
0,PATIENTID,TUMOURID,MERGED_PATIENT_ID,MERGED_TUMOUR_ID,MERGED_REGIMEN_ID,MERGED_OUTCOME_ID,MERGED_CYCLE_ID,MERGED_DRUG_DETAIL_ID
1,SEX,PATIENTID,LINK_NUMBER,MERGED_PATIENT_ID,MERGED_TUMOUR_ID,MERGED_REGIMEN_ID,MERGED_REGIMEN_ID,MERGED_CYCLE_ID
2,LINKNUMBER,DIAGNOSISDATEBEST,,CONSULTANT_SPECIALITY_CODE,HEIGHT_AT_START_OF_REGIMEN,DATE_OF_FINAL_TREATMENT,CYCLE_NUMBER,ORG_CODE_OF_DRUG_PROVIDER
3,ETHNICITY,SITE_ICD10_O2,,PRIMARY_DIAGNOSIS,WEIGHT_AT_START_OF_REGIMEN,REGIMEN_MOD_DOSE_REDUCTION,START_DATE_OF_CYCLE,ACTUAL_DOSE_PER_ADMINISTRATION
4,DEATHCAUSECODE_1A,SITE_ICD10_O2_3CHAR,,MORPHOLOGY_CLEAN,INTENT_OF_TREATMENT,REGIMEN_MOD_TIME_DELAY,OPCS_PROCUREMENT_CODE,OPCS_DELIVERY_CODE
5,DEATHCAUSECODE_1B,MORPH_ICD10_O2,,,DATE_DECISION_TO_TREAT,REGIMEN_MOD_STOPPED_EARLY,PERF_STATUS_START_OF_CYCLE,ADMINISTRATION_ROUTE
6,DEATHCAUSECODE_1C,BEHAVIOUR_ICD10_O2,,,START_DATE_OF_REGIMEN,REGIMEN_OUTCOME_SUMMARY,MERGED_PATIENT_ID,ADMINISTRATION_DATE
7,DEATHCAUSECODE_2,T_BEST,,,MAPPED_REGIMEN,MERGED_PATIENT_ID,MERGED_TUMOUR_ID,DRUG_GROUP
8,DEATHCAUSECODE_UNDERLYING,N_BEST,,,CLINICAL_TRIAL,MERGED_TUMOUR_ID,,MERGED_PATIENT_ID
9,DEATHLOCATIONCODE,M_BEST,,,CHEMO_RADIATION,,,MERGED_TUMOUR_ID


# patient

In [None]:
# Read the patient table from the Simulacrum release and preview its first rows.
av_patient_csv = './simulacrum_release_v1.1.0/sim_av_patient.csv'
av_patient = pd.read_csv(av_patient_csv)
display(av_patient.head())

frequency of each death cause:

In [None]:
# Count how often each underlying death-cause code occurs and draw the counts
# as a single stacked bar with one segment per code.
vc = av_patient['DEATHCAUSECODE_UNDERLYING'].value_counts()
ax = vc.to_frame().T.plot.bar(stacked=True)
# The legend is limited to the first 14 entries of vc.
ax.legend(vc.index[:14])
ax.set_xticklabels(labels=['deathcause'], rotation=0)

# tumour

In [None]:
# Read the tumour table, preview its first rows and report its dimensions.
av_tumour_csv = './simulacrum_release_v1.1.0/sim_av_tumour.csv'
av_tumour = pd.read_csv(av_tumour_csv, low_memory=False)
display(av_tumour.head())
print(av_tumour.shape)

Making some bar plots of the tumour features, colour coded with the tumour site:

In [None]:
# Tumour columns examined in the plots that follow.
feature_columns = [
    'SITE_ICD10_O2_3CHAR', 'BEHAVIOUR_ICD10_O2', 'T_BEST', 'N_BEST', 'M_BEST',
    'STAGE_BEST', 'STAGE_BEST_SYSTEM', 'GRADE', 'AGE',
    'CANCERCAREPLANINTENT', 'PERFORMANCESTATUS', 'ACE27', 'LATERALITY',
]
tumour_features = av_tumour[feature_columns]

# Number of tumours per site; the cell displays the 20 most frequent sites.
vc = av_tumour['SITE_ICD10_O2_3CHAR'].value_counts()
vc.head(20)  # top20tumoursites

In these plots, colour indicates the tumour site; the legend shows only the 20 most common sites.

In [None]:
# One stacked bar chart per tumour feature: x = feature value, bar segments =
# tumour sites, bar height = number of tumours.
tf_keys = list(tumour_features.keys())
tf_keys.remove('SITE_ICD10_O2_3CHAR')
top_sites = list(vc[:20].keys())  # the 20 most frequent sites, used for the legend

fig, axes = plt.subplots(nrows=len(tf_keys), ncols=1, figsize=(10,4*len(tf_keys)))
fig.tight_layout()
for i,col in enumerate(tf_keys):
    # Rows = values of `col`, columns = sites, cells = tumour counts
    # (NaN where a value/site combination never occurs).
    feature = (
        tumour_features
        .groupby([col, 'SITE_ICD10_O2_3CHAR'])
        .size()
        .unstack('SITE_ICD10_O2_3CHAR')
    )
    ax = feature.plot(kind='bar', stacked=True, ax=axes[i])
    handles, labels = ax.get_legend_handles_labels()
    # A top site can be absent from this plot (e.g. every value of `col` is
    # missing for it), so it has no legend handle. Skip such sites instead of
    # letting labels.index() raise ValueError.
    legend_sites = [site for site in top_sites if site in labels]
    legend_handles = [handles[labels.index(site)] for site in legend_sites]
    ax.legend(legend_handles, legend_sites, loc='best',
              bbox_to_anchor=(0.62, 0.7, 0.5, 0.5), prop={'size':9})
    ax.set_ylabel('counts')
plt.subplots_adjust(hspace=0.5)

# merging

av_tumour and av_patient

In [None]:
# Join each tumour to its patient record on PATIENTID (pandas' default inner
# join), then print the shapes of both inputs and of the merged frame.
tumour_patient = av_tumour.merge(av_patient, on='PATIENTID')
print(av_tumour.shape, av_patient.shape, tumour_patient.shape)

Create a column with the number of days between the date of diagnosis and the date the vital status was recorded.

In [None]:
from datetime import date  # not used in this cell

# Parse both date columns so they can be subtracted.
diagnosis_dates = pd.to_datetime(tumour_patient['DIAGNOSISDATEBEST'])
vitalstat_dates = pd.to_datetime(tumour_patient['VITALSTATUSDATE'])

# Whole days from diagnosis to vital-status date; .dt.days converts the
# timedeltas in one vectorised step instead of a Python-level loop.
tumour_patient['VITAL - DIAGNOSIS (days)'] = (vitalstat_dates - diagnosis_dates).dt.days

# Keep only non-negative durations (some have vital status in 1900).
# .copy() makes the filtered frame independent, so re-running this cell or
# assigning columns later does not write into a slice of the merged frame.
tumour_patient = tumour_patient[tumour_patient['VITAL - DIAGNOSIS (days)'] >= 0].copy()

print("stats for days being alive")
tumour_patient['VITAL - DIAGNOSIS (days)'].describe()

Plots of the mean number of days alive after diagnosis, for patients who died

In [None]:
# Rows whose vital status is 'D'.
dead_patient = tumour_patient.loc[tumour_patient['NEWVITALSTATUS'].eq('D')]

# Drop rows whose behaviour code is 'XXX' or missing, then cast the behaviour
# code to int and the day count to float.
not_xxx = dead_patient['BEHAVIOUR_ICD10_O2'].ne('XXX')
time_behav = (
    dead_patient.loc[not_xxx]
    .dropna(subset=['BEHAVIOUR_ICD10_O2'])
    .astype({'BEHAVIOUR_ICD10_O2': int, 'VITAL - DIAGNOSIS (days)': float})
)

# Per (site, behaviour) pair: number of rows and mean day count.
summary_columns = ['SITE_ICD10_O2_3CHAR', 'BEHAVIOUR_ICD10_O2', 'VITAL - DIAGNOSIS (days)']
mean_days_alive = (
    time_behav[summary_columns]
    .groupby(['SITE_ICD10_O2_3CHAR', 'BEHAVIOUR_ICD10_O2'])
    .agg({'SITE_ICD10_O2_3CHAR': 'size', 'VITAL - DIAGNOSIS (days)': 'mean'})
    .rename(columns={'SITE_ICD10_O2_3CHAR': 'site count',
                     'VITAL - DIAGNOSIS (days)': 'mean days alive'})
    .reset_index()
)

In [None]:
# Bubble chart of mean days alive against behaviour code: colour = tumour
# site, bubble size = 'site count'.
p = sns.relplot(
    data=mean_days_alive,
    x="BEHAVIOUR_ICD10_O2",
    y="mean days alive",
    hue="SITE_ICD10_O2_3CHAR",
    size="site count",
    sizes=(10, 1000),
    alpha=.7,
    palette="bright",
    height=5,
)

# Replace seaborn's automatic legend with one that lists only the first 20
# sites in vc.
handles, labels = p.ax.get_legend_handles_labels()
legend_sites = vc[:20].keys()
top20index = [labels.index(site) for site in legend_sites]
top20handles = [handles[position] for position in top20index]
p.ax.legend(
    top20handles,
    legend_sites,
    loc='right',
    bbox_to_anchor=(0.8, 0.25, 0.5, 0.5),
    prop={'size': 9},
)
p._legend.remove()

Different colors mean different tumour sites

The sizes of the blobs represent the number of people who have that tumour

There isn't a clear correlation between the time alive and the behaviour of the tumour (how bad the tumour is).

# sact_cycle

In [None]:
# Load the SACT cycle table and preview its first rows.
sact_cycle_csv = 'simulacrum_release_v1.1.0/sim_sact_cycle.csv'
sact_cycle = pd.read_csv(sact_cycle_csv)
sact_cycle.head()

In [None]:
# Print the table's dimensions, then up to six distinct values per column.
print('shape ', sact_cycle.shape)

for col in sact_cycle.columns:
    sample_values = sact_cycle[col].unique()[:6]
    print(col, " ", sample_values)

# drug_detail

In [None]:
# Load the SACT drug-detail table and preview its first rows.
drug_detail_csv = 'simulacrum_release_v1.1.0/sim_sact_drug_detail.csv'
drug_detail = pd.read_csv(drug_detail_csv)
drug_detail.head()

In [None]:
# Print the table's dimensions, then up to six distinct values per column.
print('shape ', drug_detail.shape)

for col in drug_detail.columns:
    sample_values = drug_detail[col].unique()[:6]
    print(col, " ", sample_values)

# outcome

In [None]:
# Load the SACT outcome table and preview its first rows.
outcome_csv = 'simulacrum_release_v1.1.0/sim_sact_outcome.csv'
outcome = pd.read_csv(outcome_csv)
outcome.head()

In [None]:
# Print the table's dimensions, then up to six distinct values per column.
print('shape ', outcome.shape)

for col in outcome.columns:
    sample_values = outcome[col].unique()[:6]
    print(col, " ", sample_values)

# sact_patient

In [None]:
# Load the SACT patient table and preview its first rows.
sact_patient_csv = 'simulacrum_release_v1.1.0/sim_sact_patient.csv'
sact_patient = pd.read_csv(sact_patient_csv)
sact_patient.head()

In [None]:
# Print the table's dimensions, then up to six distinct values per column.
print('shape ', sact_patient.shape)

for col in sact_patient.columns:
    sample_values = sact_patient[col].unique()[:6]
    print(col, " ", sample_values)

# sact_regimen

`sim_sact_regimen.csv` is loaded with an explicit encoding to avoid a `UnicodeDecodeError`; see https://stackoverflow.com/questions/18171739/unicodedecodeerror-when-reading-csv-file-in-pandas-with-python

In [None]:
# Load the SACT regimen table, decoding the file as ISO-8859-1, and preview
# its first rows.
sact_regimen_csv = 'simulacrum_release_v1.1.0/sim_sact_regimen.csv'
sact_regimen = pd.read_csv(sact_regimen_csv, encoding="ISO-8859-1")
sact_regimen.head()

In [None]:
# Scratch expression: takes element 1 of range(9), which evaluates to 1.
# It references none of the loaded tables.
range(9)[1]

# sact_tumour

In [None]:
# Load the SACT tumour table. The regimen and patient tables are read again
# here from the same files, with the same arguments, as in their own sections.
sact_release_dir = 'simulacrum_release_v1.1.0/'
sact_tumour = pd.read_csv(sact_release_dir + 'sim_sact_tumour.csv')
sact_regimen = pd.read_csv(sact_release_dir + 'sim_sact_regimen.csv', encoding="ISO-8859-1")
sact_patient = pd.read_csv(sact_release_dir + 'sim_sact_patient.csv')
sact_tumour.head()

In [None]:
# Print the table's dimensions, then up to six distinct values per column.
print('shape ', sact_tumour.shape)

for col in sact_tumour.columns:
    sample_values = sact_tumour[col].unique()[:6]
    print(col, " ", sample_values)