In [1]:
import pandas as pd
import numpy as np

In [2]:
# Registries of column names by feature type. Later cells add to them and the
# final cell writes them to binary_feature.txt / numerical_feature.txt.
bin_feat_list, num_feat_list = [], []

# Raw characteristics table that the following cells rename and recode.
df = pd.read_csv("characteristic.csv")

In [3]:
# Map the raw column codes to the readable names used by every later cell.
# Columns that are not listed here (for example mask_id) keep their name.
column_labels = {
    'ADHERENC': 'adherence',
    'agecat': 'age',
    'ARM': 'arms',
    'BAD_TOX': 'serious adverse effect',
    'BWL_OBS': 'bowel obstruction',
    'BWL_PERF': 'bowel perforation',
    'HISTO_G': 'histology',
    'PS': 'ECOG performance score',
    'racecat': 'race',
    'SEX': 'sex',
    'wild': 'biomarker KRAS',
    'bmi2': 'bmi',
}
df = df.rename(columns=column_labels)

In [4]:
# Expand the single-letter arm codes into regimen descriptions. Values that
# are not one of these codes are left unchanged by `replace`.
# NOTE(review): codes 'E' and 'F' receive the same description - confirm that
# collapsing them is intended.
arm_descriptions = {
    'A': 'Oxaliplatin + 5-fluorouracil/Leucovorin',
    'B': '5-fluorouracil/Leucovorin + Irinotecan',
    'C': '5-fluorouracil/Leucovorin + Irinotecan (KRAS mutant)',
    'D': 'Oxaliplatin + 5-fluorouracil/Leucovorin + Cetuximab',
    'E': '5-fluorouracil/Leucovorin + Cetuximab + Irinotecan',
    'F': '5-fluorouracil/Leucovorin + Cetuximab + Irinotecan',
}
df['arms'] = df['arms'].replace(arm_descriptions)

In [5]:
# Recode the numeric adherence flag (1 -> 'Yes', 2 -> 'No'). Any other value,
# including a missing one, passes through `replace` unchanged.
df = df.assign(adherence=df['adherence'].replace({1: 'Yes', 2: 'No'}))

# Register the column as a binary feature.
bin_feat_list.append('adherence')

In [6]:
# Replace each age band with a single number so that age can be used as a
# numerical feature: '< 40' -> 40, '40-69' -> 55, '>= 70' -> 70.
age_band_to_number = {'< 40': 40, '40-69': 55, '>= 70': 70}
age_numeric = df['age'].replace(age_band_to_number)
df['age'] = age_numeric.astype(float)

# Register the column as a numerical feature.
num_feat_list.append('age')

In [7]:
# Binary flag: 'Yes' only where the raw value equals 1; every other value
# (including a missing one) becomes 'No'.
df['serious adverse effect'] = (
    df['serious adverse effect'].eq(1).map({True: 'Yes', False: 'No'})
)
bin_feat_list.append('serious adverse effect')

In [8]:
# Binary flag: 'Yes' only where the raw value equals 1; every other value
# (including a missing one) becomes 'No'.
df['bowel obstruction'] = (
    df['bowel obstruction'].eq(1).map({True: 'Yes', False: 'No'})
)
bin_feat_list.append('bowel obstruction')

In [9]:
# Binary flag: 'Yes' only where the raw value equals 1; every other value
# (including a missing one) becomes 'No'.
df['bowel perforation'] = (
    df['bowel perforation'].eq(1).map({True: 'Yes', False: 'No'})
)
bin_feat_list.append('bowel perforation')

In [10]:
# Recode the numeric histology grade into text labels. Values other than 1
# and 2 are left unchanged by `replace`.
histology_labels = {1: 'poorly differentiated', 2: 'well differentiated'}
df['histology'] = df['histology'].replace(histology_labels)

In [11]:
# Missing performance scores are filled with 0 before the column is cast to
# integers (an integer cast fails while NaN is still present).
ecog_filled = df['ECOG performance score'].fillna(0)
df['ECOG performance score'] = ecog_filled.astype(int)

# Register the column as a numerical feature.
num_feat_list.append('ECOG performance score')

In [12]:
# Spell out the abbreviated race codes; other values are left unchanged.
race_labels = {'b': 'black', 'w': 'white', 'oth': 'others'}
df['race'] = df['race'].replace(race_labels)

In [13]:
# Spell out the one-letter sex codes; other values are left unchanged.
df = df.assign(sex=df['sex'].replace({'m': 'male', 'f': 'female'}))

In [14]:
# 0 -> 'mutant', 1 -> 'wild-type'; rows with no value get the label
# 'interminate'.
# NOTE(review): 'interminate' looks like a misspelling of 'indeterminate'. It
# is kept as-is because the label ends up in the exported CSV - confirm with
# downstream consumers before renaming it.
kras_status = df['biomarker KRAS'].replace({0: 'mutant', 1: 'wild-type'})
df['biomarker KRAS'] = kras_status.fillna('interminate')

In [15]:
# Impute missing BMI values with the median of the observed ones.
bmi_median = df['bmi'].median()
df['bmi'] = df['bmi'].fillna(bmi_median)

# Register the column as a numerical feature.
num_feat_list.append('bmi')

In [16]:
# Columns carried into the processed table, in output order. mask_id is kept
# only as the join key for the later merges and is dropped before export.
processed_columns = [
    'adherence',
    'age',
    'arms',
    'serious adverse effect',
    'bowel obstruction',
    'bowel perforation',
    'histology',
    'ECOG performance score',
    'race',
    'sex',
    'biomarker KRAS',
    'bmi',
    'mask_id',
]
df_processed = df[processed_columns]

In [17]:
# Display the assembled feature table (last expression of the cell).
df_processed

Unnamed: 0,adherence,age,arms,serious adverse effect,bowel obstruction,bowel perforation,histology,ECOG performance score,race,sex,biomarker KRAS,bmi,mask_id
0,No,55.0,Oxaliplatin + 5-fluorouracil/Leucovorin,No,No,No,well differentiated,0,others,female,wild-type,20.974482,1
1,No,55.0,Oxaliplatin + 5-fluorouracil/Leucovorin + Cetu...,No,No,No,well differentiated,0,white,male,wild-type,29.137170,2
2,No,55.0,Oxaliplatin + 5-fluorouracil/Leucovorin + Cetu...,No,No,No,well differentiated,0,white,female,wild-type,49.270950,3
3,No,55.0,Oxaliplatin + 5-fluorouracil/Leucovorin,No,No,No,well differentiated,0,white,male,mutant,31.572831,4
4,Yes,55.0,Oxaliplatin + 5-fluorouracil/Leucovorin + Cetu...,No,No,No,poorly differentiated,0,white,male,wild-type,16.776938,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2963,No,55.0,Oxaliplatin + 5-fluorouracil/Leucovorin + Cetu...,No,No,No,well differentiated,0,white,male,mutant,29.620181,2964
2964,No,70.0,Oxaliplatin + 5-fluorouracil/Leucovorin,No,No,No,poorly differentiated,0,white,male,wild-type,32.151914,2965
2965,Yes,55.0,Oxaliplatin + 5-fluorouracil/Leucovorin,Yes,No,No,well differentiated,0,white,male,mutant,26.215516,2966
2966,No,55.0,Oxaliplatin + 5-fluorouracil/Leucovorin + Cetu...,No,No,No,well differentiated,0,black,female,wild-type,37.669095,2967


In [18]:
# Outcome table; a later cell joins it to the features through mask_id.
df_obj = pd.read_csv(filepath_or_buffer="objectives.csv")

In [19]:
# Expose the fustat8 column under the generic name 'target_label'; the
# original column is kept as well.
df_obj = df_obj.assign(target_label=df_obj['fustat8'])

In [20]:
# Toxicity records; the following cells read its 'tox', 'GRADE' and 'mask_id'
# columns.
df_tox = pd.read_csv(filepath_or_buffer='tox.csv')

In [21]:
# 1 where GRADE is strictly greater than 3, otherwise 0 (a missing grade
# compares as False and therefore gives 0).
# NOTE(review): grade 3 itself is not counted as serious - confirm the cut-off.
df_tox['is serious'] = df_tox['GRADE'].gt(3).astype('int64')

In [22]:
# Distinct toxicity names, most frequent first.
ae_list = df_tox['tox'].value_counts().index.tolist()

# Add one float indicator column per toxicity name: 1.0 on rows that record
# that toxicity AND are flagged in 'is serious', 0.0 everywhere else.
ae_name_list = []
for tox_name in ae_list:
    indicator_col = 'adverse effect: ' + tox_name.lower()
    matches_tox = (df_tox['tox'] == tox_name).astype(float)
    df_tox[indicator_col] = matches_tox * df_tox['is serious']
    ae_name_list.append(indicator_col)

In [23]:
# Collapse to one row per mask_id. Taking the maximum of each indicator means
# a column is 1 if any record for that mask_id was flagged.
df_tox = (
    df_tox[[*ae_name_list, 'mask_id']]
    .groupby('mask_id')
    .max()
    .reset_index()
)

In [24]:
# Column sums of the per-mask_id table. For the 0/1 indicator columns this is
# the number of mask_ids flagged for that adverse effect; the mask_id entry is
# just the sum of the ids and carries no meaning.
df_tox.sum()

mask_id                                 1945799.0
adverse effect: paresthesias                  8.0
adverse effect: diarrhea                     15.0
adverse effect: acne/rash                     6.0
adverse effect: fatigue                       7.0
adverse effect: infection                    14.0
adverse effect: nausea                        2.0
adverse effect: stomatitis/mucositis          0.0
adverse effect: thrombosis                   44.0
adverse effect: hypersensitivity             16.0
adverse effect: vomiting                      3.0
adverse effect: dyspnea                       5.0
adverse effect: febrile neutropenia           8.0
adverse effect: anorexia                      1.0
adverse effect: hypomagnesemia                8.0
adverse effect: infarction                   10.0
adverse effect: pneumonia                     1.0
adverse effect: pneumonitis                   6.0
adverse effect: weight loss                   0.0
dtype: float64

In [37]:
# Adverse-effect indicator columns carried into the final dataset.
sub_ae_cols = [
    'adverse effect: thrombosis',
    'adverse effect: hypersensitivity',
    'adverse effect: infarction',
    'adverse effect: diarrhea',
]

# Register them as binary features. Only names that are not registered yet are
# added, so running this cell more than once cannot put duplicates into
# bin_feat_list (and from there into binary_feature.txt).
bin_feat_list += [col for col in sub_ae_cols if col not in bin_feat_list]

In [26]:
# Attach the selected adverse-effect indicators to every row of the feature
# table. The join key is given explicitly; a left join keeps rows whose
# mask_id has no entry in df_tox.
df_processed = df_processed.merge(
    df_tox[sub_ae_cols + ['mask_id']],
    on='mask_id',
    how='left',
)

# Rows without a toxicity entry come out of the join as NaN in the indicator
# columns; treat them as "not observed" (0). The fill is limited to those
# columns on purpose: filling the whole frame would also turn genuinely
# missing values in unrelated columns (age, histology, race, ...) into 0.
df_processed[sub_ae_cols] = df_processed[sub_ae_cols].fillna(0)

In [27]:
# Present the 0/1 adverse-effect indicators with the same 'Yes'/'No' labels
# as the other binary features.
indicator_labels = {0: 'No', 1: 'Yes'}
df_processed[sub_ae_cols] = df_processed[sub_ae_cols].replace(indicator_labels)

In [32]:
# Attach the target label. No key or join type is given, so pandas joins on
# the columns the two frames share and uses an inner join: rows without a
# matching entry in df_obj are dropped.
df_processed = pd.merge(df_processed, df_obj[['target_label', 'mask_id']])

In [35]:
# Export the final table without the mask_id join key. The row index is
# written as the first (unnamed) CSV column because to_csv keeps it by default.
df_processed.drop(columns='mask_id').to_csv('NCT00079274.csv')

In [38]:
# Persist the feature-type registries, one column name per line.
# The cells that fill the registries append unconditionally, so a cell that
# was executed more than once leaves repeated names behind. dict.fromkeys
# drops the repeats while keeping first-seen order, so each name is written
# exactly once.
for path, names in (
    ('numerical_feature.txt', num_feat_list),
    ('binary_feature.txt', bin_feat_list),
):
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(name + '\n' for name in dict.fromkeys(names))