In [1]:
from sas7bdat import SAS7BDAT
import random
import numpy as np
random.seed(1234)
np.random.seed(1234)

In [2]:
binary_feat_list = []
num_feat_list = []

In [3]:
with SAS7BDAT('demog.sas7bdat', skip_header=False) as reader:
    df = reader.to_data_frame()
df_processed = df[['AGENO','RUSUBJID']].rename(columns={'AGENO':'age'})
num_feat_list.append('age')

In [4]:
with SAS7BDAT('ae.sas7bdat', skip_header=False) as reader:
    df = reader.to_data_frame()
df = df[df['SERIOUS'] == 'Yes']
ae_list = df['AE_SEL'].value_counts()[1:6].index.tolist()
ae_cols = []
for ae in ae_list:
    ae_name = 'adverse effect: ' + ae.lower()
    df[ae_name] = np.zeros(len(df))
    df.loc[df['AE_SEL']==ae,ae_name] = 1
    ae_cols.append(ae_name)
df_ae = df[ae_cols+['RUSUBJID']].groupby('RUSUBJID').max().reset_index()
df_ae.replace({1:'Yes',0:'No'},inplace=True)
binary_feat_list.extend(ae_cols)

In [5]:
df_processed = df_processed.merge(df_ae,on='RUSUBJID',how='left')

In [6]:
df_processed[ae_cols] = df_processed[ae_cols].fillna('No')

In [7]:
df_processed[num_feat_list]=df_processed[num_feat_list].fillna(df_processed[num_feat_list].median())

In [8]:
with SAS7BDAT('antitumo.sas7bdat', skip_header=False) as reader:
    df = reader.to_data_frame()
tumo_med_list = df['CTX_L'].value_counts()[:5].index.tolist()
tumo_med_cols = []
for med in tumo_med_list:
    med_name = 'anti-tumor therapy: ' + med.lower()
    df[med_name] = np.zeros(len(df))
    df.loc[df['CTX_L']==med, med_name] = 1
    tumo_med_cols.append(med_name)
df_tumomed = df[tumo_med_cols+['RUSUBJID']].groupby('RUSUBJID').max().reset_index()
df_tumomed.replace({1:'Yes',0:'No'},inplace=True)
binary_feat_list.extend(tumo_med_cols)

In [9]:
df_processed = df_processed.merge(df_tumomed,on='RUSUBJID',how='left')
df_processed[tumo_med_cols] = df_processed[tumo_med_cols].fillna('No')

In [10]:
with SAS7BDAT('diag2.sas7bdat', skip_header=False) as reader:
    df = reader.to_data_frame()
df = df[['RESALN','POSALN','RUSUBJID']].rename(columns={'RESALN':'number of resected axillary node', 'POSALN':'numer of positive axillary node'})
num_feat_list.append('number of resected axillary node')
num_feat_list.append('numer of positive axillary node')

In [11]:
df_processed = df_processed.merge(df, on='RUSUBJID', how='left')

In [12]:
with SAS7BDAT('death.sas7bdat', skip_header=False) as reader:
    df = reader.to_data_frame()
df['target_label'] = np.ones(len(df))
df_processed = df_processed.merge(df[['target_label','RUSUBJID']],on='RUSUBJID',how='left')
df_processed['target_label'] = df_processed['target_label'].fillna(0)

In [13]:
with SAS7BDAT('diag3.sas7bdat', skip_header=False) as reader:
    df = reader.to_data_frame()
df = df.rename(columns={'PTSIZE':'primary tumor size'})
df_processed = df_processed.merge(df[['primary tumor size', 'RUSUBJID']],on='RUSUBJID',how='left')
num_feat_list.append('primary tumor size')

In [14]:
with SAS7BDAT('hormrec.sas7bdat', skip_header=False) as reader:
    df = reader.to_data_frame()

In [15]:
df['estrogen receptor positive'] = df['ERSTA'].apply(lambda x: 1 if x == 'Positive' else 0)
df['progesterone receptor positive'] = df['PGRSTA'].apply(lambda x: 1 if x == 'Positive' else 0)
df = df[['estrogen receptor positive','progesterone receptor positive','RUSUBJID']].groupby('RUSUBJID').max().reset_index()

In [16]:
df[['estrogen receptor positive','progesterone receptor positive']] = df[['estrogen receptor positive','progesterone receptor positive']].replace({1:'Yes',0:'No'})
df_processed = df_processed.merge(df,on='RUSUBJID',how='left')
binary_feat_list.append('estrogen receptor positive')
binary_feat_list.append('progesterone receptor positive')

In [17]:
df_processed[binary_feat_list] = df_processed[binary_feat_list].fillna('No')

In [18]:
with SAS7BDAT('vital.sas7bdat', skip_header=False) as reader:
    df = reader.to_data_frame()

In [19]:
df = df.rename(columns={'HT':'height','WT': 'weight'})
df = df[['weight','height','RUSUBJID']].groupby('RUSUBJID').max().reset_index()
df['weight'] = df['weight'].replace({'A':np.nan,'> 125':np.nan, 'I':np.nan, '> 275':np.nan})
df = df.fillna(df.median())
df_processed = df_processed.merge(df, on='RUSUBJID',how='left')
num_feat_list.append('weight')
num_feat_list.append('height')

  df = df.fillna(df.median())


In [20]:
df_processed[num_feat_list] = df_processed[num_feat_list].fillna(df_processed[num_feat_list].median())

In [21]:
df_processed.drop('RUSUBJID',axis=1).to_csv('NCT00312208.csv')
with open('numerical_feature.txt','w') as f:
    for x in num_feat_list:
        f.write(x+'\n')
with open('binary_feature.txt','w') as f:
    for x in binary_feat_list:
        f.write(x+'\n')

In [22]:
df_processed

Unnamed: 0,age,RUSUBJID,adverse effect: febrile neutropenia,adverse effect: infection (documented clinically),adverse effect: infection without neutropenia(specify),adverse effect: vomiting,adverse effect: nausea,anti-tumor therapy: xeloda,anti-tumor therapy: taxotere,anti-tumor therapy: arimidex,anti-tumor therapy: zoladex,anti-tumor therapy: cyclophosphamide,number of resected axillary node,numer of positive axillary node,target_label,primary tumor size,estrogen receptor positive,progesterone receptor positive,weight,height
0,43.0,000301-000-901-000,No,No,No,No,No,No,No,No,No,No,21.0,3.0,0.0,3.0,No,No,95.0,160.0
1,39.0,000301-000-901-001,Yes,No,No,No,No,Yes,No,No,No,No,25.0,18.0,1.0,4.0,No,No,48.5,147.0
2,53.0,000301-000-901-002,Yes,No,No,No,No,No,No,No,No,No,29.0,28.0,1.0,2.7,No,No,65.0,164.0
3,48.0,000301-000-901-003,Yes,No,No,No,No,No,No,No,No,No,19.0,8.0,0.0,2.0,Yes,Yes,52.0,157.0
4,66.0,000301-000-901-004,No,No,No,No,No,No,No,No,No,No,16.0,2.0,0.0,0.9,Yes,No,80.5,69.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1646,68.0,000301-000-999-995,No,No,No,No,No,No,No,No,No,No,19.0,2.0,0.0,1.5,Yes,Yes,58.0,146.0
1647,38.0,000301-000-999-996,Yes,No,No,No,No,No,No,No,No,No,27.0,1.0,0.0,3.0,Yes,No,78.0,158.0
1648,41.0,000301-000-999-997,No,No,No,No,No,No,No,No,No,No,18.0,3.0,1.0,3.5,No,No,62.0,165.0
1649,50.0,000301-000-999-998,No,No,No,No,No,No,No,No,No,No,16.0,8.0,0.0,0.5,Yes,Yes,62.0,167.0
