In [1]:
from sas7bdat import SAS7BDAT
import random
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OrdinalEncoder
# Seed both Python's `random` module and NumPy's legacy global generator with
# the same fixed value so draws from either one are repeatable between runs.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)

In [2]:
# Read the ADSL SAS file into the working DataFrame `df`. The following cells
# take AGE, ECOGSCR, HTCM, WTKG, DCSREAS, SEX and USUBJID from this table.
# NOTE(review): by its name this is presumably the subject-level dataset - confirm.
adsl_path = 'adsl.sas7bdat'
with SAS7BDAT(adsl_path, skip_header=False) as sas_reader:
    df = sas_reader.to_data_frame()

In [3]:
def _text_before_dash(age_value):
    """Return the part of an AGE string that precedes its first '-'."""
    return age_value.split('-')[0]


# AGE is handled as text: keep the piece before the first '-', map the
# open-ended '>=80' value to '80', then cast the result to int.
age_as_int = (
    df['AGE']
    .apply(_text_before_dash)
    .replace({'>=80': '80'})
    .astype(int)
)
df['age'] = age_as_int

# MinMaxScaler needs a 2-D input, hence the reshape to a single column.
# The scaled values (default 0-1 range) overwrite the integer column.
age_column = df['age'].values.reshape(-1, 1)
df['age'] = MinMaxScaler().fit_transform(age_column)

In [4]:
# Expose ECOGSCR under a readable column name; the values are copied unchanged.
df['ECOG performance score'] = df['ECOGSCR']

# Scale HTCM and WTKG in one fit; MinMaxScaler rescales each column on its own.
scaled_body = MinMaxScaler().fit_transform(df[['HTCM', 'WTKG']])
df['height'] = scaled_body[:, 0]
df['weight'] = scaled_body[:, 1]

# Binary label: 1 where DCSREAS is exactly 'DEATH', 0 for every other value.
df['target_label'] = (df['DCSREAS'] == 'DEATH').astype(int)

# Keep only the subject id, the engineered features and the label.
feature_columns = [
    'USUBJID',
    'age',
    'ECOG performance score',
    'SEX',
    'height',
    'weight',
    'target_label',
]
df_processed = df[feature_columns]

In [5]:
# Rebind `df` to the ADCM SAS table; the ADSL-derived features already live in
# `df_processed`. The next cells read CMDECOD and USUBJID from this table.
# NOTE(review): by its name this is presumably the concomitant-medication dataset - confirm.
adcm_path = 'adcm.sas7bdat'
with SAS7BDAT(adcm_path, skip_header=False) as sas_reader:
    df = sas_reader.to_data_frame()

In [6]:
# Candidate drugs: the 10 most frequent CMDECOD values with empty strings removed.
# The blank filter runs after the top-10 cut, so the list may hold fewer than 10 names.
cm_counts = df['CMDECOD'].value_counts()
drug_list = [name for name in cm_counts[:10].index.tolist() if len(name) > 0]

# One float 0/1 indicator column per drug, set to 1 on rows whose CMDECOD equals
# that drug. Every record counts: an earlier, now-disabled variant additionally
# required CMPRIOR == 'Yes' through a 'baseline_status' flag.
for drug_name in drug_list:
    df[drug_name] = (df['CMDECOD'] == drug_name).astype(float)

In [7]:
# Collapse to one row per USUBJID. Taking the max of each 0/1 indicator yields 1
# when the subject has at least one record for that drug.
records_by_subject = df[['USUBJID'] + drug_list].groupby('USUBJID')
df_drug = records_by_subject.max().reset_index()

In [8]:
# Attach the per-subject drug indicators to the feature table.
# A left join keeps every subject already in df_processed. The previous default
# (inner) join silently removed any subject with no row in df_drug, i.e. anyone
# without a medication record, instead of giving them all-zero drug flags.
df_processed = df_processed.merge(df_drug, on='USUBJID', how='left')

# Subjects absent from df_drug come back as NaN in the drug columns; for an
# indicator that means "not recorded", so fill those columns (and only those) with 0.
drug_flag_columns = [col for col in df_drug.columns if col != 'USUBJID']
df_processed = df_processed.fillna({col: 0 for col in drug_flag_columns})

In [9]:
# Rebind `df` to the ADMH SAS table. The next cell reads MHDECOD and USUBJID
# from it to build per-subject condition flags.
# NOTE(review): by its name this is presumably the medical-history dataset - confirm.
admh_path = 'admh.sas7bdat'
with SAS7BDAT(admh_path, skip_header=False) as sas_reader:
    df = sas_reader.to_data_frame()

In [10]:
# Candidate conditions: the 10 most frequent MHDECOD values with empty strings
# removed (the filter runs after the top-10 cut, so fewer than 10 may remain).
mh_counts = df['MHDECOD'].value_counts()
dz_list = [dz for dz in mh_counts[:10].index.tolist() if len(dz) > 0]

# One float 0/1 indicator column per condition, 1 on rows whose MHDECOD matches.
for dz in dz_list:
    df[dz] = (df['MHDECOD'] == dz).astype(float)

# One row per subject; the max of a 0/1 flag is 1 if any record matched.
df_dz = df[dz_list + ['USUBJID']].groupby('USUBJID').max().reset_index()

# Left join so only subjects already in df_processed are kept. The previous
# outer join followed by fillna(0) created rows for subjects present only in
# this table, with age/height/weight/target_label all filled in as 0.
# fillna(0) still gives subjects without any history row all-zero condition flags.
# NOTE(review): fillna(0) also zero-fills any NaN left in the earlier feature
# columns (e.g. height/weight); this pre-existing behaviour is kept as is.
df_processed = df_processed.merge(df_dz, on='USUBJID', how='left').fillna(0)

In [11]:
# Give the subject identifier its output name.
df_processed = df_processed.rename({'USUBJID': 'patient_id'}, axis='columns')

# Encode SEX as an integer: 'M' becomes 0 and every other value becomes 1.
# NOTE: this is not idempotent - running the cell a second time turns every
# value into 1, because the already-encoded 0/1 values are no longer 'M'.
df_processed['SEX'] = (df_processed['SEX'] != 'M').astype(int)

In [12]:
# Write the assembled feature table to disk. The DataFrame index is written as
# well (to_csv default), so the file begins with an unnamed index column.
output_csv = 'NCT03041311.csv'
df_processed.to_csv(output_csv)

In [14]:
# Show the column labels of the assembled table as this cell's output.
df_processed.columns

Index(['patient_id', 'age', 'ECOG performance score', 'SEX', 'height',
       'weight', 'target_label', 'DEXAMETHASONE', 'ONDANSETRON', 'PREDNISONE',
       'SODIUM CHLORIDE', 'FAMOTIDINE', 'DIPHENHYDRAMINE HYDROCHLORIDE',
       'PARACETAMOL', 'LORAZEPAM', 'ONDANSETRON HYDROCHLORIDE', 'Hypertension',
       'Chronic obstructive pulmonary disease', 'Cough', 'Depression',
       'Fatigue', 'Hypothyroidism', 'Dyspnoea', 'Anxiety', 'Nephrolithiasis',
       'Weight decreased'],
      dtype='object')

Index(['patient_id', 'age', 'ECOG performance score', 'SEX', 'height',
       'weight', 'target_label', 'DEXAMETHASONE', 'ONDANSETRON', 'PREDNISONE',
       'SODIUM CHLORIDE', 'FAMOTIDINE', 'DIPHENHYDRAMINE HYDROCHLORIDE',
       'PARACETAMOL', 'LORAZEPAM', 'ONDANSETRON HYDROCHLORIDE', 'Hypertension',
       'Chronic obstructive pulmonary disease', 'Cough', 'Depression',
       'Fatigue', 'Hypothyroidism', 'Dyspnoea', 'Anxiety', 'Nephrolithiasis',
       'Weight decreased'],
      dtype='object')

In [15]:
# Names of the continuous (MinMax-scaled) feature columns, written one per line
# to a text file that sits next to the exported CSV.
numerical_features = ['height', 'weight', 'age']

# The context manager closes the file even if a write fails; the previous
# manual open()/close() pair would leak the handle in that case.
with open('NCT03041311_numerical_feature.txt', 'w') as feature_file:
    feature_file.writelines(name + '\n' for name in numerical_features)