In [1]:
from sas7bdat import SAS7BDAT
import random
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

random.seed(1234)
np.random.seed(1234)

In [2]:
with SAS7BDAT('c9732_demographic.sas7bdat', skip_header=False) as reader:
    df = reader.to_data_frame()

In [3]:
# df = df.assign(age_group=pd.cut(df['AGE'], bins=[0, 30, 40, 50, 60, 70, 999], labels=['1','2','3','4','5','6']))
df['age'] = MinMaxScaler().fit_transform(df['AGE'].values.reshape(-1,1))
df = df.assign(chemo_cycle_group=pd.cut(df['CHEMO_CYCLE'], bins=[0, 5, 10, 999], labels=['1','2','3',]))
df['RACE'].fillna(99, inplace=True)
df['STATUS'].replace({3:1}, inplace=True)
df['STATUS'].replace({1:0, 2:1}, inplace=True)

In [4]:
columns = [
    'PHATOM_ID',
    'GENDER',
    'age',
    'RACE',
    'PS',
    'NUM_META',
    'chemo_cycle_group',
    'TRT_ARM',
    'STATUS',
]
rename_dict = {
    'PHATOM_ID': 'patient_id',
    'GENDER': 'gender',
    'age': 'age',
    'RACE': 'race',
    'PS': 'ECOG performance tatus',
    'NUM_META': 'num_metastatic',
    'chemo_cycle_group': 'chemotherapy cycle group',
    'TRT_ARM':'treatment arm',
    'STATUS': 'target_label',
}
df_processed = df[columns].rename(columns=rename_dict)

In [5]:
# check nan
df_processed[df_processed.isnull().any(axis=1)]

Unnamed: 0,patient_id,gender,age,race,ECOG performance tatus,num_metastatic,chemotherapy cycle group,treatment arm,target_label
19,19815.0,2.0,0.655172,5.0,,,,2.0,0.0
53,24875.0,2.0,0.706897,5.0,0.0,2.0,,1.0,0.0
56,25768.0,1.0,0.62069,5.0,1.0,4.0,,1.0,0.0
66,26868.0,2.0,0.672414,5.0,1.0,1.0,,1.0,0.0
105,32841.0,1.0,0.551724,5.0,1.0,2.0,,2.0,0.0
119,35701.0,1.0,0.793103,5.0,,,,1.0,1.0
141,38746.0,2.0,0.896552,5.0,,2.0,2.0,1.0,1.0
160,39880.0,2.0,0.62069,5.0,,,,1.0,1.0
166,40779.0,1.0,0.810345,5.0,1.0,3.0,,2.0,1.0
184,43730.0,2.0,0.655172,3.0,1.0,3.0,,2.0,1.0


In [6]:
df_processed.fillna(method='bfill', inplace=True)
df_processed['race'] = df_processed['race'].apply(lambda x: 1 if x == 5 else 0)
df_processed['gender'] = df_processed['gender'] - 1

df_processed.to_csv('NCT00003299.csv')

In [7]:
df_processed

Unnamed: 0,patient_id,gender,age,race,ECOG performance tatus,num_metastatic,chemotherapy cycle group,treatment arm,target_label
0,12727.0,1.0,0.551724,1,0.0,2.0,1,1.0,1.0
1,13700.0,1.0,0.655172,1,0.0,2.0,2,2.0,1.0
2,14768.0,0.0,0.655172,1,1.0,1.0,2,1.0,1.0
3,14795.0,0.0,0.293103,1,1.0,1.0,2,1.0,1.0
4,16718.0,1.0,0.603448,1,1.0,5.0,2,2.0,1.0
...,...,...,...,...,...,...,...,...,...
582,99637.0,1.0,0.741379,1,0.0,3.0,2,2.0,1.0
583,99683.0,1.0,0.206897,1,1.0,1.0,2,1.0,1.0
584,99701.0,1.0,0.879310,1,0.0,1.0,2,2.0,1.0
585,99711.0,0.0,0.741379,0,1.0,0.0,2,2.0,1.0


In [8]:
df_processed.columns

Index(['patient_id', 'gender', 'age', 'race', 'ECOG performance tatus',
       'num_metastatic', 'chemotherapy cycle group', 'treatment arm',
       'target_label'],
      dtype='object')

In [9]:
numerical_features = ['age']
f = open('NCT00003299_numerical_feature.txt','w')
for x in numerical_features: f.write(x+'\n')
f.close()

# Apply ML models for prediction

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
target = df_processed["target_label"]
train_dataset, test_dataset, y_train, y_test = train_test_split(df_processed,
                                                        target,
                                                        test_size=0.2,
                                                        random_state=0,
                                                        stratify=target)
x_train = train_dataset.drop(['target_label'], axis=1)
x_test = test_dataset.drop(['target_label'], axis=1)
clf = RandomForestClassifier(n_estimators=200, max_depth=6, class_weight='balanced', random_state=0)
clf.fit(x_train, y_train)

RandomForestClassifier(class_weight='balanced', max_depth=6, n_estimators=200,
                       random_state=0)

In [11]:
ypred_prob = clf.predict_proba(x_test)
auc = roc_auc_score(y_test, ypred_prob[:,1])
print('test auc is', auc)

test auc is 0.6782496782496783
