# Data processing

In [5]:
!pip install sas7bdat
from sas7bdat import SAS7BDAT
import random
import numpy as np
random.seed(1234)
np.random.seed(1234)
with SAS7BDAT('all_finalb.sas7bdat', skip_header=False) as reader:
    df = reader.to_data_frame()
df['survstat'].value_counts()

You should consider upgrading via the 'D:\python\python.exe -m pip install --upgrade pip' command.
[all_finalb.sas7bdat] header length 65536 != 8192
[all_finalb.sas7bdat] header length 65536 != 8192
[all_finalb.sas7bdat] header length 65536 != 8192




0.0    3605
1.0     266
Name: survstat, dtype: int64

In [6]:
columns = [
    'RACE_ID',
    'stra1',
    'stra3',
    'indrx',
    'OH002',
    'OH003',
    'OH004',
    'OH005',
    'OH011',
    'OH016',
    'OH027',
    'OH036',
    'OH037',
    'num_pos_nodes',
    'tsize',
    'survstat', # 0: alive, 1: dead
]

In [7]:
df

Unnamed: 0,mask_id,SSTAT,ETHNIC_ID,scase,GROUP_ID,RACE_ID,indrx,stra1,stra2,stra3,...,cod,agecat,preamend,survmos,event,dfsstat,dfsmos,agent,length,num_pos_nodes
0,1.0,7.0,1.0,10.0,37.0,1.0,4.0,1.0,1.0,2.0,...,0.0,3.0,0.0,60.944559,,0.0,60.944559,1.0,1.0,0
1,2.0,7.0,2.0,10.0,37.0,1.0,1.0,2.0,1.0,2.0,...,0.0,4.0,0.0,30.422998,,0.0,30.422998,0.0,0.0,0
2,3.0,7.0,9.0,10.0,1.0,3.0,1.0,1.0,2.0,3.0,...,0.0,3.0,0.0,79.310062,,0.0,79.310062,0.0,0.0,0
3,4.0,7.0,2.0,10.0,37.0,1.0,1.0,1.0,1.0,3.0,...,0.0,4.0,0.0,82.529774,,0.0,81.642710,0.0,0.0,0
4,5.0,7.0,2.0,10.0,1.0,1.0,3.0,2.0,1.0,2.0,...,0.0,4.0,0.0,31.605749,,0.0,25.363450,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3866,3867.0,7.0,2.0,10.0,37.0,1.0,1.0,1.0,2.0,3.0,...,0.0,2.0,0.0,95.901437,,0.0,91.466119,0.0,0.0,0
3867,3868.0,7.0,2.0,10.0,1.0,1.0,4.0,2.0,2.0,3.0,...,0.0,3.0,1.0,107.071869,2.0,1.0,60.484600,1.0,1.0,0
3868,3869.0,7.0,2.0,10.0,37.0,1.0,2.0,2.0,2.0,2.0,...,0.0,5.0,0.0,74.743326,,0.0,74.086242,0.0,1.0,0
3869,3870.0,7.0,2.0,10.0,1.0,1.0,3.0,2.0,2.0,3.0,...,0.0,3.0,1.0,110.882957,,0.0,110.882957,1.0,0.0,0


In [8]:
rename_dict = {
    'RACE_ID': 'race',
    'stra1': 'post-menopause',
    'stra3': 'human epidermal growth factor receptor 2 is positive',
    'indrx': 'treatment',
    'OH002': 'tumor laterality',
    'OH003': 'estrogen receptor positive',
    'OH004': 'progesterone receptor positive',
    'OH005': 'cancer histologic grade',
    'OH011': 'prior hormonal therapy',
    'OH016': 'prior chemotherapy',
    'OH027': 'biopsy type',
    'OH036': 'sentinel node biospy',
    'OH037': 'axillary dissection',
    'num_pos_nodes': 'number of positive axillary nodes',
    'tsize': 'tumor size',
    'survstat': 'target_label',
}


In [9]:
df_processed = df[columns].rename(columns=rename_dict)
binary_feat_list = []
num_feat_list = ['number of positive axillary nodes', 'tumor size']

In [10]:
df_processed['race'] = df_processed['race'].apply(lambda x: {1:'White',3:'Black or Asia',4:'Black or Asia',5:'Black or Asia',6:'Black or Asia',99:'Black or Asia'}[x])

In [11]:
df_processed['post-menopause'] = df_processed['post-menopause'].apply(lambda x: {2: 'Yes', 1: 'No',}[x])
binary_feat_list.append('post-menopause')

In [12]:
df_processed['human epidermal growth factor receptor 2 is positive'] = df_processed['human epidermal growth factor receptor 2 is positive'].apply(lambda x: 'Yes' if x == 1 else 'No')
binary_feat_list.append('human epidermal growth factor receptor 2 is positive')

In [13]:
df_processed['treatment'] = df_processed['treatment'].apply(lambda x: {1:'Cyclophosphamide and Doxorubicin', 2: 'Cyclophosphamide and Doxorubicin', 
                                          3: 'paclitaxel' , 4: 'paclitaxel',}[x])

In [14]:
df_processed['tumor laterality'] = df_processed['tumor laterality'].fillna(df_processed['tumor laterality'].median()).apply(lambda x: {1:'left', 2:'right', 3:'bilateral'}[x])

In [15]:
df_processed['estrogen receptor positive'].replace({1:'No', 2:'Yes'}, inplace=True)
df_processed['progesterone receptor positive'].replace({1:'No', 2:'Yes'}, inplace=True)
binary_feat_list.append('estrogen receptor positive')
binary_feat_list.append('progesterone receptor positive')

In [16]:
df_processed['cancer histologic grade'].replace({1:'Low', 2: 'Intermediate', 3: 'High'}, inplace=True)
df_processed['sentinel node biospy'].replace({1:'No', 2:'Yes'},inplace=True)
binary_feat_list.append('sentinel node biospy')
df_processed['axillary dissection'].replace({1:'No', 2:'Yes'},inplace=True)
binary_feat_list.append('axillary dissection')

In [17]:
df_processed['tumor size'].replace({1:2, 2:3.5, 3:5},inplace=True)

In [18]:
df_processed['prior hormonal therapy'].replace({1:'No',2:'Yes'},inplace=True)
df_processed['prior chemotherapy'].replace({1:'No',2:'Yes'},inplace=True)
binary_feat_list.append('prior hormonal therapy')
binary_feat_list.append('prior chemotherapy')

In [19]:
df_processed['biopsy type'].replace({1:'core needle', 2:'incisional', 3:'excisional'}, inplace=True)

In [20]:
df_processed['number of positive axillary nodes'].replace({'>3':'4'}, inplace=True)

In [21]:
df_processed.fillna(method='ffill', inplace=True)
df_processed.to_csv('NCT00041119.csv')

In [22]:
with open('binary_feature.txt', 'w') as f:
    for x in binary_feat_list: f.write(x+'\n')
with open('numerical_feature.txt', 'w') as f:
    for x in num_feat_list: f.write(x+'\n')

# Apply ML models for prediction

In [23]:
df_processed

Unnamed: 0,race,post-menopause,human epidermal growth factor receptor 2 is positive,treatment,tumor laterality,estrogen receptor positive,progesterone receptor positive,cancer histologic grade,prior hormonal therapy,prior chemotherapy,biopsy type,sentinel node biospy,axillary dissection,number of positive axillary nodes,tumor size,target_label
0,White,No,No,paclitaxel,left,Yes,Yes,Intermediate,No,No,core needle,No,No,0,2.0,0.0
1,White,Yes,No,Cyclophosphamide and Doxorubicin,left,Yes,Yes,Intermediate,No,No,excisional,No,Yes,0,2.0,0.0
2,Black or Asia,No,No,Cyclophosphamide and Doxorubicin,left,No,No,High,No,No,core needle,No,Yes,0,3.5,0.0
3,White,No,No,Cyclophosphamide and Doxorubicin,right,Yes,Yes,High,No,No,core needle,No,No,0,3.5,0.0
4,White,Yes,No,paclitaxel,left,Yes,Yes,Intermediate,No,No,core needle,Yes,Yes,1,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3866,White,No,No,Cyclophosphamide and Doxorubicin,right,No,No,High,No,No,core needle,No,No,0,3.5,0.0
3867,White,Yes,No,paclitaxel,right,No,No,High,No,No,core needle,No,Yes,0,3.5,0.0
3868,White,Yes,No,Cyclophosphamide and Doxorubicin,left,No,No,Low,No,No,core needle,No,No,0,2.0,0.0
3869,White,Yes,No,paclitaxel,left,No,No,High,No,No,core needle,No,Yes,0,2.0,0.0


In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost

target = df_processed["target_label"]
train_dataset, test_dataset, y_train, y_test = train_test_split(df_processed,
                                                        target,
                                                        test_size=0.2,
                                                        random_state=0,
                                                        stratify=target)
x_train = train_dataset.drop(['target_label'], axis=1)
x_test = test_dataset.drop(['target_label'], axis=1)

# clf = RandomForestClassifier(n_estimators=100, max_depth=6, class_weight='balanced', random_state=0)
# clf = LogisticRegression()
clf = xgboost.XGBClassifier(n_estimators=100, max_depth=8, objective='binary:logistic', use_label_encoder=False)

clf.fit(x_train, y_train)

ValueError: DataFrame.dtypes for data must be int, float, bool or category.  When
categorical type is supplied, DMatrix parameter `enable_categorical` must
be set to `True`. Invalid columns:race, post-menopause, human epidermal growth factor receptor 2 is positive, treatment, tumor laterality, estrogen receptor positive, progesterone receptor positive, cancer histologic grade, prior hormonal therapy, prior chemotherapy, biopsy type, sentinel node biospy, axillary dissection, number of positive axillary nodes

In [None]:
ypred_prob = clf.predict_proba(x_test)
auc = roc_auc_score(y_test, ypred_prob[:,1])
print('test auc is', auc)