In [1]:
import numpy as np
import pandas as pd
# Read the prepared patient table from the working directory.
data = pd.read_csv(filepath_or_buffer='df_final6.csv')

In [2]:
# Show the column labels; later cells select column groups by position.
data.columns

Index(['year_of_birth', 'gender_concept_id', 'ethnicity_concept_id',
       'race_concept_id', 'death', 'basic_metabolic_panel',
       'compreh_metabolic_panel', 'lipid_panel', 'hepatic_func_panel',
       'Urinalysis_with_microscopy', 'Urinalysis_without_microscopy',
       'Creatine kinase (CK), (CPK); total', 'Creatinine; blood',
       'Hemoglobin; glycosylated (A1C)', 'Iron',
       'Prostate specific antigen (PSA); total',
       'Thyroid stimulating hormone (TSH)',
       'Blood count; complete (CBC), automated (Hgb, Hct, RBC, WBC and platelet count) and automated differential WBC count',
       'Blood count; complete (CBC), automated (Hgb, Hct, RBC, WBC and platelet count)',
       'Prothrombin time',
       'Culture, bacterial; quantitative colony count, urine', '2313814.0',
       'Electrocardiogram, routine ECG with at least 12 leads; tracing only, without interpretation and report',
       '2313816.0', '40482801.0', 'drug_num', 'Walking disability',
       'Diabetic on ins

In [2]:
# Cast column groups (selected by position) to the dtypes used for modelling.

# Positions 1-4 become categoricals. Per the column listing displayed earlier in the
# notebook these are the gender/ethnicity/race concept ids plus the `death` label.
gerd = data.columns[1:5]
data[gerd] = data[gerd].astype('category')

# Positions 5-24: the string 'none' marks a missing value. Blank those cells to NaN so the
# group can be cast to float. DataFrame.mask replaces the deprecated DataFrame.applymap
# and performs the same element-wise `x == 'none'` substitution.
measurements = data.columns[5:25]
data[measurements] = data[measurements].mask(data[measurements] == 'none')
data[measurements] = data[measurements].astype('float')

# Positions 26-33 become categoricals.
categories = data.columns[26:34]
data[categories] = data[categories].astype('category')

# Position 35 (a single column) becomes categorical.
condition_type = data.columns[35]
data[condition_type] = data[condition_type].astype('category')

In [3]:
# Separate the label from the predictors.
y = data['death']
x = data.drop(columns=['death'])

# One-hot encode the predictors, then fill remaining NaNs with each column's median.
# NOTE(review): the medians are taken over every row, i.e. before the train/test split
# performed in a later cell, so held-out rows influence the imputed values.
X = pd.get_dummies(x)
X = X.fillna(value=X.median())

In [5]:
# Build a single hand-written patient row and push it through the same encoding steps as
# the full predictor frame `x`, so that it can be scored by a fitted model.
templist = (
    [1923, 8507, 38003564]
    + [8552]
    + [np.nan] * 20  # twenty NaN placeholders; they are median-filled below
    + [25, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1, 38000230, 1]
)
input_variables = pd.DataFrame([templist], columns=x.columns)

# Append the new row beneath the existing rows so the dummy encoding sees every category.
tempdataframe = pd.concat([x, input_variables])

# Re-apply the categorical casts by position (positions 1-3, 25-32 and 34 of `x`).
for column_group in (
    tempdataframe.columns[1:4],
    tempdataframe.columns[25:33],
    tempdataframe.columns[34:35],
):
    tempdataframe[column_group] = tempdataframe[column_group].astype('category')

# One-hot encode, median-fill the gaps, and keep only the appended row.
finaldataframe = pd.get_dummies(tempdataframe)
finaldataframe = finaldataframe.fillna(value=finaldataframe.median())
test = finaldataframe.tail(1)

In [4]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

In [5]:
# Hold out 10% of the rows for evaluation.
# random_state pins the shuffle so the split (and every metric computed from it) is
# repeatable across kernel restarts; stratify=y keeps the class ratio of `death` the same
# in the training and held-out partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=27, stratify=y
)

In [90]:
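# Per-split median imputation is disabled: missing values are already filled with
# column medians on the full feature matrix X before the train/test split.
#X_train = X_train.fillna(X_train.median())
#X_test = X_test.fillna(X_test.median())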

In [83]:
# SMOTE: resample the training split only; the held-out rows are left untouched.
# `fit_resample` is the current imbalanced-learn method name. The older `fit_sample`
# alias was deprecated and has since been removed, so calling it raises AttributeError
# on recent releases.
sm = SMOTE()
X_train, y_train = sm.fit_resample(X_train, y_train)

In [8]:
#oversampling
from sklearn.utils import resample

# NOTE(review): this cell overwrites X_train / y_train, as do the SMOTE and under-sampling
# cells; run only the resampling strategy you intend to use.

# Rejoin features and label so rows can be filtered by class.
training = pd.concat([X_train, y_train], axis=1)

# Separate the two classes (death == 1 is treated as the minority here).
not_dead = training[training.death == 0]
dead = training[training.death == 1]

# Upsample the minority class.
dead_upsampled = resample(dead,
                          replace=True,             # sample with replacement
                          n_samples=len(not_dead),  # match number in majority class
                          random_state=27)          # fixed seed -> reproducible results

# Combine majority and upsampled minority, then split back into features and label.
upsampled = pd.concat([not_dead, dead_upsampled])
y_train = upsampled.death
X_train = upsampled.drop('death', axis=1)

# Check the new class counts; kept as the last expression so the notebook displays it.
upsampled.death.value_counts()

In [6]:
#undersampling
from sklearn.utils import resample

# NOTE(review): this cell overwrites X_train / y_train, as do the SMOTE and over-sampling
# cells; run only the resampling strategy you intend to use.

# Rejoin features and label so rows can be filtered by class.
training = pd.concat([X_train, y_train], axis=1)

# Separate the two classes (death == 0 is treated as the majority here).
not_dead = training[training.death == 0]
dead = training[training.death == 1]

# Downsample the majority class.
not_dead_downsampled = resample(not_dead,
                                replace=False,        # sample without replacement
                                n_samples=len(dead),  # match minority n
                                random_state=27)      # reproducible results

# Combine minority and downsampled majority, then split back into features and label.
downsampled = pd.concat([not_dead_downsampled, dead])
y_train = downsampled.death
X_train = downsampled.drop('death', axis=1)

# Check the new class counts; kept as the last expression so the notebook displays it.
downsampled.death.value_counts()

In [7]:
# Fit an XGBoost classifier on the (resampled) training rows, then label the held-out rows.
classifier = xgb.sklearn.XGBClassifier(
    seed=1,
    nthread=-1,
)
classifier.fit(X=X_train, y=y_train)
xgb_pre = classifier.predict(X_test)

In [105]:
import pickle
# Persist the fitted XGBoost model to disk as a pickle.
with open('patient_xgboost.pkl', 'wb') as file:
    file.write(pickle.dumps(classifier))

In [10]:
# Score the single hand-built row: first (only) row, second probability column.
classifier.predict_proba(test)[0, 1]

0.6086226

In [8]:
# Random forest with 100 trees; class 1 errors are weighted ten times class 0 errors.
rfc = RandomForestClassifier(
    n_estimators=100,
    class_weight={0: 1, 1: 10},
)
rfc.fit(X_train, y_train)

# Label the held-out rows with the fitted forest.
rfc_pred = rfc.predict(X_test)

In [10]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [14]:
# Fraction of held-out rows whose XGBoost label matches the true label.
accuracy_score(y_true=y_test, y_pred=xgb_pre)

0.5171041174275407

In [58]:
# Tabulate the random forest's per-feature importances, largest first,
# labelled with the training column names.
feature_importances = pd.DataFrame(
    {'importance': rfc.feature_importances_},
    index=X_train.columns,
).sort_values(by='importance', ascending=False)

In [59]:
# Display the importance table (already sorted with the largest importance first).
feature_importances

Unnamed: 0,importance
year_of_birth,0.059270
"Blood count; complete (CBC), automated (Hgb, Hct, RBC, WBC and platelet count) and automated differential WBC count",0.053605
compreh_metabolic_panel,0.047505
lipid_panel,0.044870
Thyroid stimulating hormone (TSH),0.043788
2313816.0,0.043114
Prothrombin time,0.043017
basic_metabolic_panel,0.039913
Hemoglobin; glycosylated (A1C),0.037044
40482801.0,0.036278
