In [1]:
import pickle
import pandas as pd
import numpy as np
import csv
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, KFold, GridSearchCV
from sklearn.feature_selection import VarianceThreshold
from sklearn import preprocessing
pd.set_option('display.max_columns', 5000)

## Random forests for diagnose/procedure coding system evaluation

In [2]:
eva = 'roc_auc'

In [3]:
param_grid = {"n_estimators": [500],
                    "max_features": ["sqrt", "log2"]}

In [4]:
gs_rf = GridSearchCV(RandomForestClassifier(), param_grid, cv=3,
                       scoring=eva)

In [5]:
rf_clf = RandomForestClassifier(n_estimators = 10)

#### Decide the best number of variables

In [6]:
n_variables = [10, 20, 40, 80, 160, 320, 640, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000]

In [7]:
def n_variable_selection(var_name, n_variables):
    best_score = 0
    best_n = 0
    path = '../../../data/hcup/nis/all_year_combination/' + var_name.lower() + '.pickle'
    data = pd.read_pickle(path)
    X = data.filter(regex = var_name).values
    y = data.DIED.values
    names = data.filter(regex = var_name).columns.tolist()
    rf_clf.fit(X,y)
    d = {'names': names, 'importance': rf_clf.feature_importances_}
    df = pd.DataFrame(data = d).sort_values(by = ['importance'], ascending = False)
    for n in n_variables:
        if n <= len(names):
            print(n)
            selected_names = df.iloc[0:n].names.tolist()
            x = data[selected_names].values
            new_score = np.mean(cross_validate(rf_clf, x, y, cv = 3, return_train_score=False, scoring = eva)['test_score'])
            print('score: %.6f, best score: %.6f' %(new_score, best_score))
            if new_score > best_score:
                best_n = n
                best_score = new_score
                with open('../tests/feature_selection/' + var_name.lower() + '.pkl', 'wb') as f:
                    pickle.dump(selected_names, f)
    return best_score, best_n

In [8]:
n_variable_selection('DX', n_variables)

10
score: 0.851278, best score: 0.000000
20
score: 0.770032, best score: 0.851278
40
score: 0.796432, best score: 0.851278
80
score: 0.823613, best score: 0.851278
160
score: 0.832228, best score: 0.851278
320
score: 0.835947, best score: 0.851278
640
score: 0.834768, best score: 0.851278
1000
score: 0.837324, best score: 0.851278
1500
score: 0.830878, best score: 0.851278
2000
score: 0.830664, best score: 0.851278
2500
score: 0.830730, best score: 0.851278
3000
score: 0.832669, best score: 0.851278
3500
score: 0.831255, best score: 0.851278
4000
score: 0.833486, best score: 0.851278
4500
score: 0.834037, best score: 0.851278
5000
score: 0.832470, best score: 0.851278
5500
score: 0.832773, best score: 0.851278
6000
score: 0.836116, best score: 0.851278


(0.8512779988655526, 10)

In [18]:
n_variable_selection('DXCCS', n_variables)

10
score: 0.873568, best score: 0.000000
20
score: 0.784707, best score: 0.873568
40
score: 0.829978, best score: 0.873568
80
score: 0.836891, best score: 0.873568
160
score: 0.831999, best score: 0.873568


(0.873567513771076, 10)

In [19]:
n_variable_selection('APRDRG', n_variables)

10
score: 0.762840, best score: 0.000000
20
score: 0.784034, best score: 0.762840
40
score: 0.807873, best score: 0.784034
80
score: 0.852703, best score: 0.807873
160
score: 0.852563, best score: 0.852703
320
score: 0.852009, best score: 0.852703
640
score: 0.818913, best score: 0.852703
1000
score: 0.818894, best score: 0.852703
1500
score: 0.818825, best score: 0.852703


(0.852703272752111, 80)

In [20]:
n_variable_selection('DRG', n_variables)

10
score: 0.729857, best score: 0.000000
20
score: 0.754265, best score: 0.729857
40
score: 0.781972, best score: 0.754265
80
score: 0.791635, best score: 0.781972
160
score: 0.793962, best score: 0.791635
320
score: 0.761699, best score: 0.793962
640
score: 0.761213, best score: 0.793962


(0.7939620655138034, 160)

In [21]:
n_variable_selection('MDC', n_variables)

10
score: 0.693621, best score: 0.000000
20
score: 0.692855, best score: 0.693621


(0.6936205137806063, 10)

In [22]:
n_variable_selection('PR', n_variables)

10
score: 0.768206, best score: 0.000000
20
score: 0.763855, best score: 0.768206
40
score: 0.766910, best score: 0.768206
80
score: 0.761691, best score: 0.768206
160
score: 0.763485, best score: 0.768206
320
score: 0.765924, best score: 0.768206
640
score: 0.766651, best score: 0.768206
1000
score: 0.764226, best score: 0.768206
1500
score: 0.762697, best score: 0.768206


(0.7682055760681994, 10)

In [23]:
n_variable_selection('PRCCS', n_variables)

10
score: 0.765511, best score: 0.000000
20
score: 0.768102, best score: 0.765511
40
score: 0.757898, best score: 0.768102
80
score: 0.757135, best score: 0.768102
160
score: 0.759164, best score: 0.768102


(0.7681018135522689, 20)

In [24]:
def para_tune_dxpx(var_name):
    path = '../../../data/hcup/nis/all_year_combination/' + var_name.lower() + '.pickle'
    data = pd.read_pickle(path)
    with open('../tests/feature_selection/' + var_name.lower() + '.pkl', 'rb') as f:
        names = pickle.load(f)
    X = data[names]
    y = data.DIED.values
    print(X.shape)
    print('parameter tuning...')
    gs_rf.fit(X,y)
    print(gs_rf.best_params_)

In [25]:
para_tune_dxpx('DX')

(138932, 10)
parameter tuning...
{'max_features': 'log2', 'n_estimators': 500}


In [26]:
para_tune_dxpx('DXCCS')

(138932, 10)
parameter tuning...
{'max_features': 'sqrt', 'n_estimators': 500}


In [27]:
para_tune_dxpx('DRG')

(138932, 160)
parameter tuning...
{'max_features': 'sqrt', 'n_estimators': 500}


In [28]:
para_tune_dxpx('APRDRG')

(138932, 80)
parameter tuning...
{'max_features': 'sqrt', 'n_estimators': 500}


In [29]:
para_tune_dxpx('MDC')

(138932, 10)
parameter tuning...
{'max_features': 'log2', 'n_estimators': 500}


In [30]:
para_tune_dxpx('PR')

(138932, 10)
parameter tuning...
{'max_features': 'sqrt', 'n_estimators': 500}


In [31]:
para_tune_dxpx('PRCCS')

(138932, 20)
parameter tuning...
{'max_features': 'log2', 'n_estimators': 500}


In [32]:
para_tune_dxpx('PCLASS')

(138932, 4)
parameter tuning...
{'max_features': 'sqrt', 'n_estimators': 500}
