# **Humana Competition**

## Feature Importance Ranking


Example: random forest feature importance
https://mljar.com/blog/feature-importance-in-random-forest/


Example: XGBoost feature importance
https://machinelearningmastery.com/feature-importance-and-feature-selection-with-xgboost-in-python/


In [50]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from matplotlib import pyplot as plt

### Split Data into Training and Test Sets

In [51]:
# Features: every column except the row identifier ("id") and the label ("hi_flag").
X = cleaned_training.drop(["id", "hi_flag"], axis=1)
# Target: the hi_flag column.
y = cleaned_training.loc[:, "hi_flag"]

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=12,
)

In [52]:
# Print a summary of the training features (index range, column count,
# dtypes and memory usage) as a sanity check on the split.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36225 entries, 8854 to 14155
Columns: 681 entries, rx_hum_19_pmpm_cost to cms_race_cd_6
dtypes: float64(612), int64(37), uint8(32)
memory usage: 180.7 MB


### Note: Dealing with Class Imbalance

In [53]:
# Class balance of the training labels as [count of 0, count of non-0].
# Counted with NumPy instead of the builtin sum(), which would iterate the
# boolean array one element at a time in Python.
[np.count_nonzero(y_train.values == 0), np.count_nonzero(y_train.values != 0)]

[34643, 1582]

In [54]:
# Class balance of the held-out labels as [count of 0, count of non-0],
# counted with NumPy rather than the element-by-element builtin sum().
# The proportions of 1s and 0s are similar in the test and train sets.
[np.count_nonzero(y_test.values == 0), np.count_nonzero(y_test.values != 0)]

[11539, 536]

### Random Forest

In [55]:
from imblearn.ensemble import BalancedRandomForestClassifier
# Imported here so this cell does not rely on an import made in another cell.
from sklearn.metrics import roc_auc_score

# Baseline model trained on every feature column.
# NOTE(review): class_weight assigns the larger weight (34643) to class 0,
# which is the more frequent class in y_train according to the counts shown
# in this notebook. If the intent is to compensate for the imbalance, the
# weights look inverted (and the balanced forest may not need them at all).
# Left unchanged so the recorded AUC stays reproducible; confirm before reuse.
BRFC = BalancedRandomForestClassifier(
    n_estimators=150,
    random_state=2,
    class_weight={0: 34643, 1: 1582},
)
BRFC.fit(X_train, y_train)

# ROC AUC needs a continuous score, so take the probability column at index 1
# (presumably the hi_flag == 1 class; verify against BRFC.classes_).
y_pred = BRFC.predict_proba(X_test)[:, 1]

roc_auc_score(y_test, y_pred)

0.6842965549667384

In [None]:
# Which features carry little signal and could be dropped to limit overfitting?
importances = BRFC.feature_importances_

# Columns to which the fitted forest assigned an importance of exactly zero.
less_importance = X_train.columns[importances == 0]

# Column positions ordered from most to least important.
sorted_indices = np.flip(np.argsort(importances))

# To list the feature names in that order:
# print(*X_train.columns[sorted_indices], sep = "\n")

In [None]:
# Number of top-ranked features to keep. Defined once so that the slice and
# the printed label cannot drift apart.
top_n = 500

# Map each column position to its importance score from the fitted forest.
important_features_dict = dict(enumerate(BRFC.feature_importances_))

# Column positions ordered from highest to lowest importance. sorted() is
# stable, so positions with equal importance keep their original column order.
important_features_list = sorted(
    important_features_dict,
    key=important_features_dict.get,
    reverse=True,
)

# Positional indices (not column names) of the selected features.
important1 = important_features_list[:top_n]
print(f'{top_n} most important features: {important1}')

500 most important features: [330, 97, 271, 164, 433, 485, 183, 586, 644, 357, 289, 286, 439, 519, 309, 620, 431, 647, 316, 536, 560, 297, 12, 386, 561, 472, 170, 648, 20, 356, 402, 199, 614, 9, 75, 491, 145, 306, 451, 618, 368, 287, 265, 305, 310, 121, 315, 216, 200, 630, 273, 354, 360, 326, 161, 281, 13, 551, 118, 86, 14, 621, 82, 596, 505, 516, 162, 191, 106, 21, 34, 260, 404, 220, 209, 292, 352, 339, 219, 450, 269, 425, 126, 249, 434, 50, 120, 405, 304, 221, 129, 500, 604, 313, 568, 550, 320, 240, 638, 85, 19, 369, 119, 373, 457, 194, 51, 45, 130, 480, 285, 482, 166, 426, 184, 460, 376, 455, 168, 197, 635, 160, 595, 71, 28, 136, 531, 26, 329, 43, 603, 153, 81, 63, 530, 251, 308, 3, 573, 8, 481, 294, 580, 83, 232, 489, 585, 637, 323, 242, 396, 139, 606, 365, 375, 643, 528, 151, 575, 245, 157, 223, 332, 142, 22, 23, 272, 348, 203, 350, 198, 201, 27, 148, 80, 334, 88, 300, 241, 338, 529, 390, 333, 178, 179, 616, 372, 533, 437, 30, 512, 243, 165, 341, 353, 527, 282, 206, 303, 583, 393,

In [None]:
# Retrain using only the top-ranked features. important1 holds positional
# column indices, hence .iloc. (A stray `len(X_train_new)` expression whose
# value was neither stored nor displayed has been removed.)
X_train_new = X_train.iloc[:, important1]
X_test_new = X_test.iloc[:, important1]

# NOTE(review): this model uses 1000 trees and seed 42, whereas the baseline
# BRFC uses 150 trees and seed 2, so any AUC difference cannot be attributed
# to feature selection alone.
# NOTE(review): class_weight gives the larger weight to class 0, the more
# frequent class in y_train per the counts shown in this notebook; confirm
# that this direction is intended.
BRFC2 = BalancedRandomForestClassifier(
    n_estimators=1000,
    random_state=42,
    class_weight={0: 34643, 1: 1582},
)
BRFC2.fit(X_train_new, y_train)

# Score with the probability column at index 1 (presumably hi_flag == 1).
y_pred2 = BRFC2.predict_proba(X_test_new)[:, 1]
roc_auc_score(y_test, y_pred2)

0.6855582560376038