# Load the 3 Database dataset 
# Create new dataframe with only the top 20 salient features found previouslous and try to find the top important features

In [21]:
%run __init__.py
%matplotlib inline

In [2]:
db_1 = pd.read_pickle('./Datasets/database_1.p')
db_2 = pd.read_pickle('./Datasets/database_2.p')
db_3 = pd.read_pickle('./Datasets/database_3.p')
# top 20 features found 
db_top_20 =  ['feat_257', 'feat_269', 'feat_308', 'feat_315', 'feat_336', 'feat_341', 
                   'feat_395', 'feat_504', 'feat_526', 'feat_639', 'feat_681', 'feat_701', 
                   'feat_724', 'feat_736', 'feat_769', 'feat_808', 'feat_829', 'feat_867',
                   'feat_920', 'feat_956']

#create X and y dataframes from samplesets 
db_y_1 = db_1['target']
db_x_1 = db_1[db_top_20]
db_y_2 = db_2['target']
db_x_2 = db_2[db_top_20]
db_y_3 = db_3['target']
db_x_3 = db_3[db_top_20]

# Find top 5 feats from sample datasets

In [3]:
def skb_5_feats(x, y):
    X_train, X_test, y_train, y_test = train_test_split(x, 
                                                    y, 
                                                    test_size=.2, 
                                                    random_state=42)
    skb_list = []
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    skb = SelectKBest(k=5, score_func=f_regression)
    skb.fit(X_train_scaled, y_train)
    
    skb_feats = x.columns[skb.get_support()]
    
    skb_list.append(skb_feats)
    
    return skb_list

In [10]:
# find top 5 features from each sample set
db_1 = skb_5_feats(db_x_1, db_y_1)
db_2 = skb_5_feats(db_x_2, db_y_2)
db_3 = skb_5_feats(db_x_3, db_y_3)

print(np.sort(db_1))
print(np.sort(db_2))
print(np.sort(db_3))

[['feat_269' 'feat_341' 'feat_681' 'feat_701' 'feat_920']]
[['feat_269' 'feat_341' 'feat_681' 'feat_701' 'feat_920']]
[['feat_269' 'feat_341' 'feat_681' 'feat_701' 'feat_920']]


SKB of 3 samplesets show consistent top 5 features 

### Top 5 features of SKB from sample sets are not the same, try RFE

In [8]:
def rfe_5_feats(x, y, estimator = DecisionTreeClassifier(max_depth=10)):
    
    X_train, X_test, y_train, y_test = train_test_split(x, 
                                                    y, 
                                                    test_size=.2, 
                                                    random_state=42)
    
    rfe_list = []
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    rfe = RFE(estimator = estimator, n_features_to_select=5)
    rfe.fit(X_train_scaled, y_train)
    
    rfe_feats = x.columns[rfe.get_support()]
    rfe_list.append(rfe_feats)
    
    return rfe_list

In [9]:
db_1_rfe = rfe_5_feats(db_x_1, db_y_1)
db_2_rfe = rfe_5_feats(db_x_2, db_y_2)
db_3_rfe = rfe_5_feats(db_x_3, db_y_3)

print(np.sort(db_1_rfe))
print(np.sort(db_2_rfe))
print(np.sort(db_3_rfe))

[['feat_269' 'feat_639' 'feat_769' 'feat_808' 'feat_920']]
[['feat_269' 'feat_724' 'feat_736' 'feat_769' 'feat_829']]
[['feat_269' 'feat_736' 'feat_769' 'feat_808' 'feat_829']]


Top features from RFE overlap with some of SKB features but are not the exact same 5 features

### Try to find important features with Randomforest pipeline

In [11]:
from sklearn.pipeline import Pipeline

In [16]:
def feature_importance(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)
    
    
    rf_pipe = Pipeline([('scaler',StandardScaler()),
    ('clf',RandomForestClassifier(random_state=42))])
    
    rfparams = {
    'clf__n_estimators':[10,50],
    'clf__max_features':['auto','log2']}
    
    rfgs = GridSearchCV(rf_pipe, rfparams, cv=5, n_jobs=-1)
    
    rfgs.fit(X_train, y_train)
    
    important_features = rfgs.best_estimator_.named_steps['clf']
    
    return important_features

In [18]:
db_1 = feature_importance(db_x_1, db_y_1)

In [19]:
db_1

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

In [40]:
db_1 = feature_importance(db_x_1, db_y_1)

feats = {} # a dict to hold feature_name: feature_importance
for feature, importance in zip(db_x_1.columns, db_1.feature_importances_):
    feats[feature] = importance #add the name/value pair 

importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})

importances.sort_values(['Gini-importance'], ascending=False).head(5)

Unnamed: 0,Gini-importance
feat_269,0.063983
feat_681,0.059348
feat_920,0.056407
feat_808,0.055638
feat_395,0.054439


In [41]:
db_2 = feature_importance(db_x_2, db_y_2)

feats = {} # a dict to hold feature_name: feature_importance
for feature, importance in zip(db_x_2.columns, db_2.feature_importances_):
    feats[feature] = importance #add the name/value pair 

importances_2 = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})

importances_2.sort_values(['Gini-importance'], ascending=False).head(5)

Unnamed: 0,Gini-importance
feat_269,0.061722
feat_808,0.058546
feat_920,0.057916
feat_681,0.056911
feat_724,0.054921


In [42]:
db_3 = feature_importance(db_x_3, db_y_3)

feats = {} # a dict to hold feature_name: feature_importance
for feature, importance in zip(db_x_3.columns, db_3.feature_importances_):
    feats[feature] = importance #add the name/value pair 

importances_3 = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})

importances_3.sort_values(['Gini-importance'], ascending=False).head(5)

Unnamed: 0,Gini-importance
feat_269,0.061508
feat_920,0.060861
feat_681,0.060241
feat_308,0.055306
feat_701,0.055205


While there are some overlap of features with the other methods, results are still inconclusive of which 5 features are the most important, therefore, I'm keeping all 20 salient features in model creation.