In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

In [None]:
from numpy import mean

from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectFromModel, RFECV, RFE
from sklearn.ensemble import RandomForestClassifier

import warnings

In [None]:
from DataPreparation import DataPreparation

In [None]:
# Directory that receives every figure this notebook writes with savefig.
pics = './images'
# savefig does not create missing parent directories, so make sure the
# target exists before any plotting cell runs (no-op when already present).
os.makedirs(pics, exist_ok=True)

In [None]:
# Read ./data/balanced_dataframe.csv; index_col=None keeps pandas' default
# integer index instead of promoting one of the columns to the index.
df = pd.read_csv(filepath_or_buffer='./data/balanced_dataframe.csv', index_col=None)
# Preview the first five rows as the cell output.
df.head(n=5)

In [None]:
# Target vector: the 'label' column.
y = df['label']
# Feature matrix: every column except 'label'.
X = df.drop(labels=['label'], axis='columns')

In [None]:
# Hold out 30% of the rows for testing. The split is stratified on y so both
# parts keep the label proportions, and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.3,
)
# Per-class row counts of the held-out part (cell output).
y_test.value_counts()

In [None]:
# Hand the four raw splits to the project's DataPreparation helper.
data_preparation = DataPreparation(X_train, X_test, y_train, y_test)

# Rebind the split names to whatever clean_dataset() returns.
# NOTE(review): this overwrites the raw splits, so re-running this cell alone
# feeds already-cleaned data back into DataPreparation — confirm that is safe.
X_train, X_test, y_train, y_test = data_preparation.clean_dataset()
# Read after clean_dataset(); presumably the names of the columns that survive
# cleaning, in X_train column order — TODO confirm in DataPreparation.
feats = data_preparation.feature_names
# Show the dimensions of the cleaned training matrix (cell output).
X_train.shape

## Feature selection

### Feature selection using Random Forest Classifier

In [None]:
# Fit a seeded random forest on every training column; its feature
# importances drive both the selection and the ranking below.
Rtree_clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# prefit=True makes SelectFromModel reuse the fitted forest instead of refitting.
model = SelectFromModel(Rtree_clf, prefit=True)
selected_mask = model.get_support()
RF_tree_featuresTrain = X_train.loc[:, selected_mask]
RF_tree_featuresTest = X_test.loc[:, selected_mask]
# NOTE(review): this silences every warning for the rest of the kernel
# session, not just this cell — confirm that is intended.
warnings.filterwarnings('ignore')

# Forest-level importances plus their spread across the individual trees.
importances = Rtree_clf.feature_importances_
per_tree_importances = [tree.feature_importances_ for tree in Rtree_clf.estimators_]
std = np.std(per_tree_importances, axis=0)
# Column positions ordered from most to least important, capped at 200.
indices = np.argsort(importances)[::-1][:200]

In [None]:
# Look up the feature names at the first 20 positions of `indices`.
# NOTE(review): assumes `feats` accepts integer-array indexing (e.g. a NumPy
# array or pandas Index) and is ordered like X_train's columns — confirm.
top_20_rf = feats[indices[:20]]
# Display the selected names (cell output).
top_20_rf

In [None]:
# Importance of every training column, labelled by column name and sorted
# from most to least important.
feature_scores = pd.Series(Rtree_clf.feature_importances_, index=X_train.columns).sort_values(ascending=False)

# Horizontal bar chart of the 20 top-ranked features from the random forest.
f, ax = plt.subplots(figsize=(10, 10))
# Draw on the axes created above explicitly rather than relying on the
# implicit "current axes".
ax = sns.barplot(x=importances[indices][:20], y=top_20_rf, ax=ax)
# Title spelling corrected ("importance"); this text ends up in the saved PNG.
ax.set_title("Top 20 features with the most importance score from RFC")
ax.set_xlabel("Feature importance score")
plt.autoscale()
plt.savefig(f'{pics}/rf_feat_score.png')

In [None]:
# Importances of the ranked features kept in `indices`, in ranking order.
ranked_importances = importances[indices]
n_ranked = len(ranked_importances)

# Always start from a fresh figure: plt.figure(1, ...) would re-activate an
# already-open figure number 1 and draw these bars on top of its contents.
fig, ax = plt.subplots(figsize=(10, 10))
# Error bars show the spread of each importance across the forest's trees.
ax.bar(range(n_ranked), ranked_importances, color="r", yerr=std[indices], align="center")
ax.set_title("Feature importances")
ax.set_xlabel("# of Features ")
ax.set_ylabel("Importance Score")
# Bars are centred on 0..n-1, so the view starts half a slot to the left;
# starting at 0 would cut the first (most important) bar in half.
ax.set_xlim(-0.5, n_ranked - 0.5)
plt.show()

### 1.13.3. Recursive Feature Elimination

In [None]:
# Forest used to rank features during elimination. It is seeded so the
# elimination order, and therefore the selected feature count, is repeatable
# across runs.
clf = RandomForestClassifier(n_estimators=50, max_depth=20, random_state=42)
# Elimination never goes below this many features.
min_features_to_select = 10
# 10-fold stratified CV repeated 3 times, with a fixed seed for the fold assignment.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Drop one feature per step and score each subset size by CV accuracy.
rfecv = RFECV(
    estimator=clf,
    step=1,
    cv=cv,
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
    n_jobs=-1,
)

# Number of leading training rows fed to the search; currently all of them.
# Lower `size` for a quicker exploratory run.
size = X_train.shape[0]
rfecv.fit(X_train[:size], y_train[:size])
print('Optimal number of features :', rfecv.n_features_)

In [None]:
# Cross-validated accuracy (mean and std) recorded for each subset size.
mean_scores = rfecv.cv_results_["mean_test_score"]
std_scores = rfecv.cv_results_["std_test_score"]
n_scores = len(mean_scores)
# One x position per score, starting at the smallest subset size allowed.
feature_counts = range(min_features_to_select, min_features_to_select + n_scores)

fig, ax = plt.subplots(figsize=(10, 8))
ax.errorbar(feature_counts, mean_scores, yerr=std_scores)
ax.set_xlabel("Number of features selected")
ax.set_ylabel("Mean test accuracy")
ax.set_title("Recursive Feature Elimination \nwith correlated features")
fig.savefig(f'{pics}/rfe_with_corr_feat.png')

### Applying RFE with optimal number of features found in RFECV

In [None]:
# Seeded forest, reused by the following cells for evaluation.
clf = RandomForestClassifier(max_depth=20, n_estimators=50, random_state=42)
# Eliminate one feature per step until the count chosen by RFECV remains.
rfe = RFE(
    estimator=clf,
    n_features_to_select=rfecv.n_features_,
    step=1,
).fit(X_train, y_train)

In [None]:
# Keep only the columns RFE retained, in both splits.
support_mask = rfe.get_support()
rfe_train = pd.DataFrame(X_train).loc[:, support_mask]
rfe_test = pd.DataFrame(X_test).loc[:, support_mask]

# Mean 5-fold cross-validated score on the reduced training set.
# NOTE(review): the columns were chosen using all of X_train, so this
# estimate is likely optimistic.
cv_scores = cross_val_score(clf, rfe_train, y_train, cv=5)
print("Train Accuracy:", cv_scores.mean())

# Refit on the full reduced training set and predict the held-out split.
clf = clf.fit(rfe_train, y_train)
y_pred = clf.predict(rfe_test)
print()

print(classification_report(y_test, y_pred))

In [None]:
# Names of the features RFE kept, picked out of `feats` with RFE's boolean mask.
# NOTE(review): assumes `feats` supports boolean-mask indexing and is ordered
# like the columns RFE was fitted on — confirm in DataPreparation.
rfe_feats = feats[rfe.get_support()]
# Display the retained names (cell output).
rfe_feats

In [None]:
# Importances of `clf`, the forest refit on the RFE-selected columns.
importance = clf.feature_importances_
# Spread of those importances across the trees of the same forest. It is taken
# from `clf` (not `Rtree_clf`, whose per-tree arrays cover every original
# column) so it lines up element-for-element with `importance`. It has its own
# name so the `std` used by the full-feature plot is left untouched.
rfe_importance_std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
# Label each importance with its feature name and keep the 20 largest.
forest_importances = pd.Series(importance, index=rfe_feats)
top_20_rfe = forest_importances.sort_values(ascending=False).head(20)
top_20_rfe

In [None]:
# Horizontal bar chart of the 20 largest importances after RFE.
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.barplot(x=top_20_rfe, y=top_20_rfe.index, ax=ax)
ax.set(
    title="Top 20 features with the most importance score from RFECV (RFC)",
    xlabel="Mean decrease in impurity",
)
fig.tight_layout()
fig.savefig(f'{pics}/rfe_feat_score.png')

In [None]:
# Side-by-side comparison of the two top-20 rankings.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 5))
fig.suptitle("Top 20 features with the most importance score")

# Left panel: ranking from the forest trained on all features.
ax1 = sns.barplot(x=importances[indices][:20], y=top_20_rf, ax=ax1)
ax1.set_title("RFC")
# Argument-less plot() adds no artists but requests an autoscale of this axes.
ax1.plot()

# Right panel: ranking from the forest refit on the RFE-selected features.
ax2 = sns.barplot(x=top_20_rfe, y=top_20_rfe.index, ax=ax2)
ax2.plot()
ax2.set_title("RFECV")

# plt.autoscale() acts on the current axes only.
plt.autoscale()
fig.tight_layout()
fig.savefig(f'{pics}/feat_score_comb.png')