In [None]:
import pandas as pd
import numpy as np
import pickle as pkl
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


### Load the dataset

In [None]:
# Load the master table; the first CSV column becomes the row index.
data = pd.read_csv('../Master.csv', index_col=0)

# Feature column names used to build X below.
# NOTE(review): the forward-selection cell slices this sequence from the front,
# so it is presumably ordered by feature importance — confirm.
# SECURITY: pickle.load can execute arbitrary code; only load a pickle file
# that was produced by a trusted step of this project.
with open("./action_features.pkl", 'rb') as features_file:
    feature_names = pkl.load(features_file)

X = data[feature_names]
y = data['actiontype']

### Forward feature selection results

In [None]:
# Forward feature selection: for k = 1..len(feature_names), train an
# ExtraTreesClassifier on the first k features and record the per-class
# metrics plus the confusion-matrix counts for that k.
# NOTE(review): assumes `actiontype` is a binary 0/1 label with 1 as the
# positive (malicious) class — confirm against the dataset.
report_dict = {}
for n_features in range(1, len(feature_names) + 1):
    print(n_features)
    X_subset = data[feature_names[:n_features]]

    # Split first, then fit the scaler on the training rows only, so that no
    # statistics from the held-out rows leak into the model input.
    X_train, X_test, y_train, y_test = train_test_split(
        X_subset, y, test_size=0.30, random_state=42, stratify=y
    )
    scaler = MinMaxScaler().fit(X_train)
    X_train_norm = scaler.transform(X_train)
    X_test_norm = scaler.transform(X_test)

    clf = ExtraTreesClassifier(random_state=0, n_jobs=-1)
    clf.fit(X_train_norm, y_train)
    y_pred = clf.predict(X_test_norm)

    print(classification_report(y_test, y_pred))
    report = classification_report(y_test, y_pred, output_dict=True)

    # scikit-learn lays the binary confusion matrix out as
    # [[TN, FP], [FN, TP]] (rows = true label, columns = predicted label,
    # labels in sorted order), so ravel() yields tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    report_dict[n_features] = {
        '0_precision': report['0']['precision'],
        '0_recall': report['0']['recall'],
        '0_f1-score': report['0']['f1-score'],
        '1_precision': report['1']['precision'],
        '1_recall': report['1']['recall'],
        '1_f1-score': report['1']['f1-score'],
        'accuracy': report['accuracy'],
        'macro_avg_precision': report['macro avg']['precision'],
        'macro_avg_recall': report['macro avg']['recall'],
        'macro_avg_f1-score': report['macro avg']['f1-score'],
        'weighted_avg_precision': report['weighted avg']['precision'],
        'weighted_avg_recall': report['weighted avg']['recall'],
        'weighted_avg_f1-score': report['weighted avg']['f1-score'],
        'tp': tp,
        'tn': tn,
        'fp': fp,
        'fn': fn,
    }

    # Rewrite the CSV after every step so partial results survive an
    # interrupted run. The index (number of features) is labelled 'feature'
    # because the plotting cells read that column by name.
    pd.DataFrame.from_dict(report_dict, orient='index').to_csv(
        "./action_convergence.csv", index_label='feature'
    )
    print("#" * 100)

### Forward feature selection convergence plot

In [None]:
# Load the per-step metrics saved by the forward-selection cell. If the CSV
# was written with an unlabelled index, pandas names that column
# 'Unnamed: 0'; rename it so the plots can address it as 'feature'. The
# rename is a no-op when the column is already called 'feature'.
convergence_df = (
    pd.read_csv('./action_convergence.csv')
    .rename(columns={'Unnamed: 0': 'feature'})
)

# F1-score of class 1 against the number of features used.
ax = sns.lineplot(x='feature', y='1_f1-score', data=convergence_df)
ax.set_title('Forward Feature Selection')
ax.set(xlabel='Number Of Features', ylabel='Malicious F1-score')
# Vertical marker at 10 features.
# NOTE(review): presumably the chosen cut-off — confirm and consider a named constant.
ax.axvline(10)
ax.figure.savefig("Forward_Feature_Selection.png")

### False Omission Rate and False Positive Rate versus number of features in forward feature selection

* False Positive Rate

In [None]:
# False positive rate per step: FP / (TN + FP), stored as a new 'fpr' column.
convergence_df['fpr'] = convergence_df['fp'].div(
    convergence_df['tn'] + convergence_df['fp']
)

# Line plot of FPR against the number of features; x-axis limited to 0-50.
ax = sns.lineplot(data=convergence_df, x='feature', y='fpr')
ax.set(
    title='Malware Detector',
    xlabel='Number Of Features',
    ylabel='False Positive Rate',
)
ax.set_xlim(0, 50)
ax.grid()
ax.figure.savefig('./FPR.png')

* False Omission Rate

In [None]:
# False omission rate per step: FN / (TN + FN), stored as a new 'for' column.
convergence_df['for'] = convergence_df['fn'].div(
    convergence_df['tn'] + convergence_df['fn']
)

# Line plot of FOR against the number of features; x-axis limited to 0-50
# and y-axis to 0-0.5.
ax = sns.lineplot(data=convergence_df, x='feature', y='for')
ax.set(
    title='Malware Detector',
    xlabel='Number Of Features',
    ylabel='False Omission Rate',
)
ax.set_xlim(0, 50)
ax.set_ylim(0, 0.5)
ax.grid()
ax.figure.savefig('./FOR.png')