## 2.2.2 Feature Selection Methods (Gradient Boosting)

## Pearson Correlation

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def pearson_selection(X, y, num_of_feat):
    fs = SelectKBest(score_func=f_regression, k=num_of_feat)
    fit = fs.fit(X, y)

    dfscores = pd.DataFrame(fit.scores_)
    dfcolumns = pd.DataFrame(X.columns)
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Best_columns', 'Score_pearson']

    lyst = featureScores.nlargest(num_of_feat, 'Score_pearson')

    pearson_features = list(lyst['Best_columns'])
    return pearson_features

# Load your dataset
df = pd.read_csv("feature_vectors_syscallsbinders_frequency_5_Cat.csv")
target = "Class"

X = df.loc[:, df.columns != target]
#X = X.loc[:, X.columns != "NAME"]
y = df[target]

# Iterate over different numbers of features
for i in range(47, len(X.columns), 47):
    print("Number of columns: ", i)
    pearson_list = pearson_selection(X, y, i)

    X_ = df.loc[:, pearson_list]
    y_ = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.2, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=236, max_depth=11, random_state=0)
    gbc.fit(X_train, y_train)
    
    y_pred_test = gbc.predict(X_test)
    print("Testing Accuracy: ", accuracy_score(y_test, y_pred_test))
    
    y_pred_train = gbc.predict(X_train)
    print("Training Accuracy: ", accuracy_score(y_train, y_pred_train))
    print("___________________________________________________________________________________________")


Number of columns:  47
Testing Accuracy:  0.9301724137931034
Training Accuracy:  0.9977365811597327
___________________________________________________________________________________________
Number of columns:  94
Testing Accuracy:  0.9387931034482758
Training Accuracy:  0.9984910541064884
___________________________________________________________________________________________
Number of columns:  141
Testing Accuracy:  0.9469827586206897
Training Accuracy:  0.998598835956025
___________________________________________________________________________________________
Number of columns:  188
Testing Accuracy:  0.9431034482758621
Training Accuracy:  0.9987066178055616
___________________________________________________________________________________________
Number of columns:  235
Testing Accuracy:  0.9461206896551724
Training Accuracy:  0.9987066178055616
___________________________________________________________________________________________
Number of columns:  282
Testing Accura

## Mutual Information

In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def mutual_info_selection(X, y, num_of_feat):
    fs = SelectKBest(score_func=mutual_info_classif, k=num_of_feat)
    fit = fs.fit(X, y)

    dfscores = pd.DataFrame(fit.scores_)
    dfcolumns = pd.DataFrame(X.columns)
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Best_columns', 'Score_mutual_info']

    lyst = featureScores.nlargest(num_of_feat, 'Score_mutual_info')

    mutual_info_features = list(lyst['Best_columns'])
    return mutual_info_features

# Load your dataset
df = pd.read_csv("feature_vectors_syscallsbinders_frequency_5_Cat.csv")
target = "Class"

X = df.loc[:, df.columns != target]
# X = X.loc[:, X.columns != "NAME"]
y = df[target]

# Iterate over different numbers of features
for i in range(47, len(X.columns), 47):
    print("Number of columns: ", i)
    mutual_info_list = mutual_info_selection(X, y, i)

    X_ = df.loc[:, mutual_info_list]
    y_ = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.2, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=236, max_depth=11, random_state=0)
    gbc.fit(X_train, y_train)
    
    y_pred_test = gbc.predict(X_test)
    print("Testing Accuracy: ", accuracy_score(y_test, y_pred_test))
    
    y_pred_train = gbc.predict(X_train)
    print("Training Accuracy: ", accuracy_score(y_train, y_pred_train))
    print("___________________________________________________________________________________________")



Number of columns:  47
Testing Accuracy:  0.9357758620689656
Training Accuracy:  0.9987066178055616
___________________________________________________________________________________________
Number of columns:  94
Testing Accuracy:  0.9452586206896552
Training Accuracy:  0.9987066178055616
___________________________________________________________________________________________
Number of columns:  141
Testing Accuracy:  0.9456896551724138
Training Accuracy:  0.9987066178055616
___________________________________________________________________________________________
Number of columns:  188
Testing Accuracy:  0.9469827586206897
Training Accuracy:  0.9987066178055616
___________________________________________________________________________________________
Number of columns:  235
Testing Accuracy:  0.9508620689655173
Training Accuracy:  0.9987066178055616
___________________________________________________________________________________________
Number of columns:  282
Testing Accur

## Recursive Feature Elimination 

In [3]:
'''
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

def rfe_selection(X, y, num_of_feat):
    model = LogisticRegression(max_iter=1000)
    rfe = RFE(model, n_features_to_select=num_of_feat)
    fit = rfe.fit(X, y)

    feature_ranks = pd.DataFrame(fit.ranking_, index=X.columns, columns=['Rank'])
    selected_features = feature_ranks[feature_ranks['Rank'] == 1].index.tolist()
    
    return selected_features

# Load your dataset
df = pd.read_csv("permission-based_malware_2.csv")
target = "CLASS"

X = df.loc[:, df.columns != target]
X = X.loc[:, X.columns != "NAME"]
y = df[target]

# Iterate over different numbers of features
for i in range(50, len(X.columns), 50):
    print("Number of columns: ", i)
    rfe_list = rfe_selection(X, y, i)

    X_ = df.loc[:, rfe_list]
    y_ = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.2, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=186, max_depth=6, random_state=0)
    gbc.fit(X_train, y_train)
    
    y_pred_test = gbc.predict(X_test)
    print("Testing Accuracy: ", accuracy_score(y_test, y_pred_test))
    
    y_pred_train = gbc.predict(X_train)
    print("Training Accuracy: ", accuracy_score(y_train, y_pred_train))
    print("___________________________________________________________________________________________")
'''

'\nimport numpy as np\nimport pandas as pd\nfrom sklearn.feature_selection import RFE\nfrom sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\n\ndef rfe_selection(X, y, num_of_feat):\n    model = LogisticRegression(max_iter=1000)\n    rfe = RFE(model, n_features_to_select=num_of_feat)\n    fit = rfe.fit(X, y)\n\n    feature_ranks = pd.DataFrame(fit.ranking_, index=X.columns, columns=[\'Rank\'])\n    selected_features = feature_ranks[feature_ranks[\'Rank\'] == 1].index.tolist()\n    \n    return selected_features\n\n# Load your dataset\ndf = pd.read_csv("permission-based_malware_2.csv")\ntarget = "CLASS"\n\nX = df.loc[:, df.columns != target]\nX = X.loc[:, X.columns != "NAME"]\ny = df[target]\n\n# Iterate over different numbers of features\nfor i in range(50, len(X.columns), 50):\n    print("Number of columns: ", i)\n    rfe_list = rfe_s

## Chi-square Test

In [4]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def chi2_selection(X, y, num_of_feat):
    fs = SelectKBest(score_func=chi2, k=num_of_feat)
    fit = fs.fit(X, y)

    dfscores = pd.DataFrame(fit.scores_)
    dfcolumns = pd.DataFrame(X.columns)
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Best_columns', 'Score_chi2']

    lyst = featureScores.nlargest(num_of_feat, 'Score_chi2')

    chi2_features = list(lyst['Best_columns'])
    return chi2_features

# Load your dataset
df = pd.read_csv("feature_vectors_syscallsbinders_frequency_5_Cat.csv")
target = "Class"

X = df.loc[:, df.columns != target]
#X = X.loc[:, X.columns != "NAME"]
y = df[target]

# Iterate over different numbers of features
for i in range(47, len(X.columns), 47):
    print("Number of columns: ", i)
    chi2_list = chi2_selection(X, y, i)

    X_ = df.loc[:, chi2_list]
    y_ = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.2, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=236, max_depth=11, random_state=0)
    gbc.fit(X_train, y_train)
    
    y_pred_test = gbc.predict(X_test)
    print("Testing Accuracy: ", accuracy_score(y_test, y_pred_test))
    
    y_pred_train = gbc.predict(X_train)
    print("Training Accuracy: ", accuracy_score(y_train, y_pred_train))
    print("___________________________________________________________________________________________")


Number of columns:  47
Testing Accuracy:  0.944396551724138
Training Accuracy:  0.9987066178055616
___________________________________________________________________________________________
Number of columns:  94
Testing Accuracy:  0.9474137931034483
Training Accuracy:  0.9987066178055616
___________________________________________________________________________________________
Number of columns:  141
Testing Accuracy:  0.944396551724138
Training Accuracy:  0.9987066178055616
___________________________________________________________________________________________
Number of columns:  188
Testing Accuracy:  0.9461206896551724
Training Accuracy:  0.9987066178055616
___________________________________________________________________________________________
Number of columns:  235
Testing Accuracy:  0.95
Training Accuracy:  0.9987066178055616
___________________________________________________________________________________________
Number of columns:  282
Testing Accuracy:  0.95086206

## ANOVA F-test

In [5]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def anova_selection(X, y, num_of_feat):
    fs = SelectKBest(score_func=f_classif, k=num_of_feat)
    fit = fs.fit(X, y)

    dfscores = pd.DataFrame(fit.scores_)
    dfcolumns = pd.DataFrame(X.columns)
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Best_columns', 'Score_anova']

    lyst = featureScores.nlargest(num_of_feat, 'Score_anova')

    anova_features = list(lyst['Best_columns'])
    return anova_features

# Load your dataset
df = pd.read_csv("feature_vectors_syscallsbinders_frequency_5_Cat.csv")
target = "Class"

X = df.loc[:, df.columns != target]
# X = X.loc[:, X.columns != "NAME"]
y = df[target]

# Iterate over different numbers of features
for i in range(47, len(X.columns), 47):
    print("Number of columns: ", i)
    anova_list = anova_selection(X, y, i)

    X_ = df.loc[:, anova_list]
    y_ = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.2, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=236, max_depth=11, random_state=0)
    gbc.fit(X_train, y_train)
    
    y_pred_test = gbc.predict(X_test)
    print("Testing Accuracy: ", accuracy_score(y_test, y_pred_test))
    
    y_pred_train = gbc.predict(X_train)
    print("Training Accuracy: ", accuracy_score(y_train, y_pred_train))
    print("___________________________________________________________________________________________")



Number of columns:  47
Testing Accuracy:  0.9271551724137931
Training Accuracy:  0.9952575986203923
___________________________________________________________________________________________
Number of columns:  94
Testing Accuracy:  0.9387931034482758
Training Accuracy:  0.9978443630092693
___________________________________________________________________________________________
Number of columns:  141
Testing Accuracy:  0.9487068965517241
Training Accuracy:  0.998598835956025
___________________________________________________________________________________________
Number of columns:  188
Testing Accuracy:  0.9482758620689655
Training Accuracy:  0.9987066178055616
___________________________________________________________________________________________
Number of columns:  235
Testing Accuracy:  0.9431034482758621
Training Accuracy:  0.9987066178055616
___________________________________________________________________________________________
Number of columns:  282
Testing Accura