In [33]:
#Manipulação de dados
import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import numpy as np

# Métricas de classificação
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

#Ignorar avisos (warnings)
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter(action='ignore', category=ConvergenceWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

#Regressão
from sklearn.linear_model import LogisticRegression

#Split de treino e teste
from sklearn.model_selection import train_test_split

#Seleção de feature
from sklearn.feature_selection import VarianceThreshold

## Funções Auxiliares

In [21]:
def fit_model(model, X_train, y_train):
    """Train ``model`` on the given data and hand the same object back.

    Parameters
    ----------
    model : object exposing ``fit(X, y)``
        Estimator to train; ``fit`` is called on it directly.
    X_train : training features, passed to ``model.fit`` unchanged.
    y_train : training targets, passed to ``model.fit`` unchanged.

    Returns
    -------
    The ``model`` argument itself (not the return value of ``fit``).
    """
    estimator = model
    estimator.fit(X_train, y_train)
    return estimator

In [36]:
def evaluate_model(name, model, X_test, y_test, show_confusion_matrix=False):
    """Print accuracy and F1 on the test set for a fitted model.

    Parameters
    ----------
    name : str
        Label printed in the report header.
    model : object exposing ``predict(X)``
        Fitted estimator used to produce the predictions.
    X_test : features passed to ``model.predict``.
    y_test : true labels the predictions are compared against.
    show_confusion_matrix : bool, default False
        When True, additionally print ``confusion_matrix(y_test, predictions)``.
        Off by default so existing calls keep printing exactly the same report.

    Returns
    -------
    None. The report is written to stdout.
    """
    pred_test = model.predict(X_test)

    print(f"\n===== {name} =====\n")
    print(f"   Acurácia (test) : {accuracy_score(y_test, pred_test):.4f}")
    print(f"   F1       (test) : {f1_score(y_test, pred_test):.4f}")

    if show_confusion_matrix:
        # A bare expression inside a function is never displayed by the
        # notebook, so the matrix has to be printed explicitly.
        print("   Matriz de confusão (test) :")
        print(confusion_matrix(y_test, pred_test))

## Preparação do dataset

In [22]:
# Load the feature table from the CSV file
dataset = pd.read_csv("dataset/PowerShellCorpus.ast.csv")

# Features are every column except 'Path' and 'Label'; the target is 'Label'
X = dataset.drop(columns=['Path', 'Label'])
y = dataset['Label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)

# Show (rows, columns) of the full table as the cell output
dataset.shape

(14702, 5000)

## Removendo features com variância zero

In [23]:
# Fit the variance filter on the training split only, so the test split has no
# say in which columns are kept. VarianceThreshold is used with its defaults.
# fit() returns the estimator, so it is chained instead of being bound to `_`
# (rebinding `_` would stop IPython from tracking the last cell output).
lv = VarianceThreshold().fit(X_train)

# Boolean array with one entry per column of X_train: True means the column is kept
mask = lv.get_support()

In [24]:
# Summarise how many columns the variance filter keeps and how many it drops
n_features_total = len(mask)
n_features_kept = sum(mask)

print(f"Total de Features                           : {n_features_total}")
print(f"Features com variância diferente de zero    : {n_features_kept}")
print(f"Features com variância zero (removidas)     : {n_features_total - n_features_kept}")

Total de Features                           : 4998
Features com variância diferente de zero    : 3463
Features com variância zero (removidas)     : 1535


In [25]:
# Apply the same column mask (fitted on the training split) to both splits,
# so train and test end up with identical feature columns.
X_train_non_zero_variance, X_test_non_zero_variance = (
    split.loc[:, mask] for split in (X_train, X_test)
)

In [26]:
# Preview the first rows of the filtered training features
X_train_non_zero_variance.head()

Unnamed: 0,AstArrayElementMetrics_CharacterDistribution_!_21_Count,AstArrayElementMetrics_CharacterDistribution_!_21_Percent,"AstArrayElementMetrics_CharacterDistribution__22_Count""","AstArrayElementMetrics_CharacterDistribution__22_Percent""",AstArrayElementMetrics_CharacterDistribution_#_23_Count,AstArrayElementMetrics_CharacterDistribution_#_23_Percent,AstArrayElementMetrics_CharacterDistribution_$_24_Count,AstArrayElementMetrics_CharacterDistribution_$_24_Percent,AstArrayElementMetrics_CharacterDistribution_%_25_Count,AstArrayElementMetrics_CharacterDistribution_%_25_Percent,AstArrayElementMetrics_CharacterDistribution_&_26_Count,AstArrayElementMetrics_CharacterDistribution_&_26_Percent,AstArrayElementMetrics_CharacterDistribution_(_28_Count,AstArrayElementMetrics_CharacterDistribution_(_28_Percent,AstArrayElementMetrics_CharacterDistribution_)_29_Count,AstArrayElementMetrics_CharacterDistribution_)_29_Percent,AstArrayElementMetrics_CharacterDistribution_*_2a_Count,AstArrayElementMetrics_CharacterDistribution_*_2a_Percent,"AstArrayElementMetrics_CharacterDistribution_,_2c_Count","AstArrayElementMetrics_CharacterDistribution_,_2c_Percent",AstArrayElementMetrics_CharacterDistribution_._2e_Count,AstArrayElementMetrics_CharacterDistribution_._2e_Percent,AstArrayElementMetrics_CharacterDistribution_/_2f_Count,AstArrayElementMetrics_CharacterDistribution_/_2f_Percent,AstArrayElementMetrics_CharacterDistribution_:_3a_Count,AstArrayElementMetrics_CharacterDistribution_:_3a_Percent,AstArrayElementMetrics_CharacterDistribution_;_3b_Count,AstArrayElementMetrics_CharacterDistribution_;_3b_Percent,AstArrayElementMetrics_CharacterDistribution_?_3f_Count,AstArrayElementMetrics_CharacterDistribution_?_3f_Percent,AstArrayElementMetrics_CharacterDistribution_@_40_Count,AstArrayElementMetrics_CharacterDistribution_@_40_Percent,AstArrayElementMetrics_CharacterDistribution_[_5b_Count,AstArrayElementMetrics_CharacterDistribution_[_5b_Percent,AstArrayElementMetrics_CharacterDistribut
ion_\_5c_Count,AstArrayElementMetrics_CharacterDistribution_\_5c_Percent,AstArrayElementMetrics_CharacterDistribution_]_5d_Count,AstArrayElementMetrics_CharacterDistribution_]_5d_Percent,AstArrayElementMetrics_CharacterDistribution_^_5e_Count,AstArrayElementMetrics_CharacterDistribution_^_5e_Percent,AstArrayElementMetrics_CharacterDistribution___5f_Count,AstArrayElementMetrics_CharacterDistribution___5f_Percent,AstArrayElementMetrics_CharacterDistribution_'_27_Count,AstArrayElementMetrics_CharacterDistribution_'_27_Percent,AstArrayElementMetrics_CharacterDistribution_-_2d_Count,AstArrayElementMetrics_CharacterDistribution_-_2d_Percent,AstArrayElementMetrics_CharacterDistribution_`_60_Count,AstArrayElementMetrics_CharacterDistribution_`_60_Percent,AstArrayElementMetrics_CharacterDistribution_{_7b_Count,AstArrayElementMetrics_CharacterDistribution_{_7b_Percent,...,AstVariableNameMetrics_CharacterDistribution_u_75_Count,AstVariableNameMetrics_CharacterDistribution_u_75_Percent,AstVariableNameMetrics_CharacterDistribution_UNKNOWN_UNICODE_Count,AstVariableNameMetrics_CharacterDistribution_UNKNOWN_UNICODE_Percent,AstVariableNameMetrics_CharacterDistribution_V_56_Count,AstVariableNameMetrics_CharacterDistribution_V_56_Percent,AstVariableNameMetrics_CharacterDistribution_v_76_Count,AstVariableNameMetrics_CharacterDistribution_v_76_Percent,AstVariableNameMetrics_CharacterDistribution_W_57_Count,AstVariableNameMetrics_CharacterDistribution_W_57_Percent,AstVariableNameMetrics_CharacterDistribution_w_77_Count,AstVariableNameMetrics_CharacterDistribution_w_77_Percent,AstVariableNameMetrics_CharacterDistribution_X_58_Count,AstVariableNameMetrics_CharacterDistribution_X_58_Percent,AstVariableNameMetrics_CharacterDistribution_x_78_Count,AstVariableNameMetrics_CharacterDistribution_x_78_Percent,AstVariableNameMetrics_CharacterDistribution_Y_59_Count,AstVariableNameMetrics_CharacterDistribution_Y_59_Percent,AstVariableNameMetrics_CharacterDistribution_y_79_Count,AstVariableNameMetric
s_CharacterDistribution_y_79_Percent,AstVariableNameMetrics_CharacterDistribution_Z_5a_Count,AstVariableNameMetrics_CharacterDistribution_Z_5a_Percent,AstVariableNameMetrics_CharacterDistribution_z_7a_Count,AstVariableNameMetrics_CharacterDistribution_z_7a_Percent,AstVariableNameMetrics_Count,AstVariableNameMetrics_Density_Average,AstVariableNameMetrics_Density_Maximum,AstVariableNameMetrics_Density_Median,AstVariableNameMetrics_Density_Minimum,AstVariableNameMetrics_Density_Mode,AstVariableNameMetrics_Density_Range,AstVariableNameMetrics_Entropy_Average,AstVariableNameMetrics_Entropy_Maximum,AstVariableNameMetrics_Entropy_Median,AstVariableNameMetrics_Entropy_Minimum,AstVariableNameMetrics_Entropy_Mode,AstVariableNameMetrics_Entropy_Range,AstVariableNameMetrics_Length_Average,AstVariableNameMetrics_Length_Maximum,AstVariableNameMetrics_Length_Median,AstVariableNameMetrics_Length_Minimum,AstVariableNameMetrics_Length_Mode,AstVariableNameMetrics_Length_Range,AstVariableNameMetrics_Length_Total,AstVariableNameMetrics_UpperAlphaPercent_Average,AstVariableNameMetrics_UpperAlphaPercent_Maximum,AstVariableNameMetrics_UpperAlphaPercent_Median,AstVariableNameMetrics_UpperAlphaPercent_Minimum,AstVariableNameMetrics_UpperAlphaPercent_Mode,AstVariableNameMetrics_UpperAlphaPercent_Range
1925,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.062,1,0.062,0,0.0,215,13.412,13,0.811,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,480,29.944,10,0.624,0,0.0,0,0.0,...,16,2.548,0,0.0,1,0.159,1,0.159,0,0.0,10,1.592,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,74,1.0,1,1.0,1.0,1.0,0.0,2.708,3.75,2.75,0.0,2.75,3.75,8.486,16,8,1,12,15,628,0.192,1.0,0.167,0.0,0.167,1.0
8322,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,1.205,1,1.205,0,0.0,5,6.024,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,7,8.434,0,0.0,7,8.434,0,0.0,0,0.0,6,7.229,0,0.0,0,0.0,0,0.0,...,0,0.0,0,0.0,0,0.0,1,5.882,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,1.0,1,1.0,1.0,1.0,0.0,3.22,3.22,3.22,3.22,3.22,0.0,17.0,17,17,17,17,0,17,0.294,0.294,0.294,0.294,0.294,0.0
713,0,0.0,4,2.041,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,4,2.041,0,0.0,0,0.0,6,3.061,0,0.0,0,0.0,0,0.0,0,0.0,16,8.163,0,0.0,0,0.0,0,0.0,8,4.082,0,0.0,0,0.0,0,0.0,...,23,4.197,0,0.0,13,2.372,2,0.365,4,0.73,11,2.007,0,0.0,0,0.0,0,0.0,22,4.015,0,0.0,0,0.0,73,1.0,1,1.0,1.0,1.0,0.0,2.338,3.75,2.522,0.0,2.522,3.75,7.507,17,7,1,7,16,548,0.239,1.0,0.25,0.0,0.286,1.0
9407,0,0.0,12,19.355,0,0.0,2,3.226,0,0.0,0,0.0,0,0.0,0,0.0,3,4.839,4,6.452,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,12,19.355,0,0.0,0,0.0,0,0.0,0,0.0,3,4.839,0,0.0,2,3.226,...,3,1.172,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,6,2.344,0,0.0,0,0.0,0,0.0,0,0.0,62,1.0,1,1.0,1.0,1.0,0.0,1.845,3.203,1.585,1.5,1.585,1.703,4.129,16,4,3,4,13,256,0.054,0.125,0.0,0.0,0.0,0.125
10874,0,0.0,2,2.941,0,0.0,3,4.412,0,0.0,0,0.0,1,1.471,1,1.471,0,0.0,1,1.471,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,2,2.941,0,0.0,0,0.0,...,4,1.064,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,50,1.0,1,1.0,1.0,1.0,0.0,2.572,3.392,2.75,1.5,2.75,1.892,7.52,18,8,4,8,14,376,0.156,0.25,0.167,0.0,0.25,0.25


## Regressão logística

In [34]:
# Fit a seeded logistic regression on the filtered training features.
# NOTE(review): ConvergenceWarning is silenced in the imports cell, so a fit
# that stops before converging would go unnoticed here — confirm convergence
# (e.g. inspect model.n_iter_) before trusting the scores.
model = fit_model(
    model=LogisticRegression(random_state=0),
    X_train=X_train_non_zero_variance,
    y_train=y_train,
)

In [37]:
# Report test-set metrics, using the test features filtered with the same
# column mask as the training features.
evaluate_model(
    name="Regressão Logística",
    model=model,
    X_test=X_test_non_zero_variance,
    y_test=y_test,
)


===== Regressão Logística =====

   Acurácia (test) : 0.9402
   F1       (test) : 0.9363
