In [None]:
#Manipulação de dados
import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import numpy as np

from matplotlib import pyplot as plt
%matplotlib inline 

# Métricas de classificação
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

# Suppress warnings (ConvergenceWarning and FutureWarning) -- these are warnings, not errors
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter(action='ignore', category=ConvergenceWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

#Regressão
from sklearn.linear_model import LogisticRegression

#MLP
from sklearn.neural_network import MLPClassifier

#Split de treino e teste
from sklearn.model_selection import train_test_split

#Seleção de feature
from sklearn.feature_selection import VarianceThreshold

## Funções Auxiliares

In [None]:
def fit_model(model, X_train, y_train):
    """
    Train ``model`` on the given data and hand the same object back.

    Parameters
    ----------
    model : object
        Any object exposing a ``fit(X, y)`` method.
    X_train, y_train
        Passed through to ``model.fit`` unchanged.

    Returns
    -------
    object
        The ``model`` instance that was passed in. Whatever ``fit`` itself
        returns is ignored.
    """
    model.fit(X_train, y_train)
    return model

In [None]:
def display_confusion_matrix(y_test, pred_test):
    """
    Plot the confusion matrix of ``pred_test`` against ``y_test``.

    The matrix is computed with ``confusion_matrix`` and drawn as a light-blue
    grid with the count written in the centre of each cell. The figure is
    shown immediately via ``plt.show()``; nothing is returned.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    pred_test : array-like
        Predicted labels, aligned with ``y_test``.
    """
    cm = confusion_matrix(y_test, pred_test)
    n_rows, n_cols = cm.shape

    fig, ax = plt.subplots(figsize=(7.5, 7.5))
    ax.matshow(cm, cmap=plt.cm.Blues, alpha=0.3)

    # Row index goes on the y axis and column index on the x axis, so the
    # cell text lines up with the 'True label' / 'Predicted label' axes below.
    for row in range(n_rows):
        for col in range(n_cols):
            ax.text(x=col, y=row, s=cm[row, col], va='center', ha='center', size='xx-large')

    # Configure the Axes we created directly instead of relying on pyplot's
    # implicit "current axes" state.
    ax.set_title('Matriz de confusão')
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    plt.show()

In [None]:
def evaluate_model(name, model, X_test, y_test):
    """
    Print accuracy and F1 for a fitted model and plot its confusion matrix.

    Parameters
    ----------
    name : str
        Label printed in the report header.
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_test, y_test
        Held-out features and their true labels.

    Nothing is returned; the results are printed and plotted.
    """
    predictions = model.predict(X_test)

    print(f"\n===== {name} =====\n")

    # Each metric is computed right before it is printed, so a failure in a
    # later metric still leaves the earlier lines on screen.
    accuracy = accuracy_score(y_test, predictions)
    print("   Acurácia (test) :", format(accuracy, ".4f"))

    f1 = f1_score(y_test, predictions)
    print("   F1       (test) :", format(f1, ".4f"))

    display_confusion_matrix(y_test, predictions)

## Preparação do dataset

In [None]:
# Read the corpus CSV into a DataFrame.
dataset = pd.read_csv("dataset/PowerShellCorpus.ast.csv")

# Target is the 'Label' column; features are every column except 'Path' and 'Label'.
y = dataset["Label"]
X = dataset.drop(columns=["Path", "Label"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)

# Last expression of the cell: shows the shape of the raw table.
dataset.shape

## Removendo features com variância zero

In [None]:
# Fit the selector with its default settings on the training split only, so
# the test split plays no part in deciding which features are kept.
lv = VarianceThreshold()
# Call fit for its side effect. Binding the result to `_` would overwrite
# IPython's last-output variable and stop the shell from updating it.
lv.fit(X_train)

# Boolean array with one entry per column of X_train: True means "kept".
mask = lv.get_support()

In [None]:
# Summarise the selector's boolean mask: total, kept, and dropped features.
n_features = len(mask)
n_kept = mask.sum()

print(f"Total de Features                           : {n_features}")
print(f"Features com variância diferente de zero    : {n_kept}")
print(f"Features com variância zero (removidas)     : {n_features - n_kept}")

In [None]:
# Apply the same column mask to both splits so train and test keep an
# identical set of features.
X_train_non_zero_variance, X_test_non_zero_variance = (
    split.loc[:, mask] for split in (X_train, X_test)
)

In [None]:
# Preview the first rows of the variance-filtered training features.
X_train_non_zero_variance.head()

## Regressão logística

In [None]:
# Model 1: logistic regression trained on the variance-filtered features.
model1 = fit_model(LogisticRegression(random_state=0), X_train_non_zero_variance, y_train)

In [None]:
# Model 2: same estimator settings, trained on the full, unfiltered feature set.
model2 = fit_model(LogisticRegression(random_state=0), X_train, y_train)

In [None]:
# Score model 1 on the test split restricted to the same columns it was trained on.
evaluate_model("Regressão Logística - 1", model1, X_test_non_zero_variance, y_test)

In [None]:
# Score model 2 on the unfiltered test split, matching its training features.
evaluate_model("Regressão Logística - 2", model2, X_test, y_test)

In [None]:
# Predict first, then build the matrix: `pred_test` must exist before
# `confusion_matrix` reads it, otherwise a fresh-kernel "Run All" stops here
# with a NameError.
pred_test = model2.predict(X_test)
cm = confusion_matrix(y_test, pred_test)

In [None]:
# Network shape: `layer` hidden layers, each `neurons` units wide.
layer    = 5
neurons  = 20
# NOTE(review): scikit-learn documents `momentum` as only used when
# solver='sgd'; no solver is set here, so this value likely has no effect.
# Confirm whether SGD was intended.
momentum = 0.9

# hidden_layer_sizes takes one entry per hidden layer (that layer's width).
# Five layers of twenty units is therefore (20, 20, 20, 20, 20); writing
# (layer, neurons) would instead build two layers of 5 and 20 units.
# random_state pins the weight initialisation so reruns give the same model.
model3 = fit_model(
    MLPClassifier(
        hidden_layer_sizes=(neurons,) * layer,
        momentum=momentum,
        max_iter=5000,
        random_state=0,
    ),
    X_train_non_zero_variance,
    y_train,
)

In [None]:
# Score the MLP on the variance-filtered test split, matching its training features.
evaluate_model("MLP", model3, X_test_non_zero_variance, y_test)