In [1]:
#CELDA 1: Imports y rutas

import sys
import random
from pathlib import Path

%run 00_setup.ipynb

#RUTAS
from src.config.rutas import BASE_DIR

import pandas as pd
import joblib

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)

from src.evasion.perturbaciones_carac import aplicar_grado_perturbaciones
from src.evasion.ofuscacion_urls import aplicar_grado_ofuscacion
from src.evasion.parafraseo_semantico import aplicar_variantes_semanticas
from src.evasion.template_mixing import generar_template_mixing

In [2]:
#CELDA 2: Cargar vectorizador, modelos y test

MODELS_DIR = BASE_DIR / "models"

tfidf = joblib.load(MODELS_DIR / "tfidf_vect_v1.joblib")

svm = joblib.load(MODELS_DIR / "modelo_svm_v1.joblib")
mlp = joblib.load(MODELS_DIR / "modelo_mlp_v1.joblib")
ens = joblib.load(MODELS_DIR / "modelo_ens_v1.joblib")

dt_test = pd.read_csv(MODELS_DIR / "test_v1.csv")

print("Listo")
dt_test.head()

Listo


Unnamed: 0,id,vector,label,texto_preprocesado
0,11523,email,0,philipp thomas phdsusede david c rankin vhdicq...
1,6289,url,1,https://pzabenltniu04.z21.web.core.windows.net/
2,11817,url,0,https://www.kerrylibrary.ie
3,15441,url,1,http://www.chengxinjiameng.com
4,714,sms,0,gudnite....tc...practice going on


In [3]:
#CELDA 3: Metricas antes de modificar con evasion

x_test = tfidf.transform(dt_test["texto_preprocesado"].to_list())
y_test = dt_test["label"].to_numpy()

def evaluar_modelos(nombre, modelo, x_test, y_test, exist_prob: bool = False):

    y_pred = modelo.predict(x_test)

    metricas = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, average="macro", zero_division=0),
        "recall": recall_score(y_test, y_pred, average="macro", zero_division=0),
        "f1_macro": f1_score(y_test, y_pred, average="macro", zero_division=0),
        "f1_weighted": f1_score(y_test, y_pred, average="weighted", zero_division=0) 
    }

    if exist_prob == True:
        y_prob = modelo.predict_proba(x_test)[:, 1]
        metricas["roc_auc"] = roc_auc_score(y_test, y_prob)

    print(f"\n.: {nombre} :.")
    print(f"Accuracy:   {metricas['accuracy']:.4f}")
    print(f"Precision:   {metricas['precision']:.4f}")
    print(f"Recall:   {metricas['recall']:.4f}")
    print(f"F1_macro:   {metricas['f1_macro']:.4f}")
    print(f"F1_weighted:   {metricas['f1_weighted']:.4f}")

    if exist_prob:
        print(f"ROC_AUC:   {metricas['roc_auc']:.4f}")
    else:
        print("No aplicable al ser modelo sin probabilidades")

    print(f"\nReport de clasificación {nombre}:\n")
    print(classification_report(y_test, y_pred, zero_division=0))

    print("\nMatriz de confusion:\n")
    print(confusion_matrix(y_test, y_pred))

    return metricas


In [4]:
#CELDA 4: Definicion de la funcion que aplica variantes evasivas

def generar_variantes_evasivas(row, nivel: str = "medio", prob_semantica = 0.3, prob_template = 0.6, prob_perturbacion=0.8):

    texto = row["texto_preprocesado"]
    v = row["vector"].lower()

    #Para muestras de tipo URL
    if v == "url":
        return aplicar_grado_ofuscacion(texto, nivel=nivel) if random.random() < 0.9 else texto
    
    t = texto

    if random.random() < prob_perturbacion:
        t = aplicar_grado_perturbaciones(texto, nivel=nivel)
    
    if random.random() < prob_template:
        t = generar_template_mixing(texto, nivel=nivel)

    if random.random() < prob_semantica:
        t = aplicar_variantes_semanticas(texto, metodo="back_translation")

    return t

In [5]:
#CELDA 5: Generar variantes evasivas

random.seed(42)

dt_test_evasivo = dt_test.copy()

dt_test_evasivo["texto_evasivo"] = dt_test_evasivo["texto_preprocesado"]

mascara = dt_test_evasivo["label"] == 1          #Se seleccionan los textos maliciosos del test para apllicar las variantes evasivas
dt_mal = dt_test_evasivo.loc[mascara].copy()

NUM_MUESTRAS_SEM = 50
semanticas = dt_mal.sample(n=min(NUM_MUESTRAS_SEM, len(dt_mal)), random_state=42).index

def try_except_variantes_evasivas(r):

    try:
        prob_sem = 0.30 if r.name in semanticas else 0.0

        return generar_variantes_evasivas(               
                r, 
                nivel="fuerte",
                prob_semantica=prob_sem,                            #Se escoge una probabilidad baja para la semantica para no saturar la CPU con el back_translation
                prob_template=0.70,
                prob_perturbacion=0.90
        )             
    except Exception:
        return r["texto_preprocesado"]
    
dt_test_evasivo.loc[mascara, "texto_evasivo"] = dt_test_evasivo.loc[mascara].apply(try_except_variantes_evasivas, axis=1)

dt_test_evasivo.loc[mascara, ["vector", "texto_preprocesado", "texto_evasivo"]].head(10)

Device set to use cpu
Device set to use cpu
Your input_length: 383 is bigger than 0.9 * max_length: 256. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 248 is bigger than 0.9 * max_length: 256. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


Unnamed: 0,vector,texto_preprocesado,texto_evasivo
1,url,https://pzabenltniu04.z21.web.core.windows.net/,https://pzabenltnniu04.z21.web.core.windows.net/
3,url,http://www.chengxinjiameng.com,http://support.ww.chengxinjiameng.com
8,sms,dear paytm customer your paytm kyc has expired...,"Estimado cliente,\n\nSe trata de un aviso gene..."
9,sms,important information 4 orange user 0796xxxxxx...,Nos ponemos en contacto con usted para informa...
11,email,penls enlargement pllls un 2 u 6,Nos ponemos en contacto con usted para informa...
13,url,https://www.nodeunblocker.net,https://seguro.www.nodeunblocker.net
14,url,https://authsecuritecle.web.app/,https://authsecuritecl.eweb.app/
16,url,https://yahoo-108039.weeblysite.com/,https://academy.yahoo-108039.weeblysiite.com/
19,email,condition heard would curious massive discount...,"Dear customer,\n\nSe trata de un aviso generad..."
21,email,nothing like hot pennystock apwl sixth hello c...,"Hello,\n\nThis is an automatic notification fr..."


In [6]:
#CELDA 5: Evaluacion sobre variantes evasivas

y_test = dt_test["label"].to_numpy()

x_test_base = tfidf.transform(dt_test["texto_preprocesado"].to_list())
x_test_evasivo = tfidf.transform(dt_test_evasivo["texto_evasivo"].to_list())

print("Resultados sin evasion\n")
base_svm = evaluar_modelos("SVM", svm, x_test_base, y_test, exist_prob=True)

base_mlp = evaluar_modelos("MLP", mlp, x_test_base, y_test, exist_prob=True)

base_ens = evaluar_modelos("Ensemble", ens, x_test_base, y_test, exist_prob=True)


print("Resultados con evasion\n")
evasivo_svm = evaluar_modelos("SVM (Evasivo)", svm, x_test_evasivo, y_test, exist_prob=True)

evasivo_mlp = evaluar_modelos("MLP (Evasivo)", mlp, x_test_evasivo, y_test, exist_prob=True)

evasivo_ens = evaluar_modelos("Ensemble (Evasivo)", ens, x_test_evasivo, y_test, exist_prob=True)


Resultados sin evasion


.: SVM :.
Accuracy:   0.9746
Precision:   0.9744
Recall:   0.9740
F1_macro:   0.9742
F1_weighted:   0.9746
ROC_AUC:   0.9934

Report de clasificación SVM:

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1800
           1       0.97      0.97      0.97      1426

    accuracy                           0.97      3226
   macro avg       0.97      0.97      0.97      3226
weighted avg       0.97      0.97      0.97      3226


Matriz de confusion:

[[1762   38]
 [  44 1382]]

.: MLP :.
Accuracy:   0.9740
Precision:   0.9740
Recall:   0.9732
F1_macro:   0.9736
F1_weighted:   0.9740
ROC_AUC:   0.9938

Report de clasificación MLP:

              precision    recall  f1-score   support

           0       0.97      0.98      0.98      1800
           1       0.97      0.97      0.97      1426

    accuracy                           0.97      3226
   macro avg       0.97      0.97      0.97      3226
weighted avg 

In [7]:
#CELDA 6: Tabla comparativa incial sin entrenamiento

comparativa = pd.DataFrame([
    {"Modelo": "SVM", "Escenario": "Base", **base_svm},
    {"Modelo": "SVM", "Escenario": "Evasivo", **evasivo_svm},

    {"Modelo": "MLP", "Escenario": "Base", **base_mlp},
    {"Modelo": "MLP", "Escenario": "Evasivo", **evasivo_mlp},

    {"Modelo": "Ensemble", "Escenario": "Base", **base_ens},
    {"Modelo": "Ensemble", "Escenario": "Evasivo", **evasivo_ens}
])

cols_orden = ["Modelo", "Escenario", "accuracy", "precision", "recall", "f1_macro", "f1_weighted", "roc_auc"]
comparativa = comparativa.reindex(columns=[c for c in cols_orden if c in comparativa.columns])

cols_numericas = ["accuracy", "precision", "recall", "f1_macro", "f1_weighted", "roc_auc"]
for c in cols_numericas:
    if c in comparativa.columns:
        comparativa[c] = pd.to_numeric(comparativa[c], errors="coerce")         #Se fuerza a numerico las columnas y los que no se pueda a valores NaN

comparativa.style.format({c: "{:.4f}" for  c in cols_numericas if c in comparativa.columns}, na_rep="-")

Unnamed: 0,Modelo,Escenario,accuracy,precision,recall,f1_macro,f1_weighted,roc_auc
0,SVM,Base,0.9746,0.9744,0.974,0.9742,0.9746,0.9934
1,SVM,Evasivo,0.9619,0.9632,0.9596,0.9612,0.9618,0.9944
2,MLP,Base,0.974,0.974,0.9732,0.9736,0.974,0.9938
3,MLP,Evasivo,0.9631,0.9644,0.961,0.9625,0.963,0.9902
4,Ensemble,Base,0.9777,0.9775,0.9772,0.9774,0.9777,0.9942
5,Ensemble,Evasivo,0.9631,0.9647,0.9608,0.9625,0.963,0.9949


In [8]:
#CELDA 7: Guardar resultados comparativa

OUT_PATH = MODELS_DIR / "metricas_comparativas_iniciales_v1.csv"
comparativa.to_csv(OUT_PATH, index=False)