In [13]:
#CELDA 1: Configuraciones básicas

import json
import pandas as pd
import numpy as np
import sys
import joblib

from pathlib import Path

from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline

from datetime import datetime

%run 00_setup.ipynb

#RUTAS
from src.config.rutas import (
    OUTPUT_DIR,
    RUTA_DATASET,
    RANDOM_STATE
)

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)

#Se importan las funciones y rutas creadas en los modulos .py de /feat_aux
from src.feat_aux import (
    preprocesar_dataset,
    dividir_train_test_estratificado,
    construir_tfidf,
    ajustar_y_vectorizar,
    calc_dispersion,
    guardar_meta
)

VERSION = "eva_hard_voting"

In [14]:
#CELDA 2: Carga del dataset

dt = pd.read_csv(RUTA_DATASET)

print("Num. de columnas y filas:")
print(dt.shape)
dt.head()

dt_preprocesado = preprocesar_dataset(dt)       #Se aplica internamente la funcion de limpiar
print("Forma tras preprocesar:", dt_preprocesado.shape)
dt_preprocesado.head()

Num. de columnas y filas:
(16127, 5)
Forma tras preprocesar: (16126, 6)


Unnamed: 0,texto,label,vector,fuente,id,texto_preprocesado
0,Moby Pub Quiz.Win a £100 High Street prize if ...,1,sms,mendeley,0,moby pub quiz.win a 100 high street prize if u...
1,https://www.documenters.org,0,url,phiusiil,1,https://www.documenters.org
2,scan system adware wrongful 6130320948821283 c...,1,email,kaggle_email,2,scan system adware wrongful 6130320948821283 c...
3,mit student thesis writeups hi vince krishna t...,0,email,kaggle_email,3,mit student thesis writeups hi vince krishna t...
4,https://swn9klhal.web.app/,1,url,phiusiil,4,https://swn9klhal.web.app/


In [15]:
#CELDA 3: Division estratificada train/test

dt_train, dt_test = dividir_train_test_estratificado(dt_preprocesado)

print("Train:", dt_train.shape)
print("Test:", dt_test.shape)

print("\nDistribucion por clases (train):")
print(dt_train["label"].value_counts(), "\n")
print(dt_train["label"].value_counts(normalize=True))

print("\nDistribucion por clases (test):")
print(dt_test["label"].value_counts(), "\n")
print(dt_test["label"].value_counts(normalize=True))


print("\nDistribucion por vector (train):")
print(dt_train["vector"].value_counts(), "\n")

print("\nDistribucion por vector (test):")
print(dt_test["vector"].value_counts(), "\n")


Train: (12900, 6)
Test: (3226, 6)

Distribucion por clases (train):
label
0    7200
1    5700
Name: count, dtype: int64 

label
0    0.55814
1    0.44186
Name: proportion, dtype: float64

Distribucion por clases (test):
label
0    1800
1    1426
Name: count, dtype: int64 

label
0    0.557967
1    0.442033
Name: proportion, dtype: float64

Distribucion por vector (train):
vector
url      4800
email    4799
sms      3301
Name: count, dtype: int64 


Distribucion por vector (test):
vector
email    1200
url      1200
sms       826
Name: count, dtype: int64 



In [16]:
#CELDA 4: Contruccion de TF_IDF y posterior ajuste

tfidf = construir_tfidf(
    ngram_range=(1, 2),
    min_dt=5,
    max_dt=0.9,
    max_features=50000
)

x_train, x_test, y_train, y_test = ajustar_y_vectorizar(tfidf, dt_train, dt_test)

print("x_train:", x_train.shape)
print("x_test:", x_test.shape)
print("Etiquetas unicas:", np.unique(y_train))

guardar_meta(tfidf, dt_train, dt_test, OUTPUT_DIR, version=VERSION)

x_train: (12900, 22675)
x_test: (3226, 22675)
Etiquetas unicas: [0 1]

Guardando metadatos en: C:\TFG\models\preprocesamiento_metadata_eva_hard_voting.json
Artefactos guardados
Vectorizador: C:\TFG\models\tfidf_vect_eva_hard_voting.joblib
Train: C:\TFG\models\train_eva_hard_voting.csv
Test: C:\TFG\models\test_eva_hard_voting.csv
Meta: C:\TFG\models\preprocesamiento_metadata_eva_hard_voting.json


In [17]:
#CELDA 5: Dispersion de las matrices

disp_train = calc_dispersion(x_train)
disp_test = calc_dispersion(x_test)

print(f"Dispersion del train: {disp_train:.4f}")
print(f"Dispersion del test: {disp_test:.4f}")

Dispersion del train: 0.9980
Dispersion del test: 0.9981


In [18]:
#CELDA 6: Entrenamiento del modelo SVM:

svm = LinearSVC(
    class_weight="balanced",         #considera el desbalance que hay y penaliza los errores en las clases minoritarias
    random_state=RANDOM_STATE
)

#Se entrena el modelo
svm.fit(x_train, y_train)

#Se genera la prediccion
y_pred_xtest_svm = svm.predict(x_test)

#Metricas
accuracy_svm = accuracy_score(y_test, y_pred_xtest_svm)
precision_svm = precision_score(y_test, y_pred_xtest_svm, average="macro", zero_division=0)
recall_svm = recall_score(y_test, y_pred_xtest_svm, average="macro", zero_division=0)
f1_mac_svm = f1_score(y_test, y_pred_xtest_svm, average="macro", zero_division=0)       #Calcula f1 equitativamente cada clase
f1_wei_svm = f1_score(y_test, y_pred_xtest_svm, average="weighted", zero_division=0)    #Calcula f1 segun la frecuencia real

print(".: SVM :.")
print(f"Accuracy: {accuracy_svm:.4f}")
print(f"Precison: {precision_svm:.4f}")
print(f"Recall: {recall_svm:.4f}")
print(f"F1-macro: {f1_mac_svm:.4f}")
print(f"F1-weighted: {f1_wei_svm:.4f}")

#Se muestra un reporte conjunto
print("\nReport de clasificación SVM:\n")
print(classification_report(y_test, y_pred_xtest_svm, zero_division=0))

#Muestra una tabla de predicciones que fueron segun TP, TN, FP, FN
print("\nMatriz de confusion:\n")
print(confusion_matrix(y_test, y_pred_xtest_svm))


.: SVM :.
Accuracy: 0.9768
Precison: 0.9768
Recall: 0.9760
F1-macro: 0.9764
F1-weighted: 0.9767

Report de clasificación SVM:

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1800
           1       0.98      0.97      0.97      1426

    accuracy                           0.98      3226
   macro avg       0.98      0.98      0.98      3226
weighted avg       0.98      0.98      0.98      3226


Matriz de confusion:

[[1768   32]
 [  43 1383]]


In [19]:
#CELDA 7: Entrenamiento del modelo MLP

#Se reduce la dimensionalidad sobre TF-IDF para no saturar el sistema al depender de mucha RAM. De esta manera es reproducible
svd = TruncatedSVD(n_components=300, random_state=RANDOM_STATE)

#Se transforma la matrix dispersa en completa
x_train_reducido = svd.fit_transform(x_train)
x_test_reducido = svd.transform(x_test)

mlp = MLPClassifier(
    hidden_layer_sizes=(128, 64),       #Define la estructura de la red neuronal siendo densamente conectada con 128 neuronas en la primera capa y 64 neuronas en la segunda
    activation="relu",                  #Activacion estandar para la mayoria de los modelos
    solver="adam",                      #Se pasa el optimizador mas recomendado para NLP
    random_state=RANDOM_STATE,
    max_iter=40,                        #Se define un maximo de 40 iteraciones para entrenar la red
    early_stopping=True,                #Activa la parada temprana
    n_iter_no_change=3,                 #Si en 3 iteraciones el modelo no mejora la perdida de validacion, se detiene automaticamente.
    verbose=True                        #Imprime la informacion durante el entrenamiento para visualizar resultados de las iteraciones
)

#Se entrena el modelo
mlp.fit(x_train_reducido, y_train)

#Se genera la prediccion
y_pred_xtest_mlp = mlp.predict(x_test_reducido)
y_prob_xtest_mlp = mlp.predict_proba(x_test_reducido) [:, 1]

#Metricas
accuracy_mlp = accuracy_score(y_test, y_pred_xtest_mlp)
precision_mlp = precision_score(y_test, y_pred_xtest_mlp, average="macro", zero_division=0)
recall_mlp = recall_score(y_test, y_pred_xtest_mlp, average="macro", zero_division=0)
f1_mac_mlp = f1_score(y_test, y_pred_xtest_mlp, average="macro", zero_division=0)       #Calcula f1 equitativamente cada clase
f1_wei_mlp = f1_score(y_test, y_pred_xtest_mlp, average="weighted", zero_division=0)    #Calcula f1 segun la frecuanecia real
auc_mlp = roc_auc_score(y_test, y_prob_xtest_mlp)

print("\n.: MLP :.")
print(f"Accuracy: {accuracy_mlp:.4f}")
print(f"Precison: {precision_mlp:.4f}")
print(f"Recall: {recall_mlp:.4f}")
print(f"F1-macro: {f1_mac_mlp:.4f}")
print(f"F1-weighted: {f1_wei_mlp:.4f}")
print(f"AUC-ROC: {auc_mlp:.4f}")

print("\nReport de clasificación MLP:\n")
print(classification_report(y_test, y_pred_xtest_mlp, zero_division=0))

print("\nMatriz de confusion:\n")
print(confusion_matrix(y_test, y_pred_xtest_mlp))


Iteration 1, loss = 0.58363387
Validation score: 0.947287
Iteration 2, loss = 0.20899391
Validation score: 0.965116
Iteration 3, loss = 0.10955170
Validation score: 0.965891
Iteration 4, loss = 0.09265796
Validation score: 0.966667
Iteration 5, loss = 0.08412127
Validation score: 0.967442
Iteration 6, loss = 0.07818920
Validation score: 0.965891
Iteration 7, loss = 0.07382743
Validation score: 0.968992
Iteration 8, loss = 0.06893742
Validation score: 0.968992
Iteration 9, loss = 0.06423078
Validation score: 0.964341
Iteration 10, loss = 0.05923172
Validation score: 0.968217
Iteration 11, loss = 0.05521229
Validation score: 0.970543
Iteration 12, loss = 0.05058528
Validation score: 0.968992
Iteration 13, loss = 0.04644050
Validation score: 0.971318
Iteration 14, loss = 0.04300472
Validation score: 0.971318
Iteration 15, loss = 0.03843515
Validation score: 0.968992
Iteration 16, loss = 0.03538009
Validation score: 0.971318
Iteration 17, loss = 0.03190343
Validation score: 0.972093
Iterat

In [20]:
#CELDA 8: Hard voting

#Define una pipeline que encadena varios pasos, en este caso la reduccion de la dimensionalidad y el modelo MLP
mlp_pipeline = Pipeline([
    ("svd", TruncatedSVD(n_components=300, random_state=RANDOM_STATE)),
    ("mlp", MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation="relu",
    solver="adam",
    random_state=RANDOM_STATE,
    max_iter=40,
    early_stopping=True,
    n_iter_no_change=3,
    verbose=True
    ))
])

#Se crea el modelo en ensamblado que combina ambos modelos (SVM y MLP)
ens = VotingClassifier(
    estimators=[
        ("svm", svm),
        ("mlp", mlp_pipeline)
    ],
    voting="hard"
)

ens.fit(x_train, y_train)

y_pred_xtest_ens = ens.predict(x_test)

#Metricas
accuracy_ens = accuracy_score(y_test, y_pred_xtest_ens)
precision_ens = precision_score(y_test, y_pred_xtest_ens, average="macro", zero_division=0)
recall_ens = recall_score(y_test, y_pred_xtest_ens, average="macro", zero_division=0)
f1_mac_ens = f1_score(y_test, y_pred_xtest_ens, average="macro", zero_division=0)       #Calcula f1 equitativamente cada clase
f1_wei_ens = f1_score(y_test, y_pred_xtest_ens, average="weighted", zero_division=0)    #Calcula f1 segun la frecuanecia real

print("\n.: HARD VOTING :.")
print(f"Accuracy: {accuracy_ens:.4f}")
print(f"Precison: {precision_ens:.4f}")
print(f"Recall: {recall_ens:.4f}")
print(f"F1-macro: {f1_mac_ens:.4f}")
print(f"F1-weighted: {f1_wei_ens:.4f}")

print("\nReport de clasificación HARD VOTING:\n")
print(classification_report(y_test, y_pred_xtest_ens, zero_division=0))

print("\nMatriz de confusion:\n")
print(confusion_matrix(y_test, y_pred_xtest_ens))



Iteration 1, loss = 0.58363387
Validation score: 0.947287
Iteration 2, loss = 0.20899391
Validation score: 0.965116
Iteration 3, loss = 0.10955170
Validation score: 0.965891
Iteration 4, loss = 0.09265796
Validation score: 0.966667
Iteration 5, loss = 0.08412127
Validation score: 0.967442
Iteration 6, loss = 0.07818920
Validation score: 0.965891
Iteration 7, loss = 0.07382743
Validation score: 0.968992
Iteration 8, loss = 0.06893742
Validation score: 0.968992
Iteration 9, loss = 0.06423078
Validation score: 0.964341
Iteration 10, loss = 0.05923172
Validation score: 0.968217
Iteration 11, loss = 0.05521229
Validation score: 0.970543
Iteration 12, loss = 0.05058528
Validation score: 0.968992
Iteration 13, loss = 0.04644050
Validation score: 0.971318
Iteration 14, loss = 0.04300472
Validation score: 0.971318
Iteration 15, loss = 0.03843515
Validation score: 0.968992
Iteration 16, loss = 0.03538009
Validation score: 0.971318
Iteration 17, loss = 0.03190343
Validation score: 0.972093
Iterat

In [21]:
#CELDA 9: Comparativa SVM vs MLP vs Ensemble

comparacion_modelos = pd.DataFrame({
    "Modelo": ["SVM", "MLP", "Ensemble (Hard Voting)"],
    "Accuracy": [float(accuracy_svm), float(accuracy_mlp), float(accuracy_ens)],
    "Precision": [float(precision_svm), float(precision_mlp), float(precision_ens)],
    "Recall": [float(recall_svm), float(recall_mlp), float(recall_ens)],
    "F1-macro": [float(f1_mac_svm), float(f1_mac_mlp), float(f1_mac_ens)],
    "F1-weighted": [float(f1_wei_svm), float(f1_wei_mlp), float(f1_wei_ens)],
})

cols_metricas = ["Accuracy", "Precision", "Recall", "F1-macro", "F1-weighted"]

comparacion_modelos.style.format(
    {col: "{:.4f}" for col in cols_metricas}
)

Unnamed: 0,Modelo,Accuracy,Precision,Recall,F1-macro,F1-weighted
0,SVM,0.9768,0.9768,0.976,0.9764,0.9767
1,MLP,0.974,0.974,0.9732,0.9736,0.974
2,Ensemble (Hard Voting),0.9749,0.9761,0.9731,0.9745,0.9749


In [22]:
#CELDA 10: Guardado de los modelos finales

svm_path = OUTPUT_DIR / f"modelo_svm_{VERSION}.joblib"
mlp_path = OUTPUT_DIR / f"modelo_mlp_{VERSION}.joblib"
ens_path = OUTPUT_DIR / f"modelo_ens_{VERSION}.joblib"

joblib.dump(svm, svm_path)
joblib.dump(mlp, mlp_path)
joblib.dump(ens, ens_path)

print("Modelos guardados:")
print("SVM:", svm_path)
print("MLP:", svm_path)
print("Ensamble:", ens_path)

Modelos guardados:
SVM: C:\TFG\models\modelo_svm_eva_hard_voting.joblib
MLP: C:\TFG\models\modelo_svm_eva_hard_voting.joblib
Ensamble: C:\TFG\models\modelo_ens_eva_hard_voting.joblib


In [23]:
#CELDA 11: Guardar metricas en JSON

metricas = {
        "version": VERSION,
        "generated_at": datetime.now().isoformat(timespec="seconds"),
        "random_state": RANDOM_STATE,
        "modelos": {
            "SVM": {
                "accuracy": float(accuracy_svm),
                "precision": float(precision_svm),
                "recall": float(recall_svm),
                "F1-macro": float(f1_mac_svm),
                "F1-weighted": float(f1_wei_svm),
                "model_path": str(svm_path)
            },
            "MLP": {
                "accuracy": float(accuracy_mlp),
                "precision": float(precision_mlp),
                "recall": float(recall_mlp),
                "F1-macro": float(f1_mac_mlp),
                "F1-weighted": float(f1_wei_mlp),
                "model_path": str(mlp_path)
            },
            "Ensamble": {
                "accuracy": float(accuracy_ens),
                "precision": float(precision_ens),
                "recall": float(recall_ens),
                "F1-macro": float(f1_mac_ens),
                "F1-weighted": float(f1_wei_ens),
                "model_path": str(ens_path)
            }
        },
        "dispersion": {
            "train": float(disp_train),
            "test": float(disp_test)
        }
}

#Guarda el archivo JSON con todas las metricas al abrirlo en modo escritura "w"
metricas_path = OUTPUT_DIR / f"metricas_modelos_{VERSION}.json"
with open(metricas_path, "w", encoding="utf-8") as f:
    json.dump(metricas, f, indent=4, ensure_ascii=False)

print("Metricas guardadas en:", metricas_path)

Metricas guardadas en: C:\TFG\models\metricas_modelos_eva_hard_voting.json
