<a href="https://colab.research.google.com/github/OswaldGutierrez/Modelos-IA/blob/main/SVM2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SUPPORT VECTOR MACHINE - SVM LINEAL

<font size="3">**1. IMPORTACIÓN DE LIBRERÍAS NECESARIAS**</font>

In [1]:
import numpy as np
import pandas as pd
import itertools
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

<font size="3">**2. CARGA DEL DATASET**</font>

In [2]:
!wget https://raw.githubusercontent.com/xiomara-udea/NATICUSdroid/main/data.csv
import pandas as pd
df = pd.read_csv("data.csv")

# Separar características (X) y etiqueta (y)
X = df.drop('Result', axis=1)
y = df['Result']

print(f"  - Características: {X.shape[1]}")
print(f"  - Muestras: {len(y)}")


--2025-11-24 13:42:08--  https://raw.githubusercontent.com/xiomara-udea/NATICUSdroid/main/data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5136567 (4.9M) [text/plain]
Saving to: ‘data.csv.2’


2025-11-24 13:42:08 (58.7 MB/s) - ‘data.csv.2’ saved [5136567/5136567]

  - Características: 86
  - Muestras: 29332


 <font size="3">**3. DIVISIÓN TRAIN - TEST (70/30)**</font>

In [3]:
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=RANDOM_STATE
)
print("Train shape:", X_train_full.shape, "Test shape:", X_test.shape)

Train shape: (20532, 86) Test shape: (8800, 86)


 <font size="3">**4. DEFINICIÓN DE HIPERPARÁMETROS**</font>

In [4]:
Cs = [0.01, 0.1, 1, 10, 100]
kf = StratifiedKFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE)
max_iter = 5000
tol_margin = 1e-8

results = []
start_time = time.time()

# función bootstrap para IC95
def bootstrap_ci(y_true, y_pred, metric_fn, B=1000, seed=RANDOM_STATE):
    rng = np.random.default_rng(seed)
    n = len(y_true)
    stats = []
    for _ in range(B):
        idxs = rng.integers(0, n, n)
        stats.append(metric_fn(y_true[idxs], y_pred[idxs]))
    return np.percentile(stats, [2.5, 97.5])


 <font size="3">**5. CROSS VALIDATION - GRID SEARCH**</font>

In [5]:
print("\nEjecutando CV grid search LinearSVC...\n")
for C in Cs:
    accs, precs, recs, f1s, aucs = [], [], [], [], []
    pct_svs = []
    oof_y, oof_pred, oof_dec = [], [], []

    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X_train_full, y_train_full)):

        Xtr = X_train_full.iloc[train_idx]
        Xval = X_train_full.iloc[val_idx]
        ytr = y_train_full.iloc[train_idx]
        yval = y_train_full.iloc[val_idx]

        # Escalado
        scaler = StandardScaler().fit(Xtr)
        Xtr_s = scaler.transform(Xtr)
        Xval_s = scaler.transform(Xval)

        # Entrenar modelo
        lsvc = LinearSVC(
            C=C, loss='squared_hinge', penalty='l2',
            max_iter=max_iter, dual=False, random_state=RANDOM_STATE
        )
        lsvc.fit(Xtr_s, ytr)

        # Predicciones
        yval_pred = lsvc.predict(Xval_s)
        try:
            yval_dec = lsvc.decision_function(Xval_s)
        except:
            yval_dec = None

        # Métricas
        accs.append(accuracy_score(yval, yval_pred))
        precs.append(precision_score(yval, yval_pred, zero_division=0))
        recs.append(recall_score(yval, yval_pred, zero_division=0))
        f1s.append(f1_score(yval, yval_pred, zero_division=0))

        if yval_dec is not None:
            try:
                aucs.append(roc_auc_score(yval, yval_dec))
            except:
                aucs.append(np.nan)
        else:
            aucs.append(np.nan)

        # % vectores de soporte aprox
        try:
            train_dec = lsvc.decision_function(Xtr_s)
            margins = ytr * train_dec
            pct_sv = np.mean(margins <= (1.0 + tol_margin)) * 100
        except:
            pct_sv = np.nan
        pct_svs.append(pct_sv)

        # Out-of-fold
        oof_y.append(yval)
        oof_pred.append(yval_pred)
        oof_dec.append(
            yval_dec if yval_dec is not None else np.zeros_like(yval_pred)
        )

    # Concatenar OOF
    oof_y_all = np.concatenate(oof_y)
    oof_pred_all = np.concatenate(oof_pred)
    oof_dec_all = np.concatenate(oof_dec)

    # Bootstrap CI
    try:
        acc_ci = bootstrap_ci(
            oof_y_all, oof_pred_all,
            lambda a,b: accuracy_score(a,b), B=1000
        )
        f1_ci = bootstrap_ci(
            oof_y_all, oof_pred_all,
            lambda a,b: f1_score(a,b, zero_division=0), B=1000
        )
    except:
        acc_ci = (np.nan, np.nan)
        f1_ci = (np.nan, np.nan)

    # Guardar resultados
    results.append({
        "C": C,
        "acc_val_mean": np.mean(accs),
        "acc_val_std": np.std(accs, ddof=1),
        "acc_val_ci95_low": acc_ci[0],
        "acc_val_ci95_high": acc_ci[1],
        "prec_val_mean": np.mean(precs),
        "prec_val_std": np.std(precs, ddof=1),
        "rec_val_mean": np.mean(recs),
        "rec_val_std": np.std(recs, ddof=1),
        "f1_val_mean": np.mean(f1s),
        "f1_val_std": np.std(f1s, ddof=1),
        "f1_val_ci95_low": f1_ci[0],
        "f1_val_ci95_high": f1_ci[1],
        "auc_val_mean": np.nanmean(aucs),
        "%_sv_mean": np.nanmean(pct_svs)
    })

end_time = time.time()
print(f"\nGrid finished in {end_time - start_time:.1f} s")

results_df = pd.DataFrame(results).sort_values("C")
display(results_df)




Ejecutando CV grid search LinearSVC...


Grid finished in 42.9 s


Unnamed: 0,C,acc_val_mean,acc_val_std,acc_val_ci95_low,acc_val_ci95_high,prec_val_mean,prec_val_std,rec_val_mean,rec_val_std,f1_val_mean,f1_val_std,f1_val_ci95_low,f1_val_ci95_high,auc_val_mean,%_sv_mean
0,0.01,0.959965,0.000934,0.957285,0.962595,0.954849,0.002422,0.965792,0.002101,0.960286,0.000909,0.957458,0.962931,0.988843,65.39548
1,0.1,0.960111,0.000998,0.957528,0.962741,0.955387,0.002357,0.965501,0.002316,0.960414,0.000988,0.957688,0.963063,0.988686,64.702253
2,1.0,0.960062,0.001067,0.957432,0.962692,0.955295,0.002388,0.965501,0.002316,0.960367,0.001056,0.95765,0.963017,0.988635,64.632444
3,10.0,0.960014,0.00106,0.957286,0.962644,0.955204,0.002527,0.965501,0.002316,0.960321,0.001041,0.95755,0.962925,0.988616,64.629197
4,100.0,0.960062,0.001067,0.957432,0.962692,0.955295,0.002388,0.965501,0.002316,0.960367,0.001056,0.95765,0.963017,0.988615,64.629197


<font size="3">**6. SELECCIÓN DEL MEJOR MODELO**</font>

In [None]:
best_row = results_df.loc[results_df["f1_val_mean"].idxmax()]
print("\nMejor configuración (por F1 CV):")
print(best_row)

# === Predicción final en test (la variable que faltaba) ===
y_test_pred = best_svm.predict(X_test_s)

try:
    y_test_dec = best_svm.decision_function(X_test_s)
except:
    y_test_dec = None


<font size="3">**7. ENTRENAMIENTO FINAL CON EL MEJOR HIPERPARÁMETRO**</font>

In [None]:
# Convertir Series a numpy antes del bootstrap
y_test_np = np.asarray(y_test)
y_test_pred_np = np.asarray(y_test_pred)

print("\n=== Test final results ===")
print(f"Best C: {best_C}")
print(f"Accuracy: {acc_test:.4f}")
print(f"Precision: {prec_test:.4f}")
print(f"Recall: {rec_test:.4f}")
print(f"F1: {f1_test:.4f}")
print(f"AUC (decision_function): {auc_test:.4f}")
print(f"% vectores soporte (approx, train): {pct_sv_final:.2f}%")

# IC95 on test via bootstrap
def bootstrap_metric_pairs(y_true, y_pred, metric_fn, B=1000, seed=RANDOM_STATE):
    rng = np.random.default_rng(seed)
    n = len(y_true)
    stats = []
    for _ in range(B):
        idxs = rng.integers(0, n, n)  # random positions
        stats.append(metric_fn(y_true[idxs], y_pred[idxs]))
    return np.percentile(stats, [2.5, 97.5])

# AHORA usando las versiones numpy
acc_ci_test = bootstrap_metric_pairs(y_test_np, y_test_pred_np,
                                    lambda a,b: accuracy_score(a,b))
f1_ci_test = bootstrap_metric_pairs(y_test_np, y_test_pred_np,
                                   lambda a,b: f1_score(a,b, zero_division=0))

print("IC95 Accuracy (test):", acc_ci_test)
print("IC95 F1 (test):", f1_ci_test)


<font size="3">**8. VISUALIZACIÓN DE RESULTADOS**</font>

In [None]:
# F1 vs C
plt.figure(figsize=(8,5))
plt.plot(results_df["C"], results_df["f1_val_mean"], marker='o')
plt.xscale('log')
plt.xlabel("C (escala log)")
plt.ylabel("F1 (validación, promedio)")
plt.title("Efecto de C sobre F1 (LinearSVC) — CV 4-fold")
plt.grid(True)
plt.show()
# Caption: "Efecto de C sobre la métrica F1 en validación (media por fold). Eje x: C (escala log). Eje y: F1 promedio."

# % support vs C
plt.figure(figsize=(8,5))
plt.plot(results_df["C"], results_df["%_sv_mean"], marker='o')
plt.xscale('log')
plt.xlabel("C (escala log)")
plt.ylabel("% vectores de soporte (aprox.)")
plt.title("% vectores de soporte aproximado vs C (LinearSVC)")
plt.grid(True)
plt.show()
# Caption: "% vectores de soporte (aprox.) en función de C; eje x: C en escala log; eje y: % de muestras en margen."

# Heatmap F1 vs C (simple)
plt.figure(figsize=(6,4))
sns.heatmap(results_df[["C","f1_val_mean"]].set_index("C").T, annot=True, fmt=".3f", cmap="viridis")
plt.title("F1 promedio por valor de C (LinearSVC)")
plt.xlabel("C")
plt.show()
# Caption: "Tabla F1 promedio por valor de C (CV). Eje x: C; valores anotados: F1 promedio."