In [1]:
# Importar librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    cohen_kappa_score
)

# Cargar datos desde UCI
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'
]

df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    names=column_names, na_values=' ?', sep=',\s', engine='python'
)

# Eliminar filas con datos faltantes
df = df.dropna()

# Codificar variable objetivo
df['income'] = LabelEncoder().fit_transform(df['income'])

# Separar features y target
X = df.drop("income", axis=1)
y = df["income"]

# Codificar variables categóricas
cat_cols = X.select_dtypes(include=['object']).columns
for col in cat_cols:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

# Escalar variables numéricas
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
X[num_cols] = StandardScaler().fit_transform(X[num_cols])

# Dividir dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [2]:
# Modelo 1: Random Forest con los mejores parámetros
rf_model = RandomForestClassifier(n_estimators=200, max_depth=20)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

# Modelo 2: SVM con los mejores parámetros
svm_model = SVC(C=10, kernel='rbf')
svm_model.fit(X_train, y_train)
svm_preds = svm_model.predict(X_test)


In [3]:
# Accuracy de cada modelo
acc_rf = accuracy_score(y_test, rf_preds)
acc_svm = accuracy_score(y_test, svm_preds)

print(f"Accuracy Random Forest: {acc_rf:.4f}")
print(f"Accuracy SVM:           {acc_svm:.4f}")

# Reporte de clasificación
print("\n--- Random Forest ---")
print(classification_report(y_test, rf_preds))

print("\n--- SVM ---")
print(classification_report(y_test, svm_preds))


Accuracy Random Forest: 0.8676
Accuracy SVM:           0.8509

--- Random Forest ---
              precision    recall  f1-score   support

           0       0.89      0.94      0.92      4942
           1       0.77      0.64      0.70      1571

    accuracy                           0.87      6513
   macro avg       0.83      0.79      0.81      6513
weighted avg       0.86      0.87      0.86      6513


--- SVM ---
              precision    recall  f1-score   support

           0       0.87      0.94      0.91      4942
           1       0.76      0.57      0.65      1571

    accuracy                           0.85      6513
   macro avg       0.81      0.75      0.78      6513
weighted avg       0.84      0.85      0.84      6513



In [4]:
# Comparación entre las predicciones
kappa = cohen_kappa_score(rf_preds, svm_preds)
print(f"\nCohen's Kappa entre modelos: {kappa:.4f}")




Cohen's Kappa entre modelos: 0.7996
