In [2]:
# One-time setup: run "%pip install xgboost" in a cell (then restart the kernel) if xgboost is not installed.

Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.1-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m724.0 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.1
Note: you may need to restart the kernel to use updated packages.


In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import LabelEncoder

In [18]:
# Download the OpenML "adult" dataset (version 1) as a feature frame X and a target y.
X, y = fetch_openml(name="adult", version=1, return_X_y=True, as_frame=True)

# Group the feature columns by dtype: numeric vs. categorical/object.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['category', 'object']).columns

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Encode the target labels as integers: learn the mapping from the training
# labels only, then apply that same mapping to both splits.
label_encoder = LabelEncoder().fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

In [19]:
# Numeric columns: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: replace missing values with the constant 'missing',
# then one-hot encode. handle_unknown='ignore' is passed so categories not seen
# during fit do not raise at transform time; sparse_output=False requests a
# dense array (presumably because some estimators below need dense input --
# TODO confirm).
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])

# One entry per model to compare. The original dict literal listed "AdaBoost"
# twice; the second entry silently overwrote the first, so it is listed once.
classifiers = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=100, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=3),
    "Support Vector Machine": SVC(kernel='rbf', random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
}

# Wrap every classifier in a pipeline: preprocessing first, then the estimator.
# NOTE(review): all pipelines reference the same `preprocessor` object, so each
# model.fit() refits that shared instance -- confirm this is acceptable or give
# each pipeline its own copy.
models = {
    name: Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ])
    for name, classifier in classifiers.items()
}

In [20]:
# Fit every pipeline on the training split, evaluate it on the test split,
# and record its accuracy for the summary printed at the end.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Compute accuracy once and store it; previously `results` was never
    # filled, so the "Overall Results" section printed only its header.
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy

    print(f"\n{name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))


# Print overall results: one line per model, in the order they were evaluated.
print("\nOverall Results:")
for name, accuracy in results.items():
    print(f"{name}: {accuracy:.4f}")


Random Forest Results:
Accuracy: 0.8282
              precision    recall  f1-score   support

           0       0.87      0.90      0.89     11109
           1       0.66      0.59      0.62      3544

    accuracy                           0.83     14653
   macro avg       0.77      0.75      0.76     14653
weighted avg       0.82      0.83      0.82     14653


Naive Bayes Results:
Accuracy: 0.7122
              precision    recall  f1-score   support

           0       0.96      0.65      0.77     11109
           1       0.45      0.91      0.60      3544

    accuracy                           0.71     14653
   macro avg       0.70      0.78      0.69     14653
weighted avg       0.84      0.71      0.73     14653


Decision Tree Results:
Accuracy: 0.7982
              precision    recall  f1-score   support

           0       0.87      0.86      0.87     11109
           1       0.58      0.60      0.59      3544

    accuracy                           0.80     14653
   macr




AdaBoost Results:
Accuracy: 0.8540
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     11109
           1       0.75      0.60      0.66      3544

    accuracy                           0.85     14653
   macro avg       0.81      0.77      0.79     14653
weighted avg       0.85      0.85      0.85     14653


K-Nearest Neighbors Results:
Accuracy: 0.8144
              precision    recall  f1-score   support

           0       0.86      0.90      0.88     11109
           1       0.63      0.56      0.59      3544

    accuracy                           0.81     14653
   macro avg       0.75      0.73      0.74     14653
weighted avg       0.81      0.81      0.81     14653


Support Vector Machine Results:
Accuracy: 0.8547
              precision    recall  f1-score   support

           0       0.87      0.95      0.91     11109
           1       0.77      0.57      0.65      3544

    accuracy                           0.85     1

Usar un dataset como MNIST o uno de análisis de sentimiento (Sentiment Analysis).
Deben:
1. Implementar un modelo base con los hiperparámetros por defecto.
2. Aplicar técnicas de búsqueda de hiperparámetros, como Grid Search.
3. Comparar el rendimiento y el tiempo de ejecución de cada método.
4. Discutir los trade-offs entre el tiempo de computación y la mejora del rendimiento.