In [37]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score

In [38]:
# Load the training and test CSV files (paths are relative to the notebook).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)






In [39]:
# Quick look at the data: dimensions of both frames, then the first training rows.
for label, frame in (("Train dataset shape:", train), ("Test dataset shape:", test)):
    print(label, frame.shape)
print(train.head())



Train dataset shape: (237, 14)
Test dataset shape: (60, 14)
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  39.0  0.0  3.0     138.0  220.0  0.0      0.0    152.0    0.0      0.0   
1  60.0  0.0  1.0     150.0  240.0  0.0      0.0    171.0    0.0      0.9   
2  69.0  0.0  1.0     140.0  239.0  0.0      0.0    151.0    0.0      1.8   
3  58.0  1.0  2.0     120.0  284.0  0.0      2.0    160.0    0.0      1.8   
4  47.0  1.0  3.0     130.0  253.0  0.0      0.0    179.0    0.0      0.0   

   slope   ca  thal  target  
0    2.0  0.0   3.0       0  
1    1.0  0.0   3.0       0  
2    1.0  2.0   3.0       0  
3    2.0  0.0   3.0       1  
4    1.0  0.0   3.0       0  


In [40]:
# List the column names so the exact spelling of the target column can be checked.
dataset_columns = train.columns
print("Columnas del dataset:", dataset_columns)



Columnas del dataset: Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')


In [41]:
# Name of the column to predict; fail fast with a clear error if the
# training frame does not contain it.
target_column = 'target'
target_is_present = target_column in train.columns
if not target_is_present:
    raise ValueError(f"La columna objetivo '{target_column}' no está en el dataset. Verifica el nombre correcto.")



In [42]:
# Partition the training columns by dtype: numeric columns vs. everything else.
# The target column is still part of `train` here, so it appears in one of the lists.
numeric_dtypes = [np.number]
numeric_features = list(train.select_dtypes(include=numeric_dtypes).columns)
categorical_features = list(train.select_dtypes(exclude=numeric_dtypes).columns)



In [43]:
# Drop the target from BOTH feature lists. The lists were derived from the
# full training frame, so a non-numeric target would end up in
# categorical_features; leaving it there would make the ColumnTransformer ask
# for a column that the feature matrix (train without the target) lacks.
# Rebuilding the lists is a no-op when the target is absent, so this cell is
# safe to re-run.
numeric_features = [col for col in numeric_features if col != target_column]
categorical_features = [col for col in categorical_features if col != target_column]



In [44]:
# Numeric columns: fill missing values with the column median, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Non-numeric columns: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' keeps categories that were not seen
# during fit from raising at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)



In [45]:
# Route each feature group through its own transformer pipeline.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(column_routes)



In [46]:
# End-to-end estimator: preprocessing followed by a random forest regressor
# (100 trees, fixed seed for repeatable results).
# NOTE(review): the sample rows shown by train.head() have target values 0/1 —
# confirm that a regressor, rather than a classifier, is what is intended.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', forest),
])



In [47]:
# Split the training frame into the feature matrix and the target vector, then
# restrict the test frame to the same feature columns, in the same order.
X = train.drop(target_column, axis=1)
y = train[target_column]
test = test.loc[:, X.columns]


In [48]:


# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [49]:
# Fit the preprocessing steps and the forest together on the training portion.
model.fit(X=X_train, y=y_train)



In [50]:
# 5-fold cross-validation on the training portion. The scorer returns negated
# MAE (higher is better), so the sign is flipped before reporting.
# cross_val_score fits clones of the estimator, so the `model` fitted in the
# previous cell is left as it was.
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
print("MAE promedio:", -scores.mean())

# Score the already-fitted model on the hold-out split as well. X_val / y_val
# were created by the train/validation split but were otherwise never used,
# so without this the model is never checked on rows it did not train on.
val_mae = np.abs(np.asarray(y_val) - model.predict(X_val)).mean()
print("MAE en validación:", val_mae)



MAE promedio: 0.31779943100995733


In [51]:
# Run the fitted pipeline over the test features.
test_predictions = model.predict(X=test)



In [52]:
# Assemble the submission table (test row index + prediction) and write it to
# disk without the DataFrame's own index column.
submission_data = {'Id': test.index, 'Predicted': test_predictions}
output = pd.DataFrame(data=submission_data)
output.to_csv('submission.csv', index=False)

print("Pipeline completado y predicciones guardadas en submission.csv")

Pipeline completado y predicciones guardadas en submission.csv
