# Modeling notebook

In [None]:
# Updated to use new modular structure
import os
import pandas as pd
from src.data_load import load_dataset, train_test_split
from src.cleaning import quick_clean
from src.features import safe_feature_columns
from src.training import train_best_classifier

# End-to-end modeling cell: load the dataset, split it into train/test,
# clean both splits, select the feature columns, and fit a classifier on the
# training split.

# Name of the label column. Defined once so the split, cleaning, and feature
# selection steps below cannot drift apart.
TARGET_COLUMN = "mortalidad"

# Dataset used when the DATASET_PATH environment variable is not provided.
DEFAULT_DATASET_PATH = '../DATA/recuima-020425-fragment.csv'

# Load data
# An empty DATASET_PATH is treated the same as an unset one, so a blank
# export does not get passed to the loader as a path.
dataset_path = os.environ.get('DATASET_PATH') or DEFAULT_DATASET_PATH
df = load_dataset(dataset_path)

# Split data
# NOTE: train_test_split here is the project helper from src.data_load, not
# the scikit-learn function of the same name.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    strategy="stratified",
    target_column=TARGET_COLUMN,
)

# Clean data
# Each split is cleaned independently, after the split has been made.
train_clean = quick_clean(train_df, target_column=TARGET_COLUMN)
test_clean = quick_clean(test_df, target_column=TARGET_COLUMN)

# Prepare features
# The feature list is derived from the cleaned training split only and then
# applied unchanged to the test split.
# NOTE(review): this assumes quick_clean leaves the same columns in both
# splits; if a feature column is missing from test_clean, the selection below
# raises KeyError -- confirm against src.cleaning.
feature_cols = safe_feature_columns(train_clean, exclude_cols=[TARGET_COLUMN])
X_train = train_clean[feature_cols]
y_train = train_clean[TARGET_COLUMN]
X_test = test_clean[feature_cols]
y_test = test_clean[TARGET_COLUMN]

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Train best classifier
# Only the training split is passed in; X_test / y_test are prepared above but
# are not used for evaluation within this cell.
print("\nTraining best classifier...")
model, results = train_best_classifier(
    X_train, y_train,
    model_name="xgboost",
    cv_folds=3,  # Quick mode
    use_smote=True,  # presumably enables SMOTE oversampling -- confirm in src.training
)

print("\nBest model trained!")
# Falls back to 'N/A' when the results mapping has no 'cv_score' entry.
print(f"Cross-validation score: {results.get('cv_score', 'N/A')}")
