# In [None]:
# Model Training Notebook
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from src.utils import load_config, setup_logging
from src.data_preprocessing import DataPreprocessor
from src.model_training import ModelTrainer
from src.evaluation import ModelEvaluator

# Setup
# Load the run configuration and initialise logging through the project
# helpers. The config path is relative, so it is presumably resolved against
# the notebook's working directory — confirm before running from elsewhere.
config = load_config("../config/params.yaml")
setup_logging()

# Load and prepare data
# Each step consumes the previous step's output, so the call order matters.
preprocessor = DataPreprocessor(config)
df = preprocessor.load_data()
df_clean = preprocessor.clean_data(df)
# prepare_data returns three values; the third is dumped further down as
# "preprocessor.pkl". Presumably these are features, target and the fitted
# transformer — confirm against DataPreprocessor.
# NOTE(review): the split happens *after* prepare_data. If prepare_data fits
# scalers/encoders on the full dataset, test-set statistics leak into
# training — verify.
X_processed, y, fitted_preprocessor = preprocessor.prepare_data(df_clean)
X_train, X_test, y_train, y_test = preprocessor.split_data(X_processed, y)

# Report the size of each split before the training step starts.
print("Model training starting...")
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Train models
# The trainer is built from the same config as the preprocessor.
# NOTE(review): the test split is passed into training as well; if it is used
# for model selection, the scores reported below are optimistic — confirm.
# `results` is not referenced again in the visible part of this notebook.
model_trainer = ModelTrainer(config)
results = model_trainer.train_all_models(X_train, y_train, X_test, y_test)

# Evaluate models
# The returned object is printed here and later written out with `.to_csv`.
evaluation_df = model_trainer.evaluate_models(X_test, y_test)
print("Model comparison:")
print(evaluation_df)

# Save results
# Persist the comparison table, then (if a best model was selected) the model,
# the preprocessor returned by prepare_data, and a small JSON metadata file.
# Imports are kept in-cell, matching the existing in-cell joblib/json imports.
import json
from pathlib import Path

reports_dir = Path("../results/reports")
models_dir = Path("../models")

# Neither DataFrame.to_csv, joblib.dump nor open() creates missing parent
# directories; create them up front so a fresh checkout does not crash after
# the (potentially long) training run has already finished.
reports_dir.mkdir(parents=True, exist_ok=True)
evaluation_df.to_csv(reports_dir / "model_comparison.csv", index=False)

# Save best model
# Compare against None explicitly: truthiness of an estimator object can
# dispatch to __len__ (e.g. pipelines/ensembles) instead of meaning "is set".
# NOTE(review): assumes ModelTrainer uses None as its "no best model"
# sentinel — confirm against src.model_training.
if model_trainer.best_model is not None:
    import joblib

    models_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump(model_trainer.best_model, models_dir / "best_model.pkl")
    joblib.dump(fitted_preprocessor, models_dir / "preprocessor.pkl")

    # Save model metadata
    model_metadata = {
        'best_model': type(model_trainer.best_model).__name__,
        'best_score': model_trainer.best_score,
        'training_date': pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"),
        'feature_count': X_train.shape[1],
    }

    # Explicit encoding keeps the file identical across platforms/locales.
    with open(models_dir / "model_metadata.json", 'w', encoding="utf-8") as f:
        json.dump(model_metadata, f)

print("Model training completed!")