# In [None]:
# Feature Engineering Notebook
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, RFE

from src.data_preprocessing import DataPreprocessor
from src.feature_engineering import FeatureEngineer
from src.utils import load_config, setup_logging

# Setup: load the run parameters and configure logging before any pipeline
# step runs. The "../" paths in this script are resolved against the current
# working directory.
config = load_config("../config/params.yaml")
setup_logging()

# Load and preprocess data: load -> clean -> prepare.
preprocessor = DataPreprocessor(config)
df = preprocessor.load_data()
df_clean = preprocessor.clean_data(df)
# fitted_preprocessor is not used below; presumably kept bound for later
# notebook cells -- TODO confirm.
X_processed, y, fitted_preprocessor = preprocessor.prepare_data(df_clean)

print("Feature engineering starting...")
print(f"Original features: {X_processed.shape[1]}")

# Feature engineering
feature_engineer = FeatureEngineer(config)

# 1. Basic feature importance
importance_df = feature_engineer.calculate_feature_importance(X_processed, y)
print("Top 10 features by importance:")
print(importance_df.head(10))

# 2. Feature selection using ANOVA (requests k=20 features)
X_anova, selected_anova, anova_scores = feature_engineer.select_features_anova(X_processed, y, k=20)
print(f"Selected {len(selected_anova)} features using ANOVA")

# 3. Feature selection using RFE (requests n_features=15 features)
X_rfe, selected_rfe, rfe_ranking = feature_engineer.select_features_rfe(X_processed, y, n_features=15)
print(f"Selected {len(selected_rfe)} features using RFE")

# Save results
reports_dir = Path("../results/reports")
# DataFrame.to_csv does not create missing parent directories; without this
# a fresh checkout would fail with OSError only after all the selection work
# above has already run.
reports_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): index=False drops each frame's index from the CSV. If any of
# these helpers return feature names in the index rather than a column, the
# names are lost -- confirm against FeatureEngineer.
importance_df.to_csv(reports_dir / "feature_importance.csv", index=False)
anova_scores.to_csv(reports_dir / "anova_scores.csv", index=False)
rfe_ranking.to_csv(reports_dir / "rfe_ranking.csv", index=False)

print("Feature engineering completed!")