In [None]:
# Cell 1
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path
import pandas as pd
from mrmr import mrmr_classif

# Make the project's `src` package importable from this notebook.
# The project root is taken to be the parent of the current working
# directory; it is appended to sys.path only if it is not already listed.
current_dir = Path.cwd()
root_dir = current_dir.parent
root_entry = str(root_dir)
if root_entry not in sys.path:
    sys.path.append(root_entry)

from src import config, preprocessing

# Number of rows requested from the loader for the feature-selection run.
sample_size = 200000
print(f"üöÄ Loading and sampling {sample_size} rows of data...")

# The loader yields three values: the sampled DataFrame (X_df), the target
# Series (y), and the list of columns accepted as feature candidates
# (valid_cols).
X_df, y, valid_cols = preprocessing.load_data_for_mrmr(sample_size=sample_size)

# A None DataFrame is handled as the loader's failure signal; otherwise print
# a short summary of what was loaded.
if X_df is None:
    print("‚ùå Data loading error. Please check src/config.py")
else:
    print("‚úÖ Load successful!")
    print(f"   - Data shape: {X_df.shape}")
    print(f"   - Number of candidate features: {len(valid_cols)}")
    print(f"   - Columns (Example): {list(X_df.columns[:5])} ...")

# Cell 3
# Run mRMR (minimum Redundancy, Maximum Relevance) feature selection on the
# sampled data and print the ranked result so it can be copied into the config.

# Number of top-ranked features requested from mRMR.
MRMR_TOP_K = 67

if X_df is not None:
    print("‚è≥ Running mRMR algorithm (May take 5-10 minutes)...")
    print("   (Perfect time to go grab a cup of coffee!)")

    # Rank only the candidate feature columns. The loader returns `valid_cols`
    # separately from the full-width frame, and the load summary reports
    # len(valid_cols) as the number of candidates, so passing the whole frame
    # would let non-candidate columns compete in the ranking.
    # NOTE(review): assumes every name in valid_cols is a column of X_df —
    # confirm against preprocessing.load_data_for_mrmr.
    candidate_df = X_df[valid_cols]

    # Select Top features
    selected_features = mrmr_classif(X=candidate_df, y=y, K=MRMR_TOP_K)

    print("\n" + "="*50)
    print("üéâ TOP FEATURES RESULTS")
    print("="*50)
    print(selected_features)
    print("="*50)
    print("üëâ INSTRUCTIONS: Copy the list above and overwrite the SELECTED_FEATURES variable in src/config.py")

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from src import config, preprocessing

# Correlation heatmap of the top mRMR-selected features.

# How many of the top-ranked mRMR features to include in the heatmap. The
# variable name, plot title and output file name all refer to the Top-30, so
# the slice below uses the same number instead of a separate hard-coded 25.
TOP_N = 30

# Upper bound on the number of rows used to estimate the correlations.
CORR_SAMPLE_ROWS = 50000

# 1. Reload data (only need a small sample for faster correlation calculation)
print("Loading data for correlation analysis...")
df, _ = preprocessing.load_single_dataset_year('2017', binary_mode=True)
# DataFrame.sample raises ValueError when n exceeds the number of rows (without
# replacement), so clamp the request to what is actually available.
sample_rows = min(CORR_SAMPLE_ROWS, len(df))
df_sample = df.sample(n=sample_rows, random_state=config.RANDOM_STATE)

# 2. Define mRMR feature list (Top-30, which also covers the Top-25)
# NOTE(review): the feature-selection cell tells the user to overwrite
# SELECTED_FEATURES in src/config.py, while this cell reads
# config.mRMR_FEATURES — confirm which name the config actually defines.
top_30_features = config.mRMR_FEATURES[:TOP_N]
# The list may hold fewer than TOP_N entries; label the outputs with the real count.
n_plotted = len(top_30_features)

# 3. Calculate correlation matrix
corr_matrix = df_sample[top_30_features].corr()

# 4. Plot Heatmap
plt.figure(figsize=(18, 14))
sns.heatmap(
    corr_matrix,
    annot=False,
    cmap='coolwarm',
    center=0,
    linewidths=0.2,
)

plt.title(f'Correlation Heatmap of Top-{n_plotted} mRMR Features', fontsize=16)
plt.tight_layout()

# 5. Save image for the report (saved before show(), while the figure is still open)
plt.savefig(f"mrmr_top{n_plotted}_correlation.png", dpi=300)
plt.show()