In [None]:
# Import libraries
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

# Add src to path
sys.path.append('..')

from src.config import FEATURED_DATA_PATH, SELECTED_DATA_PATH, TARGET_COLUMN, DATE_COLUMN
from src.feature_selection import (
    correlation_with_target, remove_highly_correlated,
    random_forest_importance, mutual_information_scores,
    recursive_feature_elimination, feature_selection_pipeline
)
from src.utils import save_csv, save_json

print("Libraries imported successfully!")

## 4.1 Load Featured Data

In [None]:
# Read the featured dataset from disk; the date column is parsed into datetimes
date_columns = [DATE_COLUMN]
df = pd.read_csv(FEATURED_DATA_PATH, parse_dates=date_columns)

# Report the frame's dimensions, then the column count minus one
print(f"Loaded data shape: {df.shape}")
print(f"Number of features: {df.shape[1] - 1}")

In [None]:
# Collect the int64/float64 columns, skipping the date column and the
# categorical columns named in exclude_cols
exclude_cols = [DATE_COLUMN, 'holiday', 'weather_main', 'weather_description']
numeric_dtypes = ['int64', 'float64']

numerical_cols = []
for col in df.columns:
    if col in exclude_cols:
        continue
    if df[col].dtype in numeric_dtypes:
        numerical_cols.append(col)

print(f"Numerical features: {len(numerical_cols)}")

## 4.2 Correlation Analysis

In [None]:
# Candidate predictors: every numerical column except the target
features = [c for c in numerical_cols if c != TARGET_COLUMN]

# corr_scores is plotted below; corr_selected is consumed by the filter in section 4.3
corr_scores, corr_selected = correlation_with_target(df, TARGET_COLUMN, threshold=0.05)

# Cut-offs shared by the bar colours and the dashed reference lines
STRONG_CORR = 0.3
MODERATE_CORR = 0.1

# Visualize top correlations
fig, ax = plt.subplots(figsize=(12, 10))
top_corr = corr_scores.head(30)
colors = [
    'green' if c >= STRONG_CORR else 'orange' if c >= MODERATE_CORR else 'red'
    for c in top_corr.values
]
top_corr.plot(kind='barh', ax=ax, color=colors)

# barh draws the first row at the bottom; flip the axis so the first of the
# 30 rows is on top, matching the selected-feature charts later in the notebook.
# NOTE(review): assumes corr_scores is sorted in descending order — confirm.
ax.invert_yaxis()

ax.set_xlabel('Absolute Correlation with Target')
ax.set_title('Top 30 Features - Correlation with Traffic Volume')
# Labels use ">=" to agree with the comparisons that pick the bar colours
ax.axvline(x=STRONG_CORR, color='green', linestyle='--', label=f'Strong (>={STRONG_CORR})')
ax.axvline(x=MODERATE_CORR, color='orange', linestyle='--', label=f'Moderate (>={MODERATE_CORR})')
ax.legend()
plt.tight_layout()
plt.show()

## 4.3 Remove Highly Correlated Features

In [None]:
# Step 1: keep only the candidate features that also appear in corr_selected
features_filtered = list(filter(lambda name: name in corr_selected, features))
print(f"Features after correlation filter: {len(features_filtered)}")

# Step 2: pass the survivors through remove_highly_correlated at a 0.95 threshold
features_no_multicol = remove_highly_correlated(
    df,
    features_filtered,
    threshold=0.95,
)
print(f"Features after multicollinearity removal: {len(features_no_multicol)}")

## 4.4 Random Forest Feature Importance

In [None]:
# Pin NumPy's global RNG so a Restart & Run All reproduces the same importances.
# NOTE(review): this only matters if random_forest_importance draws from the
# global RNG (e.g. a scikit-learn estimator left at random_state=None) — confirm
# in src.feature_selection.
np.random.seed(42)

# Calculate Random Forest importance; the second return value is not used here
rf_importance, _ = random_forest_importance(df, features_no_multicol, TARGET_COLUMN, n_estimators=100)

# Visualize
fig, ax = plt.subplots(figsize=(12, 10))
rf_importance.head(30).plot(kind='barh', ax=ax, color='steelblue')

# barh draws the first row at the bottom; flip the axis so the first of the
# 30 rows is on top, matching the selected-feature charts later in the notebook.
# NOTE(review): assumes rf_importance is sorted in descending order — confirm.
ax.invert_yaxis()

ax.set_xlabel('Feature Importance')
ax.set_title('Top 30 Features - Random Forest Importance')
plt.tight_layout()
plt.show()

## 4.5 Mutual Information

In [None]:
# Pin NumPy's global RNG so a Restart & Run All reproduces the same scores.
# NOTE(review): this only matters if mutual_information_scores draws from the
# global RNG (e.g. a scikit-learn estimator left at random_state=None) — confirm
# in src.feature_selection.
np.random.seed(42)

# Calculate Mutual Information; the second return value is not used here
mi_scores, _ = mutual_information_scores(df, features_no_multicol, TARGET_COLUMN)

# Visualize
fig, ax = plt.subplots(figsize=(12, 10))
mi_scores.head(30).plot(kind='barh', ax=ax, color='coral')

# barh draws the first row at the bottom; flip the axis so the first of the
# 30 rows is on top, matching the selected-feature charts later in the notebook.
# NOTE(review): assumes mi_scores is sorted in descending order — confirm.
ax.invert_yaxis()

ax.set_xlabel('Mutual Information Score')
ax.set_title('Top 30 Features - Mutual Information')
plt.tight_layout()
plt.show()

## 4.6 Combined Ranking

In [None]:
# Turn each score into a rank where 1 is the highest-scoring feature
rf_ranks = rf_importance.loc[features_no_multicol].rank(ascending=False)
mi_ranks = mi_scores.loc[features_no_multicol].rank(ascending=False)

# Mean of the two ranks, ordered from best (lowest) to worst
avg_ranks = rf_ranks.add(mi_ranks).div(2).sort_values()

# Tabulate the ranking next to the raw scores, in average-rank order
ranked_features = avg_ranks.index
ranking_df = pd.DataFrame({
    'Feature': ranked_features,
    'Avg_Rank': avg_ranks.to_numpy(),
    'RF_Importance': rf_importance.loc[ranked_features].to_numpy(),
    'MI_Score': mi_scores.loc[ranked_features].to_numpy(),
})

print("Top 30 Features by Combined Ranking:")
ranking_df.head(30)

## 4.7 Final Feature Selection

In [None]:
# How many of the best-ranked features to keep
TOP_K = 25

# avg_ranks is ordered best-first, so the first TOP_K labels are the selection
selected_features = avg_ranks.index[:TOP_K].tolist()

print(f"\nSelected {len(selected_features)} features:")
for position, feat in enumerate(selected_features, start=1):
    print(f"{position:3d}. {feat}")

In [None]:
# Side-by-side bar charts of both scores for the selected features
fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# Sorted ascending so that barh puts the largest score at the top of each panel
rf_selected = rf_importance[selected_features].sort_values(ascending=True)
mi_selected = mi_scores[selected_features].sort_values(ascending=True)

# One entry per panel: (axes, scores, bar colour, x-axis label, title)
panels = [
    (axes[0], rf_selected, 'steelblue', 'Feature Importance',
     'Selected Features - Random Forest Importance'),
    (axes[1], mi_selected, 'coral', 'Mutual Information Score',
     'Selected Features - Mutual Information'),
]
for panel_ax, panel_scores, bar_color, x_label, panel_title in panels:
    panel_scores.plot(kind='barh', ax=panel_ax, color=bar_color)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

## 4.8 Create Final Dataset

In [None]:
# Assemble the output frame: date column first, then the target, then the
# selected features; copy so the result is independent of df
final_cols = [DATE_COLUMN, TARGET_COLUMN, *selected_features]
df_selected = df.loc[:, final_cols].copy()

print(f"Final dataset shape: {df_selected.shape}")
df_selected.head()

In [None]:
# Annotated heatmap of pairwise correlations among the target and selected features
fig, ax = plt.subplots(figsize=(14, 12))

# The date column is dropped before computing correlations
corr_matrix = df_selected.drop(DATE_COLUMN, axis=1).corr()

heatmap_style = dict(
    annot=True,
    fmt='.2f',
    annot_kws={'size': 8},
    cmap='RdBu_r',
    center=0,
)
sns.heatmap(corr_matrix, ax=ax, **heatmap_style)
ax.set_title('Correlation Matrix - Selected Features')
plt.tight_layout()
plt.show()

## 4.9 Save Selected Features Data

In [None]:
# Save the reduced dataset (date column, target, selected features)
save_csv(df_selected, SELECTED_DATA_PATH, index=False)

# Metadata describing the selection: the feature names, their count, the target
# column, and the ranking rows for the top TOP_K features
feature_info = {
    'selected_features': selected_features,
    'n_features': len(selected_features),
    'target': TARGET_COLUMN,
    'ranking': ranking_df.head(TOP_K).to_dict('records')
}

# The JSON file is written to the same directory as the selected-features CSV
feature_info_path = os.path.join(os.path.dirname(SELECTED_DATA_PATH), 'selected_features_info.json')
save_json(feature_info, feature_info_path)

print(f"\nSelected features data saved to: {SELECTED_DATA_PATH}")
print(f"Selected features info saved to: {feature_info_path}")

## Summary

**Feature Selection completed:**
1. ✅ Correlation analysis with target
2. ✅ Removed multicollinear features (>0.95)
3. ✅ Random Forest feature importance
4. ✅ Mutual Information scores
5. ✅ Combined ranking
6. ✅ Selected top features
7. ✅ Saved selected features data

**Next step:** Data Preparation (05_Data_Preparation.ipynb)