# Feature Selection Using Variance Inflation Factor (VIF)
This notebook implements a systematic approach to refine the feature set by identifying and addressing multicollinearity using Variance Inflation Factor (VIF) analysis.

## Import Required Libraries
Import necessary libraries for VIF calculation and data manipulation.

In [None]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
warnings.filterwarnings('ignore')

## Load and Prepare Data
Load the dataset containing the exogenous predictors from the previous SARIMAX model.

In [None]:
# Read the preprocessed predictors from a CSV file (relative path)
df = pd.read_csv(filepath_or_buffer='preprocessed_data.csv')

# Sanity check: report the dimensions, then preview the top rows
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
display(df.head(n=5))

## Calculate Variance Inflation Factor (VIF)
Define a function to calculate VIF for all predictors in the dataset.

In [None]:
def calculate_vif(X, add_intercept=True):
    """
    Calculate VIF for each feature in the dataset

    The VIF of a feature is 1 / (1 - R^2), where R^2 comes from regressing
    that feature on all the other features. statsmodels'
    ``variance_inflation_factor`` uses the design matrix exactly as given and
    does not add an intercept, so on uncentered data the scores are inflated
    unless a constant column is part of the design matrix.

    Parameters:
    X (pd.DataFrame): Features dataframe; every column must be convertible
        to float
    add_intercept (bool): If True (default), an intercept column is added to
        the design matrix for the auxiliary regressions, unless X already
        contains a non-zero constant column. The intercept itself is never
        reported. Pass False to score the columns of X exactly as given.

    Returns:
    pd.DataFrame: DataFrame with columns "Feature" and "VIF", sorted by VIF
        in descending order

    Raises:
    TypeError: If a column of X cannot be converted to float
    """
    # Convert explicitly: mixed or boolean columns would otherwise produce an
    # object array that fails deep inside the regression code.
    try:
        features = X.to_numpy(dtype=float)
    except (TypeError, ValueError) as err:
        raise TypeError(
            f"calculate_vif requires numeric features only: {err}"
        ) from err

    n_features = features.shape[1]

    # A second constant column would be perfectly collinear with an existing
    # one, so only add the intercept when the data does not already carry it.
    if features.shape[0] > 0:
        first_row = features[0]
        constant_columns = (features == first_row).all(axis=0) & (first_row != 0)
        has_constant = bool(constant_columns.any())
    else:
        has_constant = False

    if add_intercept and not has_constant:
        design = np.column_stack([np.ones(features.shape[0]), features])
        offset = 1  # feature i sits at column i + 1, after the intercept
    else:
        design = features
        offset = 0

    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [
        variance_inflation_factor(design, i + offset) for i in range(n_features)
    ]
    return vif_data.sort_values('VIF', ascending=False)

# Score every column of the loaded frame before any feature is pruned
vif_scores = calculate_vif(X=df)
print("Initial VIF Scores:")
display(vif_scores)

## Identify High VIF Features
Set a VIF threshold and list the features whose VIF exceeds it, i.e. the features with high multicollinearity.

In [None]:
# Cut-off above which a feature is reported as highly collinear
VIF_THRESHOLD = 10

# Keep only the rows of the score table that exceed the cut-off
high_vif_features = vif_scores.loc[vif_scores['VIF'] > VIF_THRESHOLD]
print(f"Features with VIF > {VIF_THRESHOLD}:")
display(high_vif_features)

## Iterative Feature Removal
Repeatedly drop the single feature with the highest VIF and recompute the scores, until every remaining feature has a VIF below the threshold.

In [None]:
def iterative_vif_removal(data, threshold=10):
    """
    Iteratively remove features with highest VIF until all features have VIF below threshold

    One feature is dropped per iteration and the VIF scores are recomputed
    afterwards, because removing a feature changes the scores of the others.

    Parameters:
    data (pd.DataFrame): Input features (left untouched; a copy is pruned)
    threshold (float): VIF threshold for feature removal

    Returns:
    pd.DataFrame: DataFrame with remaining features
    list: List of (feature name, VIF at removal time) tuples, in removal order
    """
    X = data.copy()
    removed_features = []

    # Collinearity needs at least two features. Stopping at one column also
    # keeps the loop from scoring an empty frame, where the maximum is NaN
    # and selecting the feature to drop would raise IndexError.
    while X.shape[1] > 1:
        vif = calculate_vif(X)
        max_vif = vif['VIF'].max()

        # NaN means no usable score was produced; nothing can be ranked.
        if pd.isna(max_vif) or max_vif < threshold:
            break

        # idxmax picks the first row holding the maximum, without relying on
        # float equality.
        feature_to_remove = vif.loc[vif['VIF'].idxmax(), 'Feature']
        removed_features.append((feature_to_remove, max_vif))
        X = X.drop(feature_to_remove, axis=1)

        print(f"Removed {feature_to_remove} with VIF: {max_vif:.2f}")

    return X, removed_features

# Prune the loaded features, using the same cut-off as the report above
final_features_df, removed_features = iterative_vif_removal(
    data=df,
    threshold=VIF_THRESHOLD,
)

## Report Final Feature Set
Display the final set of features with their VIF scores, summarize how many features were removed, and save the final feature set to `final_features.csv`.

In [None]:
# Re-score the surviving features so the table reflects the pruned set
final_vif_scores = calculate_vif(X=final_features_df)

print("Final Feature Set VIF Scores:")
display(final_vif_scores)

# Before/after column counts and the number of removals
print("\nSummary:")
print(f"Original number of features: {df.shape[1]}")
print(f"Final number of features: {final_features_df.shape[1]}")
print(f"Number of features removed: {len(removed_features)}")

# Write the pruned feature set to disk, without the row index
final_features_df.to_csv(path_or_buf='final_features.csv', index=False)
print("\nFinal feature set saved to 'final_features.csv'")