# VIF-Based Feature Selection for SARIMAX Model
This notebook performs feature selection using Variance Inflation Factor (VIF) analysis to address multicollinearity in the exogenous predictors for the SARIMAX model.

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Load and Prepare Data
Load the dataset containing exogenous predictors from our previous SARIMAX analysis.

In [None]:
# Read the exogenous predictors from CSV; the first column is used as the
# index and parsed as dates.
data_path = Path('data') / 'processed' / 'exog_predictors.csv'
df = pd.read_csv(data_path, index_col=0, parse_dates=True)

# Quick sanity check of what was loaded.
print("Dataset shape:", df.shape)
print("\nFeatures available:", list(df.columns))

# Calculate Variance Inflation Factor (VIF)
Calculate VIF scores for all predictors in the dataset.

In [None]:
def calculate_vif(X):
    """
    Calculate VIF for each feature in the dataset

    Each feature is regressed on all the other features plus an intercept.
    statsmodels' ``variance_inflation_factor`` does not add an intercept on
    its own, so a constant column is appended here (unless X already contains
    one). Without it, VIFs are inflated for features whose mean is far from
    zero.

    Parameters:
    X (pd.DataFrame): Features dataframe (columns must be convertible to float)

    Returns:
    pd.DataFrame: VIF scores for each feature, sorted from highest to lowest.
        The helper intercept column is not reported.
    """
    values = np.asarray(X, dtype=float)

    # A constant, non-zero column already plays the role of an intercept;
    # adding a second one would make the two perfectly collinear.
    has_intercept = values.size > 0 and bool(
        np.any((np.ptp(values, axis=0) == 0) & (values[0] != 0))
    )
    if has_intercept:
        design, offset = values, 0
    else:
        design = np.column_stack([np.ones(values.shape[0]), values])
        offset = 1  # feature i sits at column i + 1 of the design matrix

    vif_data = pd.DataFrame({
        "Feature": list(X.columns),
        "VIF": [variance_inflation_factor(design, i + offset)
                for i in range(values.shape[1])],
    })
    return vif_data.sort_values('VIF', ascending=False)

# Score every predictor before any feature is dropped.
vif_scores = calculate_vif(df)
print("Initial VIF Scores:", vif_scores, sep="\n")

# Identify High VIF Features
Identify features with VIF scores above our threshold of 10.

In [None]:
# Cut-off above which a predictor is treated as multicollinear.
VIF_THRESHOLD = 10

# Identify features with high VIF
high_vif_features = vif_scores[vif_scores['VIF'] > VIF_THRESHOLD]
# Build the heading from the constant so it cannot drift from the filter.
print(f"Features with VIF > {VIF_THRESHOLD}:")
print(high_vif_features)

# Iterative Feature Removal
Iteratively remove features with the highest VIF scores until all remaining features have VIF < 10.

In [None]:
def iterative_vif_selection(X, threshold=10):
    """
    Iteratively remove features with highest VIF until all features have VIF < threshold

    One feature is dropped per iteration and VIFs are recomputed on the
    remaining columns. Removal stops once a single feature is left, because a
    lone predictor has nothing to be collinear with.

    Parameters:
    X (pd.DataFrame): Features dataframe
    threshold (float): VIF threshold for feature removal

    Returns:
    pd.DataFrame: DataFrame with selected features
    list: Removed features

    Raises:
    ValueError: If no VIF can be computed for any remaining feature
    """
    features = X.columns.tolist()
    removed_features = []

    # Guarding on the feature count keeps the loop from stripping every
    # column and then failing on an empty frame.
    while len(features) > 1:
        vif = calculate_vif(X[features])
        max_vif = vif['VIF'].max()

        # max() skips NaN, so NaN here means every VIF was NaN.
        if pd.isna(max_vif):
            raise ValueError(
                "VIF could not be computed for any feature; "
                "check the data for missing or non-numeric values"
            )

        if max_vif < threshold:
            break

        max_vif_feature = vif.loc[vif['VIF'] == max_vif, 'Feature'].iloc[0]
        features.remove(max_vif_feature)
        removed_features.append(max_vif_feature)

        print(f"Removed {max_vif_feature} with VIF: {max_vif:.2f}")

    return X[features], removed_features

# Drop the highest-VIF predictors one at a time, using the notebook-wide cut-off.
final_df, removed_features = iterative_vif_selection(X=df, threshold=VIF_THRESHOLD)

# Report Final Feature Set
Display the final set of features with their VIF scores after removing multicollinear features.

In [None]:
# Re-score the surviving features so the report reflects the reduced set.
final_vif_scores = calculate_vif(final_df)
print("Final VIF Scores:", final_vif_scores, sep="\n")

# Summarise how many predictors were kept and which ones were dropped.
print("\nNumber of features retained:", final_df.shape[1])
print("Number of features removed:", len(removed_features))
print("\nRemoved features:", removed_features)

# Persist the reduced feature matrix to disk.
final_df.to_csv('data/processed/vif_selected_features.csv')

# VIF-Based Feature Selection for SARIMAX Model

This notebook implements feature selection using Variance Inflation Factor (VIF) to address multicollinearity in our SARIMAX model's exogenous predictors.

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt
import seaborn as sns

## Load and Prepare Data
Load the dataset containing our exogenous predictors and ensure it's ready for VIF calculation.

In [None]:
# Read the processed CSV and use its 'date' column as the index.
df = pd.read_csv('processed_data.csv').set_index('date')

# Preview the first rows, then print the column/dtype summary.
print("Dataset Overview:", df.head(), "\nDataset Info:", sep="\n")
df.info()

## Calculate Variance Inflation Factor (VIF)
Calculate VIF for each predictor variable to identify multicollinearity.

In [None]:
def calculate_vif(X):
    """
    Calculate VIF for each feature in the dataset

    Each feature is regressed on all the other features plus an intercept.
    statsmodels' ``variance_inflation_factor`` does not add an intercept on
    its own, so a constant column is appended here (unless X already contains
    one). Without it, VIFs are inflated for features whose mean is far from
    zero.

    Parameters:
    X (pd.DataFrame): Features dataframe (columns must be convertible to float)

    Returns:
    pd.DataFrame: VIF scores for each feature, sorted from highest to lowest.
        The helper intercept column is not reported.
    """
    values = np.asarray(X, dtype=float)

    # A constant, non-zero column already plays the role of an intercept;
    # adding a second one would make the two perfectly collinear.
    has_intercept = values.size > 0 and bool(
        np.any((np.ptp(values, axis=0) == 0) & (values[0] != 0))
    )
    if has_intercept:
        design, offset = values, 0
    else:
        design = np.column_stack([np.ones(values.shape[0]), values])
        offset = 1  # feature i sits at column i + 1 of the design matrix

    vif_data = pd.DataFrame({
        "Feature": list(X.columns),
        "VIF": [variance_inflation_factor(design, i + offset)
                for i in range(values.shape[1])],
    })
    return vif_data.sort_values('VIF', ascending=False)

# Score every predictor before any feature is dropped.
initial_vif = calculate_vif(df)
print("Initial VIF Scores:", initial_vif, sep="\n")

## Identify High VIF Features
Identify features with VIF scores above our threshold of 10.

In [None]:
# Bar chart of the VIF of every feature, in the order calculate_vif returned them.
plt.figure(figsize=(12, 6))
sns.barplot(x='Feature', y='VIF', data=initial_vif)
plt.title('VIF Scores for All Features')
plt.xticks(rotation=45)  # tilt the feature names on the x axis
plt.tight_layout()
plt.show()

# Keep only the rows whose VIF exceeds the cut-off of 10.
high_vif_features = initial_vif.loc[initial_vif['VIF'] > 10]
print("\nFeatures with VIF > 10:", high_vif_features, sep="\n")

## Iterative Feature Removal
Implement an iterative process that removes the feature with the highest VIF score, one at a time, until all remaining features have VIF < 10.

In [None]:
def iterative_vif_selection(data, threshold=10):
    """
    Iteratively remove features with highest VIF until all features have VIF < threshold

    One feature is dropped per iteration and VIFs are recomputed on the
    remaining columns. Removal stops once a single feature is left, because a
    lone predictor has nothing to be collinear with.

    Parameters:
    data (pd.DataFrame): Input features
    threshold (float): VIF threshold for feature removal

    Returns:
    pd.DataFrame: DataFrame with selected features
    list: Removed features

    Raises:
    ValueError: If no VIF can be computed for any remaining feature
    """
    features = data.columns.tolist()
    removed_features = []

    # Guarding on the feature count keeps the loop from stripping every
    # column and then failing on an empty frame.
    while len(features) > 1:
        vif = calculate_vif(data[features])
        max_vif = vif['VIF'].max()

        # max() skips NaN, so NaN here means every VIF was NaN and idxmax
        # below would have no valid row to point at.
        if pd.isna(max_vif):
            raise ValueError(
                "VIF could not be computed for any feature; "
                "check the data for missing or non-numeric values"
            )

        if max_vif < threshold:
            break

        # Remove feature with highest VIF
        feature_to_remove = vif.loc[vif['VIF'].idxmax(), 'Feature']
        features.remove(feature_to_remove)
        removed_features.append(feature_to_remove)

        print(f"Removed {feature_to_remove} with VIF: {max_vif:.2f}")

    return data[features], removed_features

# Drop the highest-VIF predictors one at a time, using the function's default threshold.
final_df, removed_features = iterative_vif_selection(data=df)

## Report Final Feature Set
Display the final set of features with their VIF scores, which will be used for re-specifying the SARIMAX model.

In [None]:
# Re-score the surviving features so the report reflects the reduced set.
final_vif = calculate_vif(final_df)

# Print the final scores followed by the list of dropped features.
print("Final VIF Scores:", final_vif, "\nRemoved Features:", removed_features, sep="\n")

# Bar chart of the VIF of each retained feature.
plt.figure(figsize=(12, 6))
sns.barplot(x='Feature', y='VIF', data=final_vif)
plt.title('VIF Scores for Final Feature Set')
plt.xticks(rotation=45)  # tilt the feature names on the x axis
plt.tight_layout()
plt.show()

# Persist the reduced feature matrix to the working directory.
final_df.to_csv('vif_selected_features.csv')
print("\nFinal feature set saved to 'vif_selected_features.csv'")