# Feature Refinement Using Variance Inflation Factor (VIF)

This notebook focuses on refining the feature set by calculating and addressing multicollinearity using VIF before re-specifying the SARIMAX model.

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import warnings
warnings.filterwarnings('ignore')

# Load and Prepare Data

Load the dataset containing exogenous predictors from the previous SARIMAX model.

In [None]:
# Load the dataset
# Assuming the data is saved in a CSV file; replace the placeholder path
# below with the actual location of the exogenous-predictor data.
df = pd.read_csv('path_to_your_data.csv')

# Display the first few rows and basic information
print("Dataset Overview:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the summary.
df.info()

# Calculate Variance Inflation Factor (VIF)

Calculate VIF scores for each predictor to identify multicollinearity.

In [None]:
def calculate_vif(X):
    """
    Calculate the Variance Inflation Factor (VIF) for each feature in X.

    Each feature is scored with statsmodels' variance_inflation_factor
    against a design matrix made of all columns of X plus an intercept
    column. The intercept is only used for the auxiliary regressions; it is
    not reported in the result.

    Parameters:
    X (pd.DataFrame): Features dataframe (numeric columns)

    Returns:
    pd.DataFrame: One row per column of X with columns "Feature" and "VIF",
        sorted by VIF in descending order
    """
    # variance_inflation_factor uses the design matrix exactly as given and
    # does not add an intercept. Without one, the auxiliary regressions are
    # forced through the origin and VIFs of non-centered features come out
    # inflated. The constant is appended (prepend=False) so the columns of X
    # keep positions 0..k-1; has_constant='skip' leaves X unchanged when it
    # already contains a constant column.
    design = add_constant(X, prepend=False, has_constant='skip').values

    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    # Score only the original feature columns, never the appended constant.
    vif_data["VIF"] = [variance_inflation_factor(design, i)
                       for i in range(X.shape[1])]
    return vif_data.sort_values('VIF', ascending=False)

# Restrict the VIF analysis to the numeric columns of the dataset
numeric_features = df.select_dtypes(include=[np.number]).columns
X = df.loc[:, numeric_features]

# Baseline VIF table, computed before any feature is dropped
vif_scores = calculate_vif(X)
print("Initial VIF Scores:", vif_scores, sep="\n")

# Identify High VIF Features

Identify features with VIF scores above the threshold (10).

In [None]:
# Cut-off above which a feature is flagged as highly collinear
VIF_THRESHOLD = 10

# Keep only the rows of the VIF table whose score exceeds the cut-off
exceeds_threshold = vif_scores['VIF'] > VIF_THRESHOLD
high_vif_features = vif_scores.loc[exceeds_threshold]
print(f"Features with VIF > {VIF_THRESHOLD}:")
print(high_vif_features)

# Iterative Feature Removal

Implement an iterative process that removes the feature with the highest VIF score, one at a time, recomputing the scores after each removal.

In [None]:
def iterative_vif_removal(X, threshold=10):
    """
    Iteratively remove the feature with the highest VIF until all remaining
    features have a VIF below the threshold.

    VIF scores are recomputed with calculate_vif after every removal, and a
    progress line is printed for each feature that is dropped. X itself is
    not modified.

    Parameters:
    X (pd.DataFrame): Features dataframe
    threshold (float): VIF threshold; a feature is removed while the largest
        VIF is greater than or equal to this value

    Returns:
    list: Features to keep (column names of X)
    """
    features = X.columns.tolist()

    # Multicollinearity needs at least two predictors, so stop once a single
    # feature (or none) is left. Without this guard the loop could strip
    # every column and then fail on idxmax() of an empty VIF table.
    while len(features) > 1:
        vif = calculate_vif(X[features])
        max_vif = vif['VIF'].max()

        if max_vif < threshold:
            break

        # Drop only the single worst offender, then re-evaluate: removing one
        # feature changes the VIF of every feature that remains.
        max_feature = vif.loc[vif['VIF'].idxmax(), 'Feature']
        features.remove(max_feature)
        print(f"Removed {max_feature} with VIF: {max_vif:.2f}")

    return features

# Run the iterative pruning with the notebook-wide VIF cut-off
final_features = iterative_vif_removal(X, threshold=VIF_THRESHOLD)

# Report Final Feature Set

Display the final set of features with their VIF scores.

In [None]:
# Recompute VIF scores on the retained features only
final_vif = calculate_vif(X[final_features])

print("Final Feature Set with VIF Scores:", final_vif, sep="\n")

# Summarize how many numeric features were dropped versus kept
retained_count = len(final_features)
removed_count = len(numeric_features) - retained_count
print("\nNumber of features removed:", removed_count)
print("Number of features retained:", retained_count)

# List the surviving predictors, one per line
print("\nFinal features to use in SARIMAX model:")
for name in final_features:
    print("-", name)