# VIF Analysis for SARIMAX Model Features
This notebook performs Variance Inflation Factor (VIF) analysis to address multicollinearity in the feature set used for SARIMAX modeling.

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
import seaborn as sns
import matplotlib.pyplot as plt

# Load and Prepare Data
Load the dataset containing exogenous predictors from the previous SARIMAX model.

In [None]:
# Read the exogenous predictors from disk.
# The features are expected in a CSV file (path is relative to the working directory).
data = pd.read_csv('sarimax_features.csv')

# Quick sanity check: dimensions first, then the column names.
print(f"Dataset shape: {data.shape}")
print("\nFeatures in dataset:", data.columns.tolist(), sep="\n")

# Calculate Variance Inflation Factor (VIF)
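Calculate VIF for each predictor to identify multicollinearity. For predictor $j$, $\mathrm{VIF}_j = 1 / (1 - R_j^2)$, where $R_j^2$ is the coefficient of determination obtained by regressing predictor $j$ on all the other predictors.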

In [None]:
def calculate_vif(X):
    """Compute the Variance Inflation Factor (VIF) of every column of ``X``.

    ``variance_inflation_factor`` regresses the selected column on the other
    columns of the matrix it is given and does not add an intercept itself.
    Without an intercept the auxiliary regressions use an uncentered R^2,
    which inflates the VIF of any feature whose mean is not zero. A constant
    column is therefore prepended to the design matrix here; it only serves
    as the intercept and is not reported in the result.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric predictors, one column per feature. Not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``VIF``, one row per column of ``X``,
        sorted by ``VIF`` in descending order.

    Notes
    -----
    NOTE(review): a zero-variance column in ``X`` duplicates the added
    intercept, so its own VIF is presumably undefined (NaN/inf) - confirm
    if such columns can occur in the input.
    """
    feature_names = list(X.columns)

    # Column 0 is the intercept; the real features start at column 1.
    design = np.column_stack([np.ones(len(X)), X.to_numpy()])

    vif_values = [variance_inflation_factor(design, col_idx)
                  for col_idx in range(1, design.shape[1])]

    vif_data = pd.DataFrame({"Feature": feature_names, "VIF": vif_values})
    return vif_data.sort_values('VIF', ascending=False)

# Keep only the numeric columns of the loaded dataset
X = data.select_dtypes(include="number")

# Baseline VIF table, computed before any feature is dropped
initial_vif = calculate_vif(X)
print("Initial VIF values:", initial_vif, sep="\n")

# Identify High VIF Features
Identify features with VIF scores above the threshold of 10.

In [None]:
# Features scoring above this VIF are flagged as highly collinear
VIF_THRESHOLD = 10

is_high_vif = initial_vif['VIF'].gt(VIF_THRESHOLD)
high_vif_features = initial_vif.loc[is_high_vif]
print(f"Features with VIF > {VIF_THRESHOLD}:")
print(high_vif_features)

# Bar chart of the baseline VIF of every feature
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=initial_vif, x='Feature', y='VIF', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt labels so long feature names stay readable
ax.set_title('VIF Scores for All Features')
plt.show()

# Iterative Feature Removal
Iteratively remove the feature with the highest VIF score, recomputing the VIF of the remaining features after each removal, until all remaining features have VIF < 10.

In [None]:
def iterative_vif_removal(X, threshold=10):
    """Drop the highest-VIF feature one at a time until all VIFs are below ``threshold``.

    After every removal the VIFs of the remaining features are recomputed
    with ``calculate_vif``, because dropping one feature changes the VIF of
    the others.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors, one column per feature. Not modified.
    threshold : float, default 10
        Removal stops once the largest VIF is strictly below this value.

    Returns
    -------
    tuple
        ``(X_reduced, removed_features)``. ``X_reduced`` holds the surviving
        columns of ``X``; ``removed_features`` is a list of
        ``(feature_name, vif_at_removal)`` tuples in removal order.

    Raises
    ------
    ValueError
        If every VIF is NaN, so no feature can be ranked for removal.
    """
    removed_features = []

    # Multicollinearity needs at least two predictors; with zero or one
    # column there is nothing to compare, so the input is returned as-is
    # (an empty frame previously crashed with an IndexError).
    while X.shape[1] > 1:
        vif = calculate_vif(X)
        max_vif = vif['VIF'].max()  # Series.max() skips NaN entries

        if pd.isna(max_vif):
            raise ValueError(
                "VIF could not be computed for any feature (all values are NaN); "
                "check the input for missing or non-finite values."
            )

        if max_vif < threshold:
            break

        # idxmax returns the first row holding the maximum, so ties are
        # resolved by the row order of the VIF table.
        feature_to_remove = vif.loc[vif['VIF'].idxmax(), 'Feature']
        removed_features.append((feature_to_remove, max_vif))
        X = X.drop(columns=[feature_to_remove])

    return X, removed_features

# Prune collinear features using the notebook-wide VIF threshold
X_final, removed_features = iterative_vif_removal(X, threshold=VIF_THRESHOLD)

# Report Final Feature Set
Display the final set of features with their VIF scores, suitable for SARIMAX modeling.

In [None]:
# VIF table for the features that survived the pruning
final_vif = calculate_vif(X_final)

# Each dropped feature together with the VIF it had when it was removed
print("Removed features and their VIF scores:")
for dropped_name, dropped_vif in removed_features:
    print(f"{dropped_name}: {dropped_vif:.2f}")

print("\nFinal feature set with VIF scores:", final_vif, sep="\n")

# Persist the surviving feature names as a one-column CSV
final_features = X_final.columns.tolist()
features_table = pd.DataFrame({'features': final_features})
features_table.to_csv('final_sarimax_features.csv', index=False)
print("\nFinal features saved to 'final_sarimax_features.csv'")