In [1]:
import pandas as pd                        # For data handling and manipulation
from statsmodels.stats.outliers_influence import variance_inflation_factor  # For calculating VIF
from statsmodels.tools.tools import add_constant  # To add a constant term for regression (intercept)


In [2]:
# Read the dataset from the CSV file that sits next to the notebook
file_path = './SOUTHBANK.csv'
southbank_data = pd.read_csv(file_path)

# Columns that take part in the VIF analysis
features = ['OPEN', 'HIGH', 'LOW', 'CLOSE', 'TOTTRDQTY', 'TOTTRDVAL']

# Design matrix: the selected columns plus the intercept column ('const')
# that add_constant inserts, so each auxiliary regression has an intercept
X = add_constant(southbank_data[features])

# One VIF per design-matrix column (the intercept row is included in the table)
vif_data = pd.DataFrame({
    'Feature': X.columns,
    'VIF': [
        variance_inflation_factor(X.values, position)
        for position, _ in enumerate(X.columns)
    ],
})

# Last expression of the cell: render the VIF table
vif_data

Unnamed: 0,Feature,VIF
0,const,4.092744
1,OPEN,6054.186052
2,HIGH,9479.768142
3,LOW,8029.882751
4,CLOSE,8739.960973
5,TOTTRDQTY,27.832106
6,TOTTRDVAL,28.777058


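### 1. VIF value for each relevant feature

A VIF above roughly 10 is commonly read as a sign of strong multicollinearity, so every feature below is affected, and OPEN, HIGH, LOW and CLOSE are extreme cases.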

| Feature           | VIF          |
|-------------------|--------------|
| **VIF_OPEN**      | 6054.19      |
| **VIF_HIGH**      | 9479.77      |
| **VIF_LOW**       | 8029.88      |
| **VIF_CLOSE**     | 8739.96      |
| **VIF_TOTTRDQTY** | 27.83        |
| **VIF_TOTTRDVAL** | 28.78        |

---

In [3]:
# Restrict the analysis to the four columns OPEN, HIGH, LOW and CLOSE
features = ['OPEN', 'HIGH', 'LOW', 'CLOSE']

# Selected columns plus the intercept column ('const') added by add_constant
X = add_constant(southbank_data[features])

# VIF for every column of the design matrix, intercept included
vif_data = pd.DataFrame({
    'Feature': X.columns,
    'VIF': [
        variance_inflation_factor(X.values, position)
        for position, _ in enumerate(X.columns)
    ],
})

# Last expression of the cell: render the VIF table
vif_data


Unnamed: 0,Feature,VIF
0,const,2.27711
1,OPEN,5825.61637
2,HIGH,8595.870844
3,LOW,7533.768555
4,CLOSE,8615.009974


In [4]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Read the CSV again so this cell does not depend on variables from earlier cells
file_path = './SOUTHBANK.csv'
southbank_data = pd.read_csv(file_path)

# Step 1: keep only OPEN, HIGH, LOW and CLOSE for the VIF analysis
features = ['OPEN', 'HIGH', 'LOW', 'CLOSE']

# Design matrix for the VIF computation: selected columns plus the
# intercept column ('const') added by add_constant
X = add_constant(southbank_data[features])

# Helper that tabulates the VIF of every column; it does not drop anything itself
def calculate_vif(X):
    """Build a table with the variance inflation factor of each column of X.

    Parameters
    ----------
    X : pandas.DataFrame
        Design matrix. Every column gets a row in the result, including an
        intercept column if one is present.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'Feature' (the column labels of X, in order) and 'VIF'
        (the value returned by variance_inflation_factor for that column).
    """
    # Extract the underlying array once instead of once per column
    design_values = X.values
    return pd.DataFrame({
        'Feature': X.columns,
        'VIF': [
            variance_inflation_factor(design_values, position)
            for position in range(X.shape[1])
        ],
    })

# Iterative VIF pruning: compute the VIF table, drop the predictor with the
# largest VIF, and repeat once more on the reduced design matrix.
def _highest_vif_feature(vif_table, intercept='const'):
    """Return the name of the feature with the largest VIF, never the intercept.

    Parameters
    ----------
    vif_table : pandas.DataFrame
        Table with a 'Feature' column and a 'VIF' column, as returned by
        calculate_vif.
    intercept : str, default 'const'
        Label of the intercept row to leave out of the ranking.

    Returns
    -------
    The 'Feature' value of the top-ranked remaining row.

    Raises
    ------
    IndexError
        If no rows are left once the intercept row is excluded.
    """
    # Without this filter an intercept with the largest VIF would be dropped
    # in place of a real predictor.
    candidates = vif_table[vif_table['Feature'] != intercept]
    return candidates.sort_values(by="VIF", ascending=False).iloc[0]['Feature']


# Step 2: First VIF calculation and remove the feature with the highest VIF
vif_data_1 = calculate_vif(X)
print("VIF values before removing the highest VIF feature:")
print(vif_data_1)

# Remove the feature with the highest VIF (the constant is never a candidate)
feature_to_remove_1 = _highest_vif_feature(vif_data_1)
X_step1 = X.drop(columns=[feature_to_remove_1])

# Step 3: Second VIF calculation and remove the next feature with the highest VIF
vif_data_2 = calculate_vif(X_step1)
print("\nVIF values after removing the first feature:")
print(vif_data_2)

# Remove the feature with the highest VIF (the constant is never a candidate)
feature_to_remove_2 = _highest_vif_feature(vif_data_2)
X_step2 = X_step1.drop(columns=[feature_to_remove_2])

# Print remaining features
print("\nRemaining features after two steps of VIF removal:", X_step2.columns)

# Print the VIF values of the columns that are left
vif_data_3 = calculate_vif(X_step2)
print("\nVIF values after removing the second feature:")
print(vif_data_3)


VIF values before removing the highest VIF feature:
  Feature          VIF
0   const     2.277110
1    OPEN  5825.616370
2    HIGH  8595.870844
3     LOW  7533.768555
4   CLOSE  8615.009974

VIF values after removing the first feature:
  Feature          VIF
0   const     2.271385
1    OPEN  3400.944626
2    HIGH  3369.858730
3     LOW  2667.510124

Remaining features after two steps of VIF removal: Index(['const', 'HIGH', 'LOW'], dtype='object')

VIF values after removing the second feature:
  Feature          VIF
0   const     2.271351
1    HIGH  2132.334741
2     LOW  2132.334741
