In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import RFE

# [1] Load dataset
df = pd.read_csv("C:\\Users\\maths\\Downloads\\weatherAUS.csv")

# [2] Drop irrelevant columns (date, location, and the target-leakage feature 'RISK_MM').
# errors="ignore" keeps the run alive when the CSV does not contain one of
# these columns (e.g. a copy of the data without 'RISK_MM'); a column that is
# absent cannot leak anything, so skipping it is safe.
drop_columns = ["Date", "Location", "RISK_MM"]
df = df.drop(columns=drop_columns, errors="ignore")

# [3] Remove rows with missing values
df = df.dropna()

# [4] One-hot encode every remaining categorical column (the wind-direction
# columns as well as RainToday / RainTomorrow), dropping the first level of each.
# dtype=float: without it the dummy dtype depends on the pandas version and may
# be boolean; boolean dummies next to float columns make `X.values` an
# object-dtype array, which the VIF computation below cannot work with.
df = pd.get_dummies(df, drop_first=True, dtype=float)

# [5] Define features (X) and target variable (y)
X = df.drop(columns=['RainTomorrow_Yes'])
y = df['RainTomorrow_Yes']

# [6] Function to compute VIF for multicollinearity check
def calculate_vif(X):
    """Return the variance inflation factor (VIF) of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Every column must be convertible to ``float``
        (numeric or boolean); anything else raises during the conversion.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``X``, in column order, with two columns:
        ``"Feature"`` (the column label) and ``"VIF"`` (the value returned by
        ``variance_inflation_factor`` for that column).

    Notes
    -----
    NOTE(review): the matrix is passed to statsmodels exactly as given, with
    no intercept column added. VIFs are usually computed on a design matrix
    that contains a constant; confirm whether that is intended here.
    """
    # Materialise the design matrix once, as floats. The original rebuilt
    # `X.values` for every column, and a frame mixing boolean dummies with
    # float columns would yield an object-dtype array there.
    design = X.to_numpy(dtype=float)

    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [
        variance_inflation_factor(design, col_idx)
        for col_idx in range(design.shape[1])
    ]
    return vif_data

# [7] Baseline multicollinearity report: VIF of every feature before any removal
vif_before = calculate_vif(X)
print("\n===== VIF Before Feature Engineering =====\n", vif_before)

# [8] Discard, in a single pass, every feature whose VIF is above 10
# (an infinite VIF satisfies the same comparison and is discarded too)
high_vif_features = vif_before.loc[vif_before["VIF"] > 10, "Feature"].tolist()
X = X.drop(columns=high_vif_features)

# [9] Re-run the VIF report on the features that survived
vif_after = calculate_vif(X)
print("\n===== VIF After Feature Engineering =====\n", vif_after)

# [10] Hold out 20% of the rows for testing; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# [11] Recursive Feature Elimination (RFE): rank features with a linear
# regression on the training rows and keep the 10 best
lin_reg = LinearRegression()
rfe = RFE(estimator=lin_reg, n_features_to_select=10)
rfe.fit(X_train, y_train)
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

# [12] Fit the regression on the selected training features, then predict
# the held-out rows
lin_reg.fit(X_train_rfe, y_train)
y_pred = lin_reg.predict(X_test_rfe)

# [13] Report R² and RMSE on the held-out rows
print("\n===== Regression After Feature Selection =====")
print(f"R² Score: {r2_score(y_test, y_pred):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")



===== VIF Before Feature Engineering =====
            Feature         VIF
0          MinTemp   60.939279
1          MaxTemp  621.691511
2         Rainfall    1.632953
3      Evaporation    7.388729
4         Sunshine   17.533825
..             ...         ...
57   WindDir3pm_SW    2.711793
58    WindDir3pm_W    2.703251
59  WindDir3pm_WNW    2.377855
60  WindDir3pm_WSW    2.625586
61   RainToday_Yes    2.244707

[62 rows x 2 columns]
