In [13]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.datasets import fetch_california_housing
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
import pandas as pd
seed = 42
X, y = fetch_california_housing(return_X_y=True, as_frame=True)

vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]

vif_data = vif_data.sort_values("VIF", ascending=False)
vif_data

Unnamed: 0,feature,VIF
7,Longitude,633.711654
6,Latitude,559.874071
2,AveRooms,45.993601
3,AveBedrms,43.590314
0,MedInc,11.51114
1,HouseAge,7.195917
4,Population,2.935745
5,AveOccup,1.095243


In [14]:
# Standardizing the features for PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Applying PCA to reduce multicollinearity
pca = PCA(n_components=0.95)  # retain 95% of the variance
X_pca = pca.fit_transform(X_scaled)

# Applying Ridge Regression to address multicollinearity
ridge = Ridge(alpha=1.0)
ridge.fit(X_scaled, y)

# Applying Lasso Regression to address multicollinearity
lasso = Lasso(alpha=0.1)
lasso.fit(X_scaled, y)

# Displaying the number of components after PCA
n_components_pca = X_pca.shape[1]

# Displaying non-zero coefficients from Ridge and Lasso
non_zero_ridge = sum(ridge.coef_ != 0)
non_zero_lasso = sum(lasso.coef_ != 0)

(n_components_pca, non_zero_ridge, non_zero_lasso)

(6, 8, 3)

In [24]:
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import statsmodels.api as sm


# Fetching the California housing dataset
X, y = fetch_california_housing(return_X_y=True, as_frame=True)


# Standardizing the features for PCA
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X),columns = X.columns)



In [36]:
model = sm.OLS( y, X_scaled )
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,MedHouseVal,R-squared (uncentered):,0.144
Model:,OLS,Adj. R-squared (uncentered):,0.144
Method:,Least Squares,F-statistic:,433.4
Date:,"Fri, 05 Jan 2024",Prob (F-statistic):,0.0
Time:,07:36:57,Log-Likelihood:,-45482.0
No. Observations:,20640,AIC:,90980.0
Df Residuals:,20632,BIC:,91040.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
MedInc,0.8296,0.024,34.379,0.000,0.782,0.877
HouseAge,0.1188,0.017,6.986,0.000,0.085,0.152
AveRooms,-0.2655,0.044,-6.025,0.000,-0.352,-0.179
AveBedrms,0.3057,0.040,7.575,0.000,0.227,0.385
Population,-0.0045,0.016,-0.277,0.782,-0.036,0.027
AveOccup,-0.0393,0.015,-2.567,0.010,-0.069,-0.009
Latitude,-0.8999,0.047,-19.342,0.000,-0.991,-0.809
Longitude,-0.8705,0.046,-19.058,0.000,-0.960,-0.781

0,1,2,3
Omnibus:,4393.65,Durbin-Watson:,0.097
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14087.596
Skew:,1.082,Prob(JB):,0.0
Kurtosis:,6.42,Cond. No.,6.67


In [38]:
results = model.fit_regularized(method='elastic_net', alpha=0.0, L1_wt=0.3, refit=True)
results.summary()

0,1,2,3
Dep. Variable:,MedHouseVal,R-squared (uncentered):,0.144
Model:,OLS,Adj. R-squared (uncentered):,0.144
Method:,Least Squares,F-statistic:,433.4
Date:,"Fri, 05 Jan 2024",Prob (F-statistic):,0.0
Time:,07:38:14,Log-Likelihood:,-45482.0
No. Observations:,20640,AIC:,90980.0
Df Residuals:,20632,BIC:,91040.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
MedInc,0.8296,0.024,34.379,0.000,0.782,0.877
HouseAge,0.1188,0.017,6.986,0.000,0.085,0.152
AveRooms,-0.2655,0.044,-6.025,0.000,-0.352,-0.179
AveBedrms,0.3057,0.040,7.575,0.000,0.227,0.385
Population,-0.0045,0.016,-0.277,0.782,-0.036,0.027
AveOccup,-0.0393,0.015,-2.567,0.010,-0.069,-0.009
Latitude,-0.8999,0.047,-19.342,0.000,-0.991,-0.809
Longitude,-0.8705,0.046,-19.058,0.000,-0.960,-0.781

0,1,2,3
Omnibus:,4393.65,Durbin-Watson:,0.097
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14087.596
Skew:,1.082,Prob(JB):,0.0
Kurtosis:,6.42,Cond. No.,6.67


In [45]:
pca = PCA(n_components=0.90)
X_pca = pd.DataFrame(pca.fit_transform(X_scaled))
model = sm.OLS( y, X_pca )
results = model.fit_regularized(method='elastic_net', alpha=0.0, L1_wt=0.3, refit=True)
results.summary()

0,1,2,3
Dep. Variable:,MedHouseVal,R-squared (uncentered):,0.108
Model:,OLS,Adj. R-squared (uncentered):,0.108
Method:,Least Squares,F-statistic:,499.1
Date:,"Fri, 05 Jan 2024",Prob (F-statistic):,0.0
Time:,07:41:51,Log-Likelihood:,-45907.0
No. Observations:,20640,AIC:,91820.0
Df Residuals:,20635,BIC:,91860.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
0.0,0.0282,0.011,2.573,0.010,0.007,0.050
1.0,0.1363,0.011,12.004,0.000,0.114,0.159
2.0,0.0398,0.014,2.883,0.004,0.013,0.067
3.0,-0.7414,0.015,-48.335,0.000,-0.771,-0.711
4.0,0.0039,0.016,0.249,0.803,-0.027,0.034

0,1,2,3
Omnibus:,4002.725,Durbin-Watson:,0.098
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8393.113
Skew:,1.145,Prob(JB):,0.0
Kurtosis:,5.125,Cond. No.,1.42


In [None]:
# Fitting Ridge Regression
ridge = Ridge(alpha=0.5)
X_scaled = pd.DataFrame(scaler.fit_transform(X),columns = X.columns)

ridge.fit(X_scaled, y)

In [None]:
# Standardizing the features for PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Applying PCA
pca = PCA(n_components=0.80)
X_pca = pca.fit_transform(X_scaled)

# Fitting Ridge Regression
ridge = Ridge(alpha=0.5)
ridge.fit(X_scaled, y)

# Fitting Lasso Regression
lasso = Lasso(alpha=0.5)
lasso.fit(X_scaled, y)

In [None]:

# Original linear regression model
lr_original = LinearRegression()
lr_original.fit(X_scaled, y)
original_pred = lr_original.predict(X)
original_mse = mean_squared_error(y, original_pred)
original_se = np.sqrt(original_mse)

# Linear regression model using features selected by PCA
lr_pca = LinearRegression()
lr_pca.fit(X_pca, y)
pca_pred = lr_pca.predict(X_pca)
pca_mse = mean_squared_error(y, pca_pred)
pca_se = np.sqrt(pca_mse)

# Linear regression model using features from Ridge regression
ridge_pred = ridge.predict(X_scaled)
ridge_mse = mean_squared_error(y, ridge_pred)
ridge_se = np.sqrt(ridge_mse)

# Linear regression model using features from Lasso regression
lasso_pred = lasso.predict(X_scaled)
lasso_mse = mean_squared_error(y, lasso_pred)
lasso_se = np.sqrt(lasso_mse)

(original_se, pca_se, ridge_se, lasso_se)