# DL Assignment 8
## Sargun Singh (102115078) 4O1D

**Q1** *Use an appropriate dataset to perform multiple linear regression, and show how statistical tools such as the p-value and t-score can be used for dimensionality reduction.*

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the California Housing dataset as pandas objects (as_frame=True)
data = fetch_california_housing(as_frame=True)
# X: predictor columns, y: regression target (reported as MedHouseVal in the OLS summary)
X, y = data.data, data.target

In [3]:
# statsmodels' OLS does not add an intercept on its own, so prepend a 'const' column.
# has_constant="skip" (the library default) leaves X untouched if a constant column
# is already present, which keeps this cell safe to re-run.
X = sm.add_constant(X, prepend=True, has_constant="skip")

In [4]:
# Ordinary least squares on every predictor plus the intercept; statsmodels is used
# here because its results expose per-coefficient t-statistics and p-values.
ols_full = sm.OLS(endog=y, exog=X)
model = ols_full.fit()

In [5]:
# Show the full regression table (coefficients, std errors, t, P>|t|, confidence intervals)
full_summary = model.summary()
print(full_summary)

                            OLS Regression Results                            
Dep. Variable:            MedHouseVal   R-squared:                       0.606
Model:                            OLS   Adj. R-squared:                  0.606
Method:                 Least Squares   F-statistic:                     3970.
Date:                Wed, 20 Nov 2024   Prob (F-statistic):               0.00
Time:                        00:38:51   Log-Likelihood:                -22624.
No. Observations:               20640   AIC:                         4.527e+04
Df Residuals:                   20631   BIC:                         4.534e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -36.9419      0.659    -56.067      0.0

In [6]:
# Keep only the terms whose coefficient p-value is below the 0.05 significance level;
# terms at or above it are treated as not significant and are dropped.
# Note: the intercept term ('const') is screened like any other coefficient here.
p_values = model.pvalues
is_significant = p_values.lt(0.05)
significant_features = p_values.index[is_significant].tolist()
print("\nSignificant Features (p-value < 0.05):", significant_features)


Significant Features (p-value < 0.05): ['const', 'MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'AveOccup', 'Latitude', 'Longitude']


In [7]:
# Create a reduced dataset with only significant features.
# The intercept column is always retained, even if its own p-value did not pass the
# screen: dropping it would silently force the reduced regression through the origin.
# Iterating over X.columns preserves the original column order.
reduced_design_columns = [
    column for column in X.columns
    if column == "const" or column in significant_features
]
X_reduced = X[reduced_design_columns]

In [8]:
# Refit ordinary least squares using only the columns kept in X_reduced
ols_reduced = sm.OLS(endog=y, exog=X_reduced)
model_reduced = ols_reduced.fit()

In [9]:
# Show the regression table for the reduced model, under its own heading
reduced_summary = model_reduced.summary()
print("\nReduced Model Summary:", reduced_summary, sep="\n")


Reduced Model Summary:
                            OLS Regression Results                            
Dep. Variable:            MedHouseVal   R-squared:                       0.606
Model:                            OLS   Adj. R-squared:                  0.606
Method:                 Least Squares   F-statistic:                     4538.
Date:                Wed, 20 Nov 2024   Prob (F-statistic):               0.00
Time:                        00:39:19   Log-Likelihood:                -22624.
No. Observations:               20640   AIC:                         4.526e+04
Df Residuals:                   20632   BIC:                         4.533e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -36.9175      0

In [10]:
# Train-test split
# Hold out 20% of the rows (fixed seed for reproducibility). The statsmodels 'const'
# column is dropped because LinearRegression() fits its own intercept by default.
X_train, X_test, y_train, y_test = train_test_split(
    X.drop(columns="const"), y, test_size=0.2, random_state=42
)

# Pick the significant predictors by name rather than by position: slicing with [1:]
# is only correct when 'const' happens to be the first significant term, and would
# otherwise drop a real predictor and then fail on the missing 'const' column.
# NOTE(review): significant_features was chosen from a fit on all rows, including the
# ones held out here, so the reduced model's test score may be slightly optimistic.
reduced_feature_names = [name for name in significant_features if name != "const"]
X_train_reduced = X_train[reduced_feature_names]
X_test_reduced = X_test[reduced_feature_names]

In [11]:
# Fit models
# One scikit-learn linear regression on every predictor and one on the significant
# subset, both trained on the same training rows so their test metrics are comparable.
# (LinearRegression is imported with the other dependencies in the top import cell.)
full_model = LinearRegression().fit(X_train, y_train)
reduced_model = LinearRegression().fit(X_train_reduced, y_train)

In [12]:
# Predictions
# Score the held-out rows with each model, feeding it the same columns it was trained on
y_pred_reduced = reduced_model.predict(X_test_reduced)
y_pred_full = full_model.predict(X_test)

In [13]:
# Metrics
# Report RMSE and R^2 on the held-out rows for each model, full model first
for heading, predictions in (
    ("\nFull Model Evaluation:", y_pred_full),
    ("\nReduced Model Evaluation:", y_pred_reduced),
):
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    print(heading)
    print(f"RMSE: {rmse:.4f}")
    print(f"R2 Score: {r2_score(y_test, predictions):.4f}")


Full Model Evaluation:
RMSE: 0.7456
R2 Score: 0.5758

Reduced Model Evaluation:
RMSE: 0.7456
R2 Score: 0.5757
