In [None]:
# 🧠 Insight: We begin by importing essential libraries — NumPy for numerical operations, Pandas for data handling, Matplotlib/Seaborn for visualization, 
# and Scikit-learn for model implementation and evaluation.

In [1]:
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

In [None]:
# 📊 Insight: The dataset is loaded using Pandas. Make sure the path is correct if you're running this notebook locally or online

In [29]:
data = pd.read_csv("housing.csv")

In [31]:
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [None]:
# 📈 Insight: Exploratory Data Analysis (EDA) helps understand data distribution, check for null values, and perform initial statistical analysis.

In [35]:
data.isnull().any()


longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms         True
population            False
households            False
median_income         False
median_house_value    False
ocean_proximity       False
dtype: bool

In [37]:
data['total_bedrooms'].fillna(data['total_bedrooms'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['total_bedrooms'].fillna(data['total_bedrooms'].median(), inplace=True)


In [None]:
# 🔍 Insight: Data preprocessing involves feature scaling to normalize the input variables and splitting into train-test sets to evaluate model performance objectively.

In [41]:
#one hot encoding to change the cateogerical data into numerical
data = pd.get_dummies(data, columns=['ocean_proximity'], drop_first=True)

In [45]:
X = data.drop('median_house_value', axis=1)
y = data['median_house_value']


In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [49]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# 🧮 Insight: Ridge Regression is used to tackle multicollinearity by penalizing large coefficients using L2 regularization. 
# It helps reduce overfitting by adding a penalty term α * (sum of squared coefficients) to the cost function.

In [51]:
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_scaled, y_train)


In [53]:
y_pred = ridge.predict(X_test_scaled)

In [None]:
# 📉 Insight: Evaluation metrics like R² score and Mean Squared Error (MSE) help determine how well the model performs on unseen data.

In [55]:
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

Mean Squared Error: 4908041658.447794
R² Score: 0.625457262584054


In [57]:
print("Model Coefficients:", ridge.coef_)
print("Model Intercept:", ridge.intercept_)

Model Coefficients: [-53713.65159257 -54298.40041519  13890.4723292  -13058.64374116
  42978.32840348 -43389.96877929  18425.41255042  75157.63575176
 -18538.72553023   2119.24916563  -1616.22226055   1146.56402572]
Model Intercept: 207194.69373788772
