In [None]:
#📦 Insight: Necessary libraries are imported for data manipulation (Pandas, NumPy), visualization (Matplotlib/Seaborn), and model implementation (Scikit-learn).

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
#📊 Insight: Dataset is loaded to explore the structure and perform preprocessing.



In [7]:
# Load data
df = pd.read_csv('housing.csv')

In [None]:
#🧠 Insight: EDA checks for data distribution, missing values, and correlation among features.

In [9]:
df.isnull().any()

longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms         True
population            False
households            False
median_income         False
median_house_value    False
ocean_proximity       False
dtype: bool

In [11]:
# Handle missing values
df['total_bedrooms'].fillna(df['total_bedrooms'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['total_bedrooms'].fillna(df['total_bedrooms'].median(), inplace=True)


In [13]:
# One-hot encode 'ocean_proximity'
df = pd.get_dummies(df, columns=['ocean_proximity'], drop_first=True)

In [15]:
# Split features and target
X = df.drop('median_house_value', axis=1)
y = df['median_house_value']

In [None]:
#🛠️ Insight: Data is split into training and test sets to evaluate generalization.

In [17]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
#⚖️ Insight: Lasso (Least Absolute Shrinkage and Selection Operator) adds L1 regularization to shrink coefficients and eliminate irrelevant features, promoting sparsity.

In [23]:
# Train Lasso Regression
lasso = Lasso(alpha=1.0)
lasso.fit(X_train_scaled, y_train)

In [25]:
# Predict
y_pred = lasso.predict(X_test_scaled)

In [None]:
#📈 Insight: Model performance is evaluated using metrics such as R² score, MSE, and possibly visualizations.

In [29]:
# Evaluate
print("Lasso Regression Results:")
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

Lasso Regression Results:
Mean Squared Error: 4908382747.104246
R² Score: 0.6254312334082688


In [31]:
# Coefficients
print("\nModel Coefficients:", lasso.coef_)
print("Model Intercept:", lasso.intercept_)


Model Coefficients: [-53797.75028931 -54386.67369765  13889.19876917 -13073.22587596
  43045.41252989 -43396.11340072  18377.55734347  75162.82671862
 -18514.69246299   2117.64610678  -1618.42318951   1140.96135222]
Model Intercept: 207194.69373788772


In [33]:
# Optional: Check which features were eliminated
features = X.columns
non_zero_features = features[lasso.coef_ != 0]
print("\n🎯 Features Lasso Kept:", list(non_zero_features))


🎯 Features Lasso Kept: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'ocean_proximity_INLAND', 'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY', 'ocean_proximity_NEAR OCEAN']
