<a href="https://colab.research.google.com/github/OlyMahmudMugdho/supervised-learning-notes/blob/main/ridge_regression_note.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the Kaggle data source for this notebook through kagglehub.
# Run this cell first when working outside Kaggle. This environment is not
# Kaggle's own Python image, so libraries the notebook relies on may be
# missing here.
import kagglehub

# Keep whatever dataset_download returns under the *_path name for later use.
olymahmud_housing_prices_dataset_path = kagglehub.dataset_download(
    'olymahmud/housing-prices-dataset'
)

print('Data source import complete.')


In [None]:
# 1. Imports
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# 2. Load Kaggle dataset
# On Kaggle the dataset is mounted under /kaggle/input. Elsewhere (e.g. Colab)
# that directory does not exist, so fall back to the directory returned by
# kagglehub.dataset_download in the first cell.
# NOTE(review): assumes the CSV sits at the top level of the kagglehub
# download directory under the same file name — confirm after first download.
DATA_FILENAME = "housing_prices_dataset.csv"
kaggle_csv = Path("/kaggle/input/housing-prices-dataset") / DATA_FILENAME
if kaggle_csv.exists():
    csv_path = kaggle_csv
else:
    csv_path = Path(olymahmud_housing_prices_dataset_path) / DATA_FILENAME

df = pd.read_csv(csv_path)

In [None]:

# 3. Quick data check
print(df.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df.info()
print(df.describe())

# Treat infinite prices as missing, then drop rows without a usable target:
# the regression fit later in the notebook cannot accept NaN in y.
df['price'] = df['price'].replace([np.inf, -np.inf], np.nan)
df = df.dropna(subset=['price'])

In [None]:
# 4. Convert categorical variables to numeric for correlation heatmap
# Every object-dtype column is replaced by its integer category codes; the
# remaining columns are carried over unchanged into a separate frame.
cat_cols = df.select_dtypes(include='object').columns
df_numeric = df.assign(
    **{name: df[name].astype('category').cat.codes for name in cat_cols}
)

In [None]:

# 5. Visualization: Correlation Heatmap
# Pairwise correlations of the encoded frame, drawn on an explicit Axes.
corr_matrix = df_numeric.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap (Numeric + Encoded Categorical)")
plt.show()

In [None]:
# 6. Distribution of Target
# Histogram of the price column (30 bins) with a KDE overlay.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['price'], bins=30, kde=True, ax=ax)
ax.set_title("Distribution of House Prices")
plt.show()

In [None]:
# 7. Feature preprocessing
# Convert categorical variables to dummies for modeling
df_model = pd.get_dummies(df, drop_first=True)

# Split features and target
X = df_model.drop("price", axis=1)
y = df_model["price"]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# 8. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

In [None]:
# 9. Train Ridge Regression
# alpha is the regularization strength; fit() returns the estimator itself,
# so construction and training are chained.
ridge = Ridge(alpha=10).fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = ridge.predict(X_test)

In [None]:
# 10. Evaluation
# Error and goodness-of-fit of the Ridge predictions on the test split.
mse = mean_squared_error(y_test, y_pred)
rmse, r2 = np.sqrt(mse), r2_score(y_test, y_pred)

print("Ridge Regression Results:")
for label, value in (("MSE:", mse), ("RMSE:", rmse), ("R² Score:", r2)):
    print(label, value)

In [None]:
    # 11. Visualization: Predicted vs Actual
    plt.figure(figsize=(8,6))
    plt.scatter(y_test, y_pred, alpha=0.7, color="blue")
    plt.xlabel("Actual Prices")
    plt.ylabel("Predicted Prices")
    plt.title("Ridge Regression: Actual vs Predicted Prices")
    plt.plot([y.min(), y.max()], [y.min(), y.max()], color="red", linestyle="--")
    plt.show()

In [None]:
# 12. Effect of different alpha values
# alpha is a hyperparameter, so candidates are compared with 5-fold
# cross-validation on the training data only. Ranking them by their score on
# the test set would turn the test set into a tuning set and make the final
# test metrics optimistic.
alphas = [0.01, 0.1, 1, 10, 50, 100]
r2_scores = []

for a in alphas:
    ridge_cv = Ridge(alpha=a)
    # cross_val_score fits clones of the estimator, one per fold
    fold_scores = cross_val_score(ridge_cv, X_train, y_train, cv=5, scoring="r2")
    r2_scores.append(fold_scores.mean())

plt.figure(figsize=(8,5))
plt.plot(alphas, r2_scores, marker='o')
plt.xscale('log')  # alphas span several orders of magnitude
plt.xlabel("Alpha")
plt.ylabel("Mean CV R² Score")
plt.title("Effect of Alpha on Ridge Regression Performance")
plt.show()

In [None]:
# 13. Coefficients
# Pair each feature column with its fitted Ridge coefficient.
coef_df = pd.DataFrame({"Feature": X.columns, "Ridge_Coefficient": ridge.coef_})

# Row labels ordered from largest to smallest coefficient magnitude
magnitude_order = (
    coef_df["Ridge_Coefficient"].abs().sort_values(ascending=False).index
)
top_coefficients = coef_df.loc[magnitude_order].head(10)

print("\nTop 10 Features by Absolute Coefficient:")
print(top_coefficients)