<a href="https://colab.research.google.com/github/Venura-Shiromal/Ai-session-term1/blob/main/Ai_MachineLearning/Regression_Example_Home_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Kaggle data-source bootstrap cell.
# Run it once so the housing-prices dataset is available to the notebook;
# afterwards the cell may be deleted.
# This environment is not Kaggle's own Python image, so libraries the
# notebook relies on may be missing here.
import kagglehub

yasserh_housing_prices_dataset_path = kagglehub.dataset_download(
    'yasserh/housing-prices-dataset'
)

print('Data source import complete.')


# Setup

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Read the dataset
pd.set_option('display.precision', 2)

# Resolve the folder that holds Housing.csv.
# Prefer the directory returned by kagglehub.dataset_download() in the
# data-import cell, because the hard-coded Kaggle mount point below only
# exists when the notebook runs on Kaggle itself (not on Colab or locally).
# If the import cell was deleted or never run, fall back to that mount point
# so the notebook keeps working on Kaggle.
from pathlib import Path

data_dir = Path(
    globals().get(
        'yasserh_housing_prices_dataset_path',
        '/kaggle/input/housing-prices-dataset',
    )
)
df = pd.read_csv(data_dir / 'Housing.csv')

# Data

In [None]:
# Keep only the target ('price') and the six predictor columns used below
columns = [
    'price',
    'area',
    'bedrooms',
    'bathrooms',
    'stories',
    'basement',
    'guestroom',
]
df = df.loc[:, columns]

In [None]:
# Preview the first five rows of the selected columns
df.head(n=5)

In [None]:
# Count missing entries per column (isna is the canonical name for isnull)
df.isna().sum()

In [None]:
# Draw one scatter panel per feature on a 2x3 grid; price is on the x-axis
# and the feature value on the y-axis of every panel.
features = ['area', 'bedrooms', 'bathrooms', 'stories', 'basement', 'guestroom']
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
fig.suptitle('Home Price vs Features', fontsize=16)

# Flatten the 2-D grid of axes so panels and features can be paired directly
axes = axes.ravel()
for panel, feature in zip(axes, features):
    label = feature.capitalize()
    panel.scatter(df['price'], df[feature], alpha=0.5)
    panel.set_xlabel('Home Price ($)')
    panel.set_ylabel(label)
    panel.set_title(f'Price vs {label}')

    # Price ticks: whole amounts with thousands separators and a '$' prefix
    panel.xaxis.set_major_formatter(
        plt.FuncFormatter(lambda x, p: f'${x:,.0f}')
    )

plt.tight_layout()
plt.show()

# Preprocessing

In [None]:
# Convert the yes/no columns to 1/0 so the regression can consume them.
#
# Series.map is used per column instead of DataFrame.map because
# DataFrame.map only exists in pandas >= 2.1 (older releases call it
# applymap), whereas Series.map is available in every release.
#
# Columns that already contain only 0/1 are skipped, so re-running this cell
# is harmless instead of raising KeyError on the already-encoded values.
# Any value other than 'yes'/'no' in a not-yet-encoded column still raises
# KeyError, exactly as before.
categorial_cols = ['basement', 'guestroom']
yes_no_codes = {'yes': 1, 'no': 0}
for col in categorial_cols:
    if df[col].isin(list(yes_no_codes.values())).all():
        continue  # already encoded by a previous run of this cell
    df[col] = df[col].map(lambda x: yes_no_codes[x])

In [None]:
# Preview the first five rows again, now that preprocessing has run
df.head(n=5)

# Feature Extraction

In [None]:
# Target is the 'price' column; every remaining column is a feature
y = df['price']
X = df.drop(columns='price')

print("Features shape:", X.shape)
print("Target shape:", y.shape)

# Train/Test Split

In [None]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=45,
)

# Report the shape of every partition
print("Training set shapes:")
for label, part in (("X_train:", X_train), ("y_train:", y_train)):
    print(label, part.shape)
print("\nTesting set shapes:")
for label, part in (("X_test:", X_test), ("y_test:", y_test)):
    print(label, part.shape)

# Model Training

## $$ \hat{y}(w, x) = w_0 + w_1 x_1 + ... + w_p x_p $$

LinearRegression fits a linear model with coefficients $ w = (w_1, ..., w_p) $ (weights) that minimize the residual sum of squares (the loss function) between the observed targets (labels) in the dataset and the targets predicted by the linear approximation (the model's predictions).

It uses Ordinary Least Squares as the optimization method.

[Ordinary Least Squares](https://scikit-learn.org/stable/modules/linear_model.html#ordinary-least-squares)

In [None]:
# Instantiate a linear regression estimator and fit it on the training split
model = LinearRegression()
model.fit(X=X_train, y=y_train)

# Evaluation

In [None]:
# Predict prices for the held-out rows
y_pred = model.predict(X_test)

# Mean absolute error: the average absolute gap between actual and
# predicted price, expressed in the same unit as the target
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Absolute Error: ${mae:,.2f}")

In [None]:
# Friendlier aliases for the plotting cells that follow
actual_prices, predicted_prices = y_test, y_pred

# Element-wise prediction error: positive where the model over-predicts,
# negative where it under-predicts
difference = predicted_prices - actual_prices

In [None]:
# Scatter predicted price against actual price for the test set
plt.figure(figsize=(10, 6))
plt.scatter(actual_prices, predicted_prices, alpha=0.5)

# Dashed red diagonal marks perfect predictions (predicted == actual)
price_range = [actual_prices.min(), actual_prices.max()]
plt.plot(price_range, price_range, 'r--', lw=2)

plt.xlabel('Actual Price ($)')
plt.ylabel('Predicted Price ($)')
plt.title('Actual vs Predicted House Prices')

# Show ticks on both axes as whole amounts with thousands separators and a '$' prefix
current_axes = plt.gca()
for axis in (current_axes.xaxis, current_axes.yaxis):
    axis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x:,.0f}'))

plt.tight_layout()
plt.show()

In [None]:
# Scatter the prediction error against the actual price for the test set
plt.figure(figsize=(10, 6))
plt.scatter(actual_prices, difference, alpha=0.5)

# Dashed red horizontal line at zero error, spanning the actual-price range
lowest_price, highest_price = actual_prices.min(), actual_prices.max()
plt.plot([lowest_price, highest_price], [0, 0], 'r--', lw=2)

plt.xlabel('Actual Price ($)')
plt.ylabel('Amount Difference ($)')
plt.title('Actual Price vs Predicted Difference')

# Show ticks on both axes as whole amounts with thousands separators and a '$' prefix
current_axes = plt.gca()
for axis in (current_axes.xaxis, current_axes.yaxis):
    axis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x:,.0f}'))

plt.tight_layout()
plt.show()