<a href="https://colab.research.google.com/github/Ruma13/Machine-Learning-Portfolio/blob/main/House_price_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Authenticates this session with Kaggle before the dataset download cell runs.
# NOTE(review): login() presumably prompts for credentials interactively —
# confirm before running this notebook non-interactively.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Download the House Prices dataset and keep the value kagglehub returns
# (presumably the local directory holding the files — confirm).
# NOTE(review): the cells below read train.csv/test.csv from a hard-coded
# "/kaggle/input/..." path and never use this variable; verify that both
# locations agree in the environment where the notebook is run.
rumaaktertushi_house_prices_advanced_regression_techniques_path = kagglehub.dataset_download('rumaaktertushi/house-prices-advanced-regression-techniques')

# Printed only after the download call above has returned.
print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Load & quick EDA**

(Exploratory Data Analysis)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the train and test CSVs from the Kaggle input directory
train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

# Dataset dimensions
print("Train shape:", train.shape)
print("Test shape :", test.shape)

# First rows of the training frame
print("\n First 5 rows:")
first_rows = train.head()
display(first_rows)

# Column dtypes and non-null counts
print("\n Info (data types + missing values):")
train.info()

# Descriptive statistics
print("\n Summary statistics:")
summary_stats = train.describe()
display(summary_stats)

# Correlation of every numeric column with SalePrice, strongest first
numeric_cols = train.select_dtypes(include=[np.number])
corr_matrix = numeric_cols.corr()
corr = corr_matrix['SalePrice'].sort_values(ascending=False)

print("\n🔹 Top 10 correlated features with SalePrice:")
print(corr.head(10))

# Heatmap of pairwise correlations among the ten top-ranked columns
# (SalePrice is a numeric column too, so it is part of this ranking)
top_corr_features = corr.index[:10]
top_corr_matrix = train[top_corr_features].corr()
plt.figure(figsize=(10, 6))
sns.heatmap(top_corr_matrix, annot=True, cmap="coolwarm")
plt.title("Top Correlated Features with SalePrice")
plt.show()


**Feature engineering & preprocessing plan (high level)**

Create TotalSF = TotalBsmtSF + 1stFlrSF + 2ndFlrSF.

Age feature: HouseAge = YrSold - YearBuilt.

Fill missing values: numeric → median; categorical → "Missing" or mode.

Encode categoricals with OneHotEncoder (handle unknowns).

Scale numeric features with StandardScaler.

Build a ColumnTransformer and an end-to-end Pipeline.

**Build preprocessing pipeline**

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# select feature subsets (example subset to keep this demo short)
num_features = ['LotArea','OverallQual','OverallCond','YearBuilt','TotalBsmtSF','GrLivArea']
cat_features = ['Neighborhood','ExterQual','KitchenQual','MSZoning']


def _filled_or_zero(df, column):
    """Return ``df[column]`` with NaNs replaced by 0, or scalar 0 if the column is absent.

    The previous ``df.get(column, 0).fillna(0)`` raised AttributeError when the
    column was missing, because the integer default has no ``fillna`` method.
    """
    if column in df.columns:
        return df[column].fillna(0)
    return 0


def _make_one_hot_encoder():
    """Build a dense-output OneHotEncoder that ignores unseen categories.

    scikit-learn 1.2 renamed the ``sparse`` keyword to ``sparse_output`` and
    1.4 removed the old name, so try the new keyword first and fall back to
    the old one for older installations.
    """
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        # scikit-learn < 1.2 only accepts the legacy keyword
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


# create derived columns in the original frames (mutates train and test in place)
for df in [train, test]:
    # total square footage; missing areas count as 0
    df['TotalSF'] = (
        df['TotalBsmtSF'].fillna(0)
        + df['1stFlrSF'].fillna(0)
        + _filled_or_zero(df, '2ndFlrSF')
    )
    # age of the house at the time of sale
    df['HouseAge'] = df['YrSold'] - df['YearBuilt']

# update numeric list with the derived columns
num_features += ['TotalSF','HouseAge']

# numeric pipeline: impute median, then scale
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# categorical pipeline: impute the constant 'Missing', then one-hot encode
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('ohe', _make_one_hot_encoder())
])

# route each column group through its own pipeline
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# NOTE(review): this cell repeats the preprocessing cell directly above it;
# consider removing one of the two.

# select feature subsets (example subset to keep this demo short)
num_features = ['LotArea','OverallQual','OverallCond','YearBuilt','TotalBsmtSF','GrLivArea']
cat_features = ['Neighborhood','ExterQual','KitchenQual','MSZoning']


def _filled_or_zero(df, column):
    """Return ``df[column]`` with NaNs replaced by 0, or scalar 0 if the column is absent.

    The previous ``df.get(column, 0).fillna(0)`` raised AttributeError when the
    column was missing, because the integer default has no ``fillna`` method.
    """
    if column in df.columns:
        return df[column].fillna(0)
    return 0


def _make_one_hot_encoder():
    """Build a dense-output OneHotEncoder that ignores unseen categories.

    scikit-learn 1.2 renamed the ``sparse`` keyword to ``sparse_output`` and
    1.4 removed the old name, so try the new keyword first and fall back to
    the old one for older installations.
    """
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        # scikit-learn < 1.2 only accepts the legacy keyword
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


# create derived columns in the original frames (mutates train and test in place)
for df in [train, test]:
    # total square footage; missing areas count as 0
    df['TotalSF'] = (
        df['TotalBsmtSF'].fillna(0)
        + df['1stFlrSF'].fillna(0)
        + _filled_or_zero(df, '2ndFlrSF')
    )
    # age of the house at the time of sale
    df['HouseAge'] = df['YrSold'] - df['YearBuilt']

# update numeric list with the derived columns
num_features += ['TotalSF','HouseAge']

# numeric pipeline: impute median, then scale
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# categorical pipeline: impute the constant 'Missing', then one-hot encode
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('ohe', _make_one_hot_encoder())
])

# route each column group through its own pipeline
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])


**Train/validation split and baseline models**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
import pandas as pd
import math

# Predictors (numeric + categorical subsets) and the log1p-transformed target
X = train[num_features + cat_features]
y = np.log1p(train['SalePrice'])

# Hold out 20% of the rows as a validation set (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate baseline regressors, keyed by the label shown in the results table
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42, n_jobs=-1),
    'XGBoost': XGBRegressor(random_state=42, n_jobs=-1, verbosity=0),
}

# One row of metrics per model
results = []

for name, model in models.items():
    # Preprocessing and regressor are fitted together on the training split
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model),
    ])
    preds = pipeline.fit(X_train, y_train).predict(X_val)

    # Both metrics are computed on the log-scale target
    rmse = math.sqrt(mean_squared_error(y_val, preds))
    r2 = r2_score(y_val, preds)

    row = {'Model': name, 'RMSE': round(rmse, 3), 'R² Score': round(r2, 2)}
    results.append(row)

# Tabulate the collected metrics for display
comparison_table = pd.DataFrame(results)
print(comparison_table)


**Try models: RandomForest (with hyperparameter search) and XGBoost**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Random-forest pipeline: shared preprocessing followed by the regressor
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('rf', RandomForestRegressor(random_state=42, n_jobs=-1)),
])

# Small search space for the demo; widen it for better results.
# Keys use the "<step>__<param>" form so they reach the 'rf' pipeline step.
param_dist = dict(
    rf__n_estimators=[100, 250],
    rf__max_depth=[10, 20, None],
    rf__min_samples_split=[2, 5, 10],
)

# Randomized search: 6 sampled settings, 3-fold CV, scored by negated RMSE
rscv = RandomizedSearchCV(
    rf_pipeline,
    param_distributions=param_dist,
    n_iter=6,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=2,
    random_state=42,
)
rscv.fit(X_train, y_train)
print("Best params:", rscv.best_params_)
best_rf = rscv.best_estimator_

# Score the selected pipeline on the held-out validation split
preds_rf = best_rf.predict(X_val)
print("RF RMSE:", math.sqrt(mean_squared_error(y_val, preds_rf)))


**XGBoost**

In [None]:
import xgboost as xgb
from sklearn.pipeline import Pipeline

# XGBoost pipeline: shared preprocessing followed by a squared-error regressor
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgb', xgb.XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1)),
])

# Fit on the training split, then report RMSE on the validation split
preds_xgb = xgb_pipeline.fit(X_train, y_train).predict(X_val)
print("XGBoost RMSE:", math.sqrt(mean_squared_error(y_val, preds_xgb)))


**Save the best pipeline/model with joblib**

In [None]:
import joblib
# Suppose best model is best_rf (sklearn pipeline already includes preprocessor + model)
# Writes the fitted pipeline to "house_price_pipeline.joblib", a relative path,
# so the file lands in the current working directory.
joblib.dump(best_rf, "house_price_pipeline.joblib")
# You can also save XGBoost pipeline or scaler separately if desired


**Load & predict example**

In [None]:
# Round-trip check: reload the saved pipeline and score one validation row.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
loaded = joblib.load("house_price_pipeline.joblib")
sample = X_val.head(1)                     # first validation row, kept as a DataFrame
pred_log = loaded.predict(sample)          # model was trained on log1p(SalePrice)
pred_price = np.expm1(pred_log)            # invert log1p to get the price scale
print("Predicted price:", pred_price[0])
