<a href="https://colab.research.google.com/github/Olanle/Project-006-House-price-predictor-full-pipeline-with-deployed-notebook/blob/main/006.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Load the house price dataset into a DataFrame for inspection.

import pandas as pd
import numpy as np

# Path of the CSV as uploaded to the Colab runtime.
csv_path = "/content/Housing.csv"
df = pd.read_csv(csv_path)

In [4]:
# Structural overview of the loaded frame: shape, first rows, dtypes,
# numeric summary and per-column null counts.
frame_shape = df.shape
print("Rows, Columns:", frame_shape)

print("\n--- Head ---")
first_rows = df.head()
print(first_rows)

# DataFrame.info() prints its own report, so it is not wrapped in print().
print("\n--- Info ---")
df.info()

# Transposed so that each feature is a row and each statistic a column.
print("\n--- Numeric Describe ---")
numeric_summary = df.describe().T
print(numeric_summary)

# Count nulls per column, largest first, and show only columns that have any.
print("\n--- Missing values ---")
null_counts = df.isnull().sum()
missing = null_counts.sort_values(ascending=False)
print(missing[missing > 0])

Rows, Columns: (545, 13)

--- Head ---
      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  

--- Info ---
<class 'pandas.core.frame.DataFrame'>
RangeInd

In [5]:
# Summarise the target column and list the categorical features.
#
# The banner and the warning are both built from `target_col`, so the label
# always names the column that is actually inspected (the previous banner
# said "SalePrice" while the code looked at 'price').
target_col = 'price'

if target_col in df.columns:
    print(f"\n--- Target ({target_col}) stats ---")
    print(df[target_col].describe())

    # Quick skew check: a clearly positive value indicates a long right tail.
    print("Skew:", df[target_col].skew())
else:
    print(f"\nWARNING: '{target_col}' not found. Set the correct target column name.")

# Categorical columns are those stored with the object dtype.
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
print(f"\nCategorical columns ({len(cat_cols)}): {cat_cols}")

# For each categorical column: number of distinct values and up to ten examples.
print("\n")
for c in cat_cols:
    print(f" - {c}: {df[c].nunique()} uniques; sample -> {df[c].dropna().unique()[:10]}")


--- Target (SalePrice) stats ---
count    5.450000e+02
mean     4.766729e+06
std      1.870440e+06
min      1.750000e+06
25%      3.430000e+06
50%      4.340000e+06
75%      5.740000e+06
max      1.330000e+07
Name: price, dtype: float64
Skew: 1.2122388370279802

Categorical columns (7): ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']


 - mainroad: 2 uniques; sample -> ['yes' 'no']
 - guestroom: 2 uniques; sample -> ['no' 'yes']
 - basement: 2 uniques; sample -> ['no' 'yes']
 - hotwaterheating: 2 uniques; sample -> ['no' 'yes']
 - airconditioning: 2 uniques; sample -> ['yes' 'no']
 - prefarea: 2 uniques; sample -> ['yes' 'no']
 - furnishingstatus: 3 uniques; sample -> ['furnished' 'semi-furnished' 'unfurnished']


In [6]:
# Separate predictors from the target and hold out 20% of rows for testing.

from sklearn.model_selection import train_test_split

target = 'price'

# An 'Id' column carries no signal; drop it only when the dataset has one.
drop_cols = [col for col in ['Id'] if col in df.columns]

# Predictors are every column except the target and the dropped identifiers.
excluded_cols = [target, *drop_cols]
X = df.drop(columns=excluded_cols)
y = df[target]

# Fixed random_state keeps the 80/20 partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training data: {X_train.shape}")
print(f"Testing data:  {X_test.shape}")


Training data: (436, 12)
Testing data:  (109, 12)


In [7]:
# Preprocessing pipeline: impute + scale numeric columns, impute + one-hot
# encode categorical columns, and combine both in a ColumnTransformer.

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Separate feature types.
# 'number' matches every numeric dtype (int32, float32, ...), not only
# int64/float64. Columns matched by neither selector are dropped by the
# ColumnTransformer (its default remainder='drop'), so a too-narrow numeric
# selector would silently discard features.
num_features = X_train.select_dtypes(include='number').columns
cat_features = X_train.select_dtypes(include=['object']).columns

# Numeric pipeline: fill gaps with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps prediction from failing on categories
# that were not present when the encoder was fitted.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine both branches; each one only sees its own column subset.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])

print("Preprocessing pipeline created successfully!")
print(f"Numerical features: {len(num_features)}")
print(f"Categorical features: {len(cat_features)}")


Preprocessing pipeline created successfully!
Numerical features: 5
Categorical features: 7


In [8]:
# Model setup: one preprocessing + regressor pipeline per candidate model.

from sklearn.base import clone
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor


def build_model_pipeline(regressor):
    """Return a Pipeline that preprocesses the features and then fits `regressor`.

    Each pipeline receives its own unfitted copy of the module-level
    `preprocessor`. Sharing one instance would mean that fitting any single
    pipeline also changes the fitted transformer inside the others.
    """
    return Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('regressor', regressor)
    ])


# Linear Regression pipeline
linreg_pipe = build_model_pipeline(LinearRegression())

# Random Forest pipeline (seeded for reproducible trees)
rf_pipe = build_model_pipeline(RandomForestRegressor(random_state=42))

# XGBoost pipeline (squared-error objective, seeded)
xgb_pipe = build_model_pipeline(
    XGBRegressor(objective='reg:squarederror', random_state=42)
)

print("Model pipelines (Linear, Random Forest, XGBoost) created successfully!")


Model pipelines (Linear, Random Forest, XGBoost) created successfully!


In [12]:
# Hyperparameter tuning: grid-search the two tree-based pipelines, then fit
# the linear baseline (which has nothing to tune here).
from sklearn.model_selection import GridSearchCV

# Search spaces. The 'regressor__' prefix routes each parameter to the
# pipeline step named 'regressor'.
rf_params = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_split': [2, 5]
}

xgb_params = {
    'regressor__n_estimators': [100, 200],
    'regressor__learning_rate': [0.05, 0.1],
    'regressor__max_depth': [3, 5, 7]
}

# Settings common to both searches: 3-fold CV, R² scoring, all CPU cores.
search_options = {'cv': 3, 'scoring': 'r2', 'n_jobs': -1}

# Random Forest search
rf_grid = GridSearchCV(rf_pipe, rf_params, **search_options)
rf_grid.fit(X_train, y_train)

# XGBoost search
xgb_grid = GridSearchCV(xgb_pipe, xgb_params, **search_options)
xgb_grid.fit(X_train, y_train)

# Linear regression baseline, fitted directly on the training split.
linreg_pipe.fit(X_train, y_train)

print("Grid search completed!")
print(f"Best RF params: {rf_grid.best_params_}")
print(f"Best XGB params: {xgb_grid.best_params_}")


Grid search completed!
Best RF params: {'regressor__max_depth': 10, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 100}
Best XGB params: {'regressor__learning_rate': 0.05, 'regressor__max_depth': 3, 'regressor__n_estimators': 100}


In [10]:
# Model evaluation: score every fitted model on the held-out test split and
# tabulate R², MAE and RMSE side by side.
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np


def _regression_scores(y_true, y_pred):
    """Return R², MAE and RMSE for one set of predictions as a dict."""
    return {
        "R²": r2_score(y_true, y_pred),
        "MAE": mean_absolute_error(y_true, y_pred),
        # RMSE is the square root of the mean squared error.
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
    }


# Candidates: the linear baseline plus the refitted best estimator of each search.
models = {
    "Linear Regression": linreg_pipe,
    "Random Forest": rf_grid.best_estimator_,
    "XGBoost": xgb_grid.best_estimator_
}

results = {
    label: _regression_scores(y_test, estimator.predict(X_test))
    for label, estimator in models.items()
}

# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
print(results_df)

                         R²           MAE          RMSE
Linear Regression  0.652924  9.700434e+05  1.324507e+06
Random Forest      0.606173  1.029936e+06  1.410895e+06
XGBoost            0.645913  9.888418e+05  1.337819e+06


In [11]:
# Saving the best model
import joblib

# Pick the winner from the evaluation results instead of hard-coding one:
# the model with the highest R² in `results` is the one that gets saved.
# NOTE(review): `results` holds test-split scores, so this selects on the
# test set; consider selecting on cross-validation scores if an unbiased
# final estimate is needed.
best_name = max(results, key=lambda model_name: results[model_name]["R²"])
best_model = models[best_name]
print(f"Best model by R²: {best_name} ({results[best_name]['R²']:.4f})")

# Each entry in `models` is a full pipeline, so preprocessing and regressor
# are persisted together in a single artifact.
joblib.dump(best_model, "house_price_predictor.pkl")

print("Model saved successfully as 'house_price_predictor.pkl'")


Model saved successfully as 'house_price_predictor.pkl'
