In [89]:
# Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import confusion_matrix as cm
import sklearn.metrics as skm
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor as gbr
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from xgboost import XGBRegressor
import xgboost as xgb
import xgboost as xgb
import lightgbm as lgb

In [52]:
pd.set_option('display.max_columns', None)

### Load Data

In [77]:
train_df = pd.read_csv("Data/train.csv")
test_df = pd.read_csv("Data/test.csv")

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (1460, 81)
Test shape: (1459, 80)


### Combine Data

In [78]:
train_ID = train_df['Id']
test_ID = test_df['Id']

y_train = train_df['SalePrice']

# drop ID and target
train_df.drop(['Id','SalePrice'], axis=1, inplace=True)
test_df.drop(['Id'], axis=1, inplace=True)

all_data = pd.concat([train_df, test_df]).reset_index(drop=True)
print("Combined data shape:", all_data.shape)


Combined data shape: (2919, 79)


### Data Pre-processing

##### 1. Missing Values Handling

In [79]:
# Fill categorical missing with 'None'
for col in ['PoolQC','MiscFeature','Alley','Fence','FireplaceQu',
            'GarageType','GarageFinish','GarageQual','GarageCond',
            'BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','MasVnrType']:
    all_data[col] = all_data[col].fillna('None')

# Fill numerical missing with 0
for col in ['GarageYrBlt','GarageArea','GarageCars',
            'BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF',
            'BsmtFullBath','BsmtHalfBath','MasVnrArea']:
    all_data[col] = all_data[col].fillna(0)

# Fill mode
all_data['Electrical'] = all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])

# LotFrontage → fill by median of neighborhood
all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))


##### 2. Feature Engineering

In [80]:
# Total square footage
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

# Total bathrooms
all_data['TotalBath'] = (all_data['FullBath'] + 0.5*all_data['HalfBath'] +
                         all_data['BsmtFullBath'] + 0.5*all_data['BsmtHalfBath'])

# Total porch area
all_data['TotalPorchSF'] = (all_data['OpenPorchSF'] + all_data['EnclosedPorch'] +
                            all_data['3SsnPorch'] + all_data['ScreenPorch'])

# Convert some categorical ordinals to numbers
quality_map = {'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
for col in ['ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC','KitchenQual',
            'FireplaceQu','GarageQual','GarageCond']:
    all_data[col] = all_data[col].map(quality_map).fillna(0)


##### 3. Handling Skewness

In [81]:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
skewed_feats = all_data[numeric_feats].apply(lambda x: x.dropna().skew()).sort_values(ascending=False)
skewed = skewed_feats[skewed_feats > 0.75].index

all_data[skewed] = np.log1p(all_data[skewed])

# Log-transform target variable too
y_train = np.log1p(y_train)

##### 4. Encoding Categoricals

In [82]:
all_data = pd.get_dummies(all_data)

print("Final all_data shape:", all_data.shape)

Final all_data shape: (2919, 268)


### Model Creation and Evaluation

In [83]:
# Split Data into Train and Test

X_train = all_data[:train_df.shape[0]]
X_test_final = all_data[train_df.shape[0]:]

print("X_train:", X_train.shape, "X_test:", X_test_final.shape)

X_train: (1460, 268) X_test: (1459, 268)


In [90]:
# Define models

models = {
    "Lasso": Lasso(alpha=0.0005, random_state=42, max_iter=10000),
    "Ridge": Ridge(alpha=10),
    "ElasticNet": ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=42, max_iter=10000),
    "RandomForest": rfr(n_estimators=300, random_state=42),
    "GradientBoosting": gbr(n_estimators=3000, learning_rate=0.05,
                                                 max_depth=4, max_features='sqrt', 
                                                 min_samples_leaf=15, min_samples_split=10,
                                                 random_state=42),
    "XGBoost": xgb.XGBRegressor(n_estimators=3000, learning_rate=0.05,
                                max_depth=4, subsample=0.7,
                                colsample_bytree=0.7, random_state=42),
    "LightGBM": lgb.LGBMRegressor(objective='regression', num_leaves=5,
                                  learning_rate=0.05, n_estimators=3000,
                                  max_bin=200, bagging_fraction=0.8,
                                  bagging_freq=5, feature_fraction=0.8,
                                  random_state=42)
}

In [91]:
# Cross validation and evaluation

kf = KFold(n_splits=5, shuffle=True, random_state=42)

def rmse_cv(model, X, y):
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kf))
    return rmse.mean()

results = {}
for name, model in models.items():
    score = rmse_cv(model, X_train, y_train)
    results[name] = score
    print(f"{name}: {score:.5f}")

Lasso: 0.12598
Ridge: 0.13066
ElasticNet: 0.12599
RandomForest: 0.14183
GradientBoosting: 0.12993
XGBoost: 0.12503
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000503 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3295
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 171
[LightGBM] [Info] Start training from score 12.030658
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000836 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3293
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 175
[LightGBM] [Info] Start training from score 12.016898
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead 

In [92]:
# Picking the best model based on RMSE

best_model_name = min(results, key=results.get)
best_model = models[best_model_name]

print(f"\nBest Model: {best_model_name} with RMSE = {results[best_model_name]:.5f}")



Best Model: XGBoost with RMSE = 0.12503


In [None]:
# Train best model on full data
best_model.fit(X_train, y_train)
preds = best_model.predict(X_test_final)
final_preds = np.expm1(preds)

submission = pd.DataFrame({"Id": test_ID, "SalePrice": final_preds})
submission.to_csv("submission.csv", index=False)

print("✅ submission.csv saved with best model predictions!")

✅ submission.csv saved with best model predictions!


### Ensemble Approach

In [94]:
# ===============================
# 1. Train all models individually
# ===============================
for name, model in models.items():
    score = rmse_cv(model, X_train, y_train)
    results[name] = score
    print(f"{name}: {score:.5f}")

# Sort models by CV score
sorted_results = dict(sorted(results.items(), key=lambda x: x[1]))
print("\nModel Ranking (by RMSE):")
for k, v in sorted_results.items():
    print(f"{k}: {v:.5f}")

# ===============================
# 2. Train Top 3 Models Fully
# ===============================
top3_names = list(sorted_results.keys())[:3]
print(f"\nTop 3 Models: {top3_names}")

trained_models = {}
for name in top3_names:
    model = models[name]
    model.fit(X_train, y_train)
    trained_models[name] = model

# ===============================
# 3. Make Predictions
# ===============================
predictions = []
for name, model in trained_models.items():
    preds = model.predict(X_test_final)
    preds = np.expm1(preds)  # inverse log-transform
    predictions.append(preds)

# ===============================
# 4. Blend Predictions
# ===============================
# Simple average ensemble
final_preds = (predictions[0] + predictions[1] + predictions[2]) / 3

# Weighted average (give higher weight to best model)
weights = [0.5, 0.3, 0.2]   # adjust if needed
weighted_preds = (weights[0]*predictions[0] +
                  weights[1]*predictions[1] +
                  weights[2]*predictions[2])

# ===============================
# 5. Save Submissions
# ===============================
submission_avg = pd.DataFrame({"Id": test_ID, "SalePrice": final_preds})
submission_avg.to_csv("submission_avg.csv", index=False)

submission_weighted = pd.DataFrame({"Id": test_ID, "SalePrice": weighted_preds})
submission_weighted.to_csv("submission_weighted.csv", index=False)

print("✅ Saved: submission_avg.csv and submission_weighted.csv")


Lasso: 0.12598
Ridge: 0.13066
ElasticNet: 0.12599
RandomForest: 0.14183
GradientBoosting: 0.12993
XGBoost: 0.12503
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000391 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3295
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 171
[LightGBM] [Info] Start training from score 12.030658
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000521 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3293
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 175
[LightGBM] [Info] Start training from score 12.016898
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead 