



# LightGBM Regressor (Gradient Boosting Decision Trees)

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from lightgbm import LGBMRegressor, early_stopping, log_evaluation

## Load the data

In [3]:
# Locate the processed CSVs relative to the repo root; this relies on the
# kernel's working directory being the Notebooks/ folder.
REPO_ROOT = Path.cwd().parent
PROCESSED_DIR = REPO_ROOT / "Data" / "Processed"
TRAIN_PATH = PROCESSED_DIR / "train_ohe_knn_fixed.csv"
TEST_PATH = PROCESSED_DIR / "test_ohe_knn_fixed.csv"

train, test = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)

# Structural checks: frame sizes, then presence of the target and Id columns.
print("Train shape:", train.shape)  # expect (1460, 305)
print("Test shape :", test.shape)   # expect (1459, 304)
print("SalePrice in train?", "SalePrice" in train)
print("SalePrice in test? ", "SalePrice" in test)
print("Id in train?", "Id" in train)
print("Id in test? ", "Id" in test)


Train shape: (1460, 305)
Test shape : (1459, 304)
SalePrice in train? True
SalePrice in test?  False
Id in train? True
Id in test?  True


## Build X/y and REMOVE Id from features

In [4]:
# Target: log1p-transformed sale price, so the model is fitted on the log scale.
y = np.log1p(train["SalePrice"])

# Features: every training column except the target and the Id row identifier.
X = train.drop(columns=["SalePrice", "Id"])

# The test Ids are set aside (for the submission) and excluded from the features.
test_ids = test["Id"]
X_test = test.drop(columns=["Id"])

# Symmetric difference = columns present in only one of the two frames.
mismatched_cols = set(X.columns).symmetric_difference(X_test.columns)
print("Mismatch columns (should be 0):", len(mismatched_cols))

# Align the test columns to the training column order.
X_test = X_test[X.columns]

Mismatch columns (should be 0): 0


## Train/validation split

In [5]:
# Hold out 20% of the training rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Model

In [6]:
# Hyperparameters for the validation run. n_estimators is a high ceiling; the
# early-stopping callback below determines the best iteration.
# NOTE(review): LightGBM documents that row subsampling is only applied when
# `subsample_freq` > 0; none is set here, so `subsample=0.8` may be a no-op — confirm.
lgbm_params = dict(
    n_estimators=10000,
    learning_rate=0.01,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    force_row_wise=True,
    random_state=42,
)
model = LGBMRegressor(**lgbm_params)

fit_callbacks = [
    early_stopping(stopping_rounds=300),  # stop once validation has not improved for 300 rounds
    log_evaluation(period=0),  # change to 50 if you want logs
]

# Fit on the training split and monitor RMSE on the held-out validation split.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="rmse",
    callbacks=fit_callbacks,
)

[LightGBM] [Info] Total Bins 3394
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 193
[LightGBM] [Info] Start training from score 12.030658
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[2107]	valid_0's rmse: 0.136299	valid_0's l2: 0.0185773


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.01
,n_estimators,10000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


## Validate

In [7]:
# Predictions are on the log1p scale because the model was trained on log1p(SalePrice).
val_pred_log = model.predict(X_val)

# RMSE between log-scale targets and log-scale predictions.
val_mse_log = mean_squared_error(y_val, val_pred_log)
rmse_log = np.sqrt(val_mse_log)

print("Validation RMSE (log scale):", rmse_log)
print("Best iteration:", model.best_iteration_)

Validation RMSE (log scale): 0.13629851535671492
Best iteration: 2107


## Retrain on FULL training data using best_iteration

In [8]:
# Number of boosting rounds selected by early stopping on the validation split.
# Fall back to the configured n_estimators if no best iteration was recorded,
# so the final model is never built with zero/None trees.
best_iter = model.best_iteration_ or model.n_estimators

# Reuse the validated model's hyperparameters instead of re-typing them, so the
# final model cannot silently drift from the configuration that was validated.
# Only the number of trees is overridden.
final_params = {**model.get_params(), "n_estimators": best_iter}
final_model = LGBMRegressor(**final_params)

# Refit on ALL labelled rows (train + validation); no eval set, hence no early stopping.
final_model.fit(X, y)

[LightGBM] [Info] Total Bins 3638
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 202
[LightGBM] [Info] Start training from score 12.024057


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.01
,n_estimators,2107
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


## Predict Kaggle test + build submission

In [12]:
# ---------------------------------------
# FINAL SUBMISSION (CORRECT Id handling)
# ---------------------------------------

# 0) Predict on the Kaggle test features. The model was trained on
#    log1p(SalePrice), so expm1 maps predictions back to the price scale.
test_pred = np.expm1(final_model.predict(X_test))

# 1) Load RAW test ONLY to get correct Ids
# NOTE(review): this assumes the processed test file keeps the same row order
# as the raw test.csv — confirm against the preprocessing step.
raw_test = pd.read_csv(REPO_ROOT / "Data" / "Raw" / "test.csv")
ids = raw_test["Id"].astype("int32")

# 2) Sanity check: one prediction per Id, and no duplicated Ids
assert len(ids) == len(test_pred), (len(ids), len(test_pred))
assert ids.is_unique

# 3) Build submission
submission = pd.DataFrame({
    "Id": ids,
    "SalePrice": test_pred
})

# 4) Save
out_path = REPO_ROOT / "Data" / "Processed" / "submission_sa_lgbm_es.csv"
submission.to_csv(out_path, index=False)

print("Saved:", out_path)
print(submission.head())
print(submission.dtypes)
print("Id min/max:", submission["Id"].min(), submission["Id"].max())

Saved: /Users/sanaaarsman/dev/ironhack/projects/ironkaggle/Google-Brain---House-Prices-A.R.T/Data/Processed/submission_sa_lgbm_es.csv
     Id      SalePrice
0  1461  120226.224471
1  1462  155831.323388
2  1463  188111.590872
3  1464  190311.778452
4  1465  189553.245483
Id             int32
SalePrice    float64
dtype: object
Id min/max: 1461 2919
