In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingRegressor

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax


In [5]:
from google.colab import files
uploaded = files.upload()

Saving test.csv to test.csv
Saving train.csv to train.csv


In [3]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [6]:
#Load the uploaded CSVs
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Save IDs
train_ID = train['Id']
test_ID = test['Id']

# Drop ID
train.drop("Id", axis=1, inplace=True)
test.drop("Id", axis=1, inplace=True)

# Log-transform target
train["SalePrice"] = np.log1p(train["SalePrice"])
y = train["SalePrice"].values

# Merge for preprocessing
df = pd.concat([train.drop("SalePrice", axis=1), test], axis=0)


In [7]:
# Handle missing values
for col in df.columns:
    if df[col].dtype == "object":
        df[col] = df[col].fillna("None")
    else:
        df[col] = df[col].fillna(df[col].median())


In [8]:
# Label encode categorical features
cols = df.select_dtypes(include=["object"]).columns
for col in cols:
    lbl = LabelEncoder()
    df[col] = lbl.fit_transform(df[col].astype(str))


In [15]:
# Fix skewed numeric features
numeric_feats = df.dtypes[df.dtypes != "object"].index
skewed_feats = df[numeric_feats].apply(lambda x: x.skew()).sort_values(ascending=False)
skewness = skewed_feats[abs(skewed_feats) > 0.75]

# Filter out columns with only one unique value
skewness = skewness[skewness.index.isin(df.columns[df.nunique() > 1])]

for feat in skewness.index:
    df[feat] = np.log1p(df[feat])

In [16]:
# Split back to train and test
ntrain = train.shape[0]
X = df[:ntrain]
X_test = df[ntrain:]


In [17]:
#  Define models
ridge = make_pipeline(RobustScaler(), Ridge(alpha=12))
lgb_model = lgb.LGBMRegressor(objective='regression', num_leaves=5, learning_rate=0.05, n_estimators=720)
xgb_model = xgb.XGBRegressor(learning_rate=0.05, n_estimators=700, max_depth=3, objective='reg:squarederror')
cat_model = CatBoostRegressor(iterations=700, learning_rate=0.05, depth=3, verbose=0)

# Stacking
stacked_model = StackingRegressor(
    estimators=[
        ('ridge', ridge),
        ('xgb', xgb_model),
        ('lgb', lgb_model),
        ('cat', cat_model),
    ],
    final_estimator=Ridge(alpha=10)
)


In [18]:
# Cross-validation
def rmsle_cv(model):
    kf = KFold(5, shuffle=True, random_state=42)
    rmse = -cross_val_score(model, X, y, scoring="neg_root_mean_squared_error", cv=kf)
    return rmse.mean()

print("✅ RMSLE score (Stacked):", rmsle_cv(stacked_model))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000926 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3126
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 73
[LightGBM] [Info] Start training from score 12.030658
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000521 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2905
[LightGBM] [Info] Number of data points in the train set: 934, number of used features: 71
[LightGBM] [Info] Start training from score 12.028105
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000524 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough

In [19]:
# Train and Predict
stacked_model.fit(X, y)
stacked_preds = np.expm1(stacked_model.predict(X_test))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000847 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3387
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 74
[LightGBM] [Info] Start training from score 12.024057
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000695 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3105
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 72
[LightGBM] [Info] Start training from score 12.021409
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000673 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug

In [20]:
# Make submission file
submission = pd.DataFrame()
submission['Id'] = test_ID
submission['SalePrice'] = stacked_preds
submission.to_csv('submission.csv', index=False)
print("✅ submission.csv created!")


✅ submission.csv created!


In [21]:
# Download the submission file
from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>