In [47]:
# ===============================
# 1️⃣ Imports
# ===============================
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, Lasso
import xgboost as xgb
import lightgbm as lgb
from scipy.stats import skew

# ===============================
# 2️⃣ Load Data
# ===============================
train = pd.read_csv("/content/train[1].csv")
test = pd.read_csv("/content/test[1].csv")

train_ID = train['Id']
test_ID = test['Id']

train.drop("Id", axis=1, inplace=True)
test.drop("Id", axis=1, inplace=True)

# Log-transform target safely
train["SalePrice"] = np.log1p(train["SalePrice"])
y_train = train["SalePrice"]

# Combine train/test
all_data = pd.concat([train.drop("SalePrice", axis=1), test], axis=0, ignore_index=True)
print(f"Combined dataset shape: {all_data.shape}")

# ===============================
# 3️⃣ Handle Missing Values
# ===============================
cat_na_cols = ['PoolQC','MiscFeature','Alley','Fence','FireplaceQu','GarageType',
               'GarageFinish','GarageQual','GarageCond','BsmtQual','BsmtCond',
               'BsmtExposure','BsmtFinType1','BsmtFinType2','MasVnrType']
for col in cat_na_cols:
    if col in all_data.columns:
        all_data[col] = all_data[col].fillna('None')

num_na_cols = ['GarageYrBlt','GarageArea','GarageCars','BsmtFinSF1','BsmtFinSF2',
               'BsmtUnfSF','TotalBsmtSF','BsmtFullBath','BsmtHalfBath','MasVnrArea']
for col in num_na_cols:
    if col in all_data.columns:
        all_data[col] = all_data[col].fillna(0)

if 'LotFrontage' in all_data.columns:
    all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

# Remaining numeric -> median
num_feats = all_data.select_dtypes(exclude="object").columns
for col in num_feats:
    all_data[col] = all_data[col].fillna(all_data[col].median())

# Remaining categorical -> mode
cat_feats = all_data.select_dtypes(include="object").columns
for col in cat_feats:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

# ===============================
# 4️⃣ Feature Engineering
# ===============================
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
all_data['HouseAge'] = all_data['YrSold'] - all_data['YearBuilt']
all_data['RemodAge'] = all_data['YrSold'] - all_data['YearRemodAdd']
all_data['GarageAge'] = all_data['YrSold'] - all_data['GarageYrBlt']
all_data['OverallScore'] = all_data['OverallQual'] * all_data['OverallCond']
all_data['TotalBath'] = (all_data['FullBath'] + 0.5*all_data['HalfBath'] +
                         all_data['BsmtFullBath'] + 0.5*all_data['BsmtHalfBath'])
all_data['TotalPorchSF'] = (all_data['OpenPorchSF'] + all_data['3SsnPorch'] +
                            all_data['EnclosedPorch'] + all_data['ScreenPorch'] + all_data['WoodDeckSF'])
all_data['HasPool'] = (all_data['PoolArea'] > 0).astype(int)
all_data['HasGarage'] = (all_data['GarageArea'] > 0).astype(int)
all_data['HasBsmt'] = (all_data['TotalBsmtSF'] > 0).astype(int)
all_data['HasFireplace'] = (all_data['Fireplaces'] > 0).astype(int)

# ===============================
# 5️⃣ Encode Categorical Variables
# ===============================
ordinal_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
                'HeatingQC', 'KitchenQual', 'FireplaceQu',
                'GarageQual', 'GarageCond']
qual_map = {'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
for col in ordinal_cols:
    if col in all_data.columns:
        all_data[col] = all_data[col].map(qual_map)

# One-hot encode remaining categoricals
all_data = pd.get_dummies(all_data, drop_first=True)

# ===============================
# 6️⃣ Clip extreme outliers
# ===============================
numeric_feats = all_data.select_dtypes(include=[np.number]).columns
for col in numeric_feats:
    upper = all_data[col].quantile(0.999)
    lower = all_data[col].quantile(0.001)
    all_data[col] = all_data[col].clip(lower, upper)

# ===============================
# 7️⃣ Safe log1p for positive features
# ===============================
for feat in numeric_feats:
    if (all_data[feat] > 0).all():
        all_data[feat] = np.log1p(all_data[feat])

# Replace any remaining inf/-inf with 0
all_data.replace([np.inf, -np.inf], 0, inplace=True)

# ===============================
# 8️⃣ Split Back Train/Test
# ===============================
X_train = all_data[:len(y_train)]
X_test = all_data[len(y_train):]

# ===============================
# 9️⃣ Ensure no NaNs
# ===============================
imputer = SimpleImputer(strategy='median')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# ===============================
# 🔟 Stacking Helper
# ===============================
def stacking(models, meta_model, X_train, y_train, X_test, n_folds=5):
    folds = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    S_train = np.zeros((X_train.shape[0], len(models)))
    S_test = np.zeros((X_test.shape[0], len(models)))

    for i, model in enumerate(models):
        S_test_i = np.zeros((X_test.shape[0], n_folds))
        for j, (train_idx, valid_idx) in enumerate(folds.split(X_train)):
            X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
            y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

            instance = clone(model)
            instance.fit(X_tr, y_tr)

            S_train[valid_idx, i] = instance.predict(X_val)
            S_test_i[:, j] = instance.predict(X_test)

        S_test[:, i] = S_test_i.mean(axis=1)

    meta_model.fit(S_train, y_train)
    y_pred = meta_model.predict(S_test)
    return y_pred

# ===============================
# 1️⃣1️⃣ Define Models (Tuned)
# ===============================
ridge = Ridge(alpha=50, solver='auto', random_state=42)
lasso = Lasso(alpha=0.0005, max_iter=5000, random_state=42)
xgb_model = xgb.XGBRegressor(
    n_estimators=1000, learning_rate=0.05, max_depth=4,
    subsample=0.8, colsample_bytree=0.8, objective='reg:squarederror',
    random_state=42, n_jobs=-1
)
lgb_model = lgb.LGBMRegressor(
    n_estimators=1000, learning_rate=0.05, num_leaves=50,
    colsample_bytree=0.8, subsample=0.8, reg_alpha=0.1, reg_lambda=0.1,
    random_state=42
)
meta_ridge = Ridge(alpha=10, random_state=42)
models = [ridge, lasso, xgb_model, lgb_model]

# ===============================
# 1️⃣2️⃣ Run Stacking & Predict
# ===============================
stacked_preds = stacking(models, meta_ridge, X_train, y_train, X_test, n_folds=5)

# ===============================
# 1️⃣3️⃣ Create Submission
# ===============================
submission = pd.DataFrame({
    'Id': test_ID,
    'SalePrice': np.expm1(stacked_preds)
})
submission.to_csv("submission.csv", index=False)
print("✅ Submission file created: submission.csv")


Combined dataset shape: (2919, 79)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001745 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3963
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 164
[LightGBM] [Info] Start training from score 12.016898
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001322 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3947
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 160
[LightGBM] [Info] Start training from score 12.022759
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001198 seconds.
You can set `forc