In [None]:
# ============================================
# 🏡 HOUSE PRICES - ADVANCED REGRESSION MODEL
# Optimized for Kaggle High Score (0.12–0.13 RMSE)
# ============================================

# STEP 0: Install & Import Libraries
!pip install numpy pandas matplotlib seaborn scikit-learn xgboost lightgbm

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import files
import zipfile, io, warnings
warnings.filterwarnings('ignore')

# STEP 1: Upload ZIP
# `uploaded` is indexed by filename and yields the raw bytes of each uploaded file.
uploaded = files.upload()

# Fail with a clear message instead of an opaque IndexError when nothing was
# uploaded (presumably what happens when the picker is cancelled — verify).
if not uploaded:
    raise ValueError("No file was uploaded; please select the competition ZIP archive.")

# Only the first uploaded file is used; any additional uploads are ignored.
zip_filename = next(iter(uploaded))
with zipfile.ZipFile(io.BytesIO(uploaded[zip_filename]), 'r') as zip_ref:
    # Unpack the archive straight from memory into the working folder.
    zip_ref.extractall('house_data')

print(f"‚úÖ Extracted {zip_filename} to 'house_data/' folder")

# STEP 2: Load CSV Files
# Both tables are read from the folder the archive was extracted into.
train_path, test_path = 'house_data/train.csv', 'house_data/test.csv'
train, test = pd.read_csv(train_path), pd.read_csv(test_path)
print(f"Train shape: {train.shape}, Test shape: {test.shape}")

# STEP 3: Target Transformation (Log Transform)
# Models are trained on log(1 + SalePrice); predictions are mapped back with
# expm1 when the submission is generated.
y = np.log1p(train['SalePrice'])

# Drop target from training data so train and test share the same columns.
train.drop(['SalePrice'], axis=1, inplace=True)

# Combine train and test for joint preprocessing (train rows first).
# 'Id' is removed from the combined frame so the row identifier is not fed to
# the models as a feature. `test` itself is left untouched, so test['Id'] is
# still available when the submission file is built.
full_data = pd.concat([train, test], axis=0, ignore_index=True).drop(columns=['Id'])

# STEP 4: Handle Missing Values
# For these categorical columns a missing value is taken to mean "the house has
# no such feature" (pool, alley, fence, fireplace, garage, basement, masonry
# veneer) — per the competition's data description; confirm against
# data_description.txt. They get an explicit 'None' level instead of being
# imputed with the most frequent category, which would invent a garage or
# basement quality for houses that have neither.
absent_feature_cols = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrType',
]
for col in absent_feature_cols:
    full_data[col] = full_data[col].fillna('None')

# Fill the remaining gaps: numeric columns with the median, everything else with
# the mode. The check is phrased as "is numeric" rather than `dtype == "object"`
# so text columns are handled correctly whether pandas stores them as object or
# as a dedicated string dtype.
for col in full_data.columns:
    column = full_data[col]
    if not column.isna().any():
        # Nothing to impute; skip computing a median/mode that would go unused.
        continue
    if pd.api.types.is_numeric_dtype(column):
        full_data[col] = column.fillna(column.median())
    else:
        full_data[col] = column.fillna(column.mode()[0])

# STEP 5: Feature Engineering
# Every derived column is computed from the original columns first and then
# appended in a fixed order, so the resulting column layout is deterministic.
living_area = full_data['TotalBsmtSF'] + full_data['1stFlrSF'] + full_data['2ndFlrSF']

# Half baths count as 0.5, above ground and in the basement alike.
bath_count = (
    full_data['FullBath'] + (0.5 * full_data['HalfBath'])
    + full_data['BsmtFullBath'] + (0.5 * full_data['BsmtHalfBath'])
)

# All porch-like outdoor areas, including the wood deck.
porch_area = (
    full_data['OpenPorchSF'] + full_data['3SsnPorch']
    + full_data['EnclosedPorch'] + full_data['ScreenPorch'] + full_data['WoodDeckSF']
)

# Ages are measured in years at the time of sale.
sale_year = full_data['YrSold']
derived_columns = {
    'TotalSF': living_area,
    'TotalBathrooms': bath_count,
    'TotalPorchSF': porch_area,
    'HouseAge': sale_year - full_data['YearBuilt'],
    'RemodAge': sale_year - full_data['YearRemodAdd'],
}
for feature_name, feature_values in derived_columns.items():
    full_data[feature_name] = feature_values

# 0/1 presence flags: 1 when the underlying size/count column is positive.
presence_flags = {
    'HasPool': 'PoolArea',
    'HasGarage': 'GarageArea',
    'HasBsmt': 'TotalBsmtSF',
    'HasFireplace': 'Fireplaces',
}
for flag_name, source_col in presence_flags.items():
    full_data[flag_name] = full_data[source_col].gt(0).astype(int)

# STEP 6: Encode Categorical Variables
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each so the dummies of one column are not perfectly collinear.
full_data = pd.get_dummies(full_data, drop_first=True)

# STEP 7: Split Back to Train/Test
# Train rows were concatenated first and `y` holds one value per train row, so
# len(y) is the boundary between the two sets.
n_train = len(y)
X, X_test = full_data.iloc[:n_train], full_data.iloc[n_train:]

print(f"‚úÖ Final Train shape: {X.shape}, Test shape: {X_test.shape}")

# STEP 8: Feature Scaling
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training matrix only; the same fitted transform
# is then applied to both the training and the test matrix.
scaler = StandardScaler().fit(X)
X_scaled, X_test_scaled = scaler.transform(X), scaler.transform(X_test)

# STEP 9: Train Multiple Models
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import cross_val_score

def rmsle_cv(model):
    """Return the mean 5-fold cross-validated RMSE of *model*.

    The model is scored on the module-level ``X_scaled`` / ``y``. Because ``y``
    is the log1p-transformed sale price, the RMSE reported here is an error on
    the log scale (hence the "RMSLE" name).

    Args:
        model: An unfitted scikit-learn compatible regressor.

    Returns:
        The average, over the 5 folds, of the per-fold root mean squared error.
    """
    # scikit-learn reports the *negated* MSE so that larger is better; flip the
    # sign back before taking the square root.
    neg_mse_per_fold = cross_val_score(
        model, X_scaled, y, scoring="neg_mean_squared_error", cv=5
    )
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    return rmse_per_fold.mean()

# Candidate base models, keyed by the display name used in the CV report.
models = {
    "Ridge": Ridge(alpha=10),
    "Lasso": Lasso(alpha=0.001),
    "XGBoost": xgb.XGBRegressor(
        n_estimators=2000, learning_rate=0.05, max_depth=4,
        subsample=0.8, colsample_bytree=0.8, random_state=42
    ),
    "LightGBM": lgb.LGBMRegressor(
        n_estimators=2000, learning_rate=0.05, max_depth=-1,
        num_leaves=8, subsample=0.8, colsample_bytree=0.8, random_state=42,
        # LightGBM applies row subsampling only when bagging is enabled through
        # subsample_freq > 0 (the default 0 disables it), so without this the
        # subsample=0.8 setting above is silently ignored.
        subsample_freq=1,
        # Suppress the per-fit [Info] log lines that otherwise flood the output
        # once for every CV fold.
        verbose=-1,
    )
}

# Cross-validate every candidate and remember its score (lower is better).
scores = {}
for name, model in models.items():
    score = rmsle_cv(model)
    scores[name] = score
    print(f"{name} CV RMSLE: {score:.5f}")

# Name of the candidate with the smallest cross-validated error.
best_model = min(scores, key=scores.get)
print(f"\nüèÜ Best Base Model: {best_model}")

# STEP 10: Fit Best Models & Create Stacking Ensemble
from sklearn.ensemble import StackingRegressor

# The linear learners are created fresh; the boosted learners are the very
# instances already configured in `models`.
ridge, lasso = Ridge(alpha=10), Lasso(alpha=0.001)
xgb_model, lgb_model = models['XGBoost'], models['LightGBM']

# (name, estimator) pairs whose predictions feed the Ridge meta-learner.
base_learners = [
    ('ridge', ridge),
    ('lasso', lasso),
    ('xgb', xgb_model),
    ('lgb', lgb_model),
]
stacked_model = StackingRegressor(
    estimators=base_learners,
    final_estimator=Ridge(alpha=1),
)
stacked_model.fit(X_scaled, y)
print("‚úÖ Stacking model trained successfully")

# STEP 11: Generate Predictions
# The ensemble predicts log1p(price); expm1 converts back to the price scale.
log_price_preds = stacked_model.predict(X_test_scaled)
stacked_preds = np.expm1(log_price_preds)

# STEP 12: Create Submission
# One row per test house: its Id and the predicted sale price.
submission_path = 'house_prices_highscore.csv'
submission = pd.DataFrame({'Id': test['Id'], 'SalePrice': stacked_preds})
submission.to_csv(submission_path, index=False)
print("‚úÖ Submission file created: house_prices_highscore.csv")

# STEP 13: Download Submission
# Re-imported here so this step also works when run on its own.
from google.colab import files
files.download(submission_path)




Saving house-prices-advanced-regression-techniques.zip to house-prices-advanced-regression-techniques (1).zip
‚úÖ Extracted house-prices-advanced-regression-techniques (1).zip to 'house_data/' folder
Train shape: (1460, 81), Test shape: (1459, 80)
‚úÖ Final Train shape: (1460, 259), Test shape: (1459, 259)
Ridge CV RMSLE: 0.15778
Lasso CV RMSLE: 0.16081
XGBoost CV RMSLE: 0.12369
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001569 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4207
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 171
[LightGBM] [Info] Start training from score 12.021409
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001392 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_c

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>