In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb
import joblib

In [2]:
SEED = 42
np.random.seed(SEED)

In [3]:
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
sample = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')

In [4]:
y = np.log1p(train['SalePrice'])
train.drop(['SalePrice'], axis=1, inplace=True)

In [6]:
data = pd.concat([train, test], axis=0, sort=False).reset_index(drop=True)

# Handle missing values
auto_num = data.select_dtypes(include=[np.number]).columns
auto_cat = data.select_dtypes(include=['object']).columns

# Impute numeric with median, categorical with 'Missing'
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # fill NA with median
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('label_enc', OrdinalEncoder())
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, auto_num),
    ('cat', categorical_transformer, auto_cat)
])

# transform all features
X_all = preprocessor.fit_transform(data)




In [9]:
# model 
n_train = train.shape[0]
X = X_all[:n_train]
X_test = X_all[n_train:]

def rmse_cv(model, X, y):
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=kf))
    return rmse

# lightGBM
lgb_model = lgb.LGBMRegressor(
    objective='regression',
    n_estimators=10000,
    learning_rate=0.01,
    num_leaves=31,
    colsample_bytree=0.7,
    subsample=0.7,
    random_state=SEED
)
print('LightGBM CV:', rmse_cv(lgb_model, X, y).mean())


# Fit on full data
lgb_model.fit(X, y)
joblib.dump(lgb_model, 'lgb_model.pkl')


# xgboost
xgb_model = xgb.XGBRegressor(
    n_estimators=5000,
    learning_rate=0.01,
    max_depth=6,
    subsample=0.7,
    colsample_bytree=0.7,
    random_state=SEED,
    n_jobs=-1
)

print('XGBoost CV:', rmse_cv(xgb_model, X, y).mean())


xgb_model.fit(X, y)
joblib.dump(xgb_model, 'xgb_model.pkl')


    
# Weighted average: 0.5*LGB + 0.5*XGB
preds_lgb = lgb_model.predict(X_test)
preds_xgb = xgb_model.predict(X_test)
preds = 0.5 * preds_lgb + 0.5 * preds_xgb

final_preds = np.expm1(preds)
    


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001253 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3396
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 74
[LightGBM] [Info] Start training from score 12.030658
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000587 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3393
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 75
[LightGBM] [Info] Start training from score 12.016898
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000538 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bi

In [11]:
submission = pd.DataFrame({'Id': test['Id'], 'SalePrice': final_preds})
submission.to_csv('submission.csv', index=False)