# 1 Import Libraries


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# 2 Load the Data

In [2]:
# Read the training and test splits from CSV files given by relative path.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Show the (rows, columns) of each split as a quick sanity check.
for label, split in (("Train shape", train), ("Test shape", test)):
    print(label, split.shape)

Train shape (1460, 81)
Test shape (1459, 80)


# 4 Combining the train and test sets for preprocessing

In [3]:
# Detach the "Id" column from both splits (pop removes it in place and
# returns it). test_ID is reused later to build the submission file.
train_ID = train.pop("Id")
test_ID = test.pop("Id")

# Remember how many rows belong to the training split so the stacked frame
# can be cut back into train / test after preprocessing.
n_train = len(train)

# Stack train on top of test with a fresh 0..N-1 index. Columns present in
# only one of the two frames are filled with NaN for the other frame's rows.
combined = pd.concat([train, test], ignore_index=True)

print("Combined shape:", combined.shape)

Combined shape: (2919, 80)


In [4]:
# Log-transform the target on the training rows only. The stacked frame has a
# 0..N-1 index, so its first n_train labels are exactly the training rows.
# Anything fitted on this column predicts log-prices (np.exp undoes np.log).
target_rows = combined.index[:n_train]
combined.loc[target_rows, 'SalePrice'] = np.log(combined.loc[target_rows, 'SalePrice'])

In [5]:
# Count NaNs per column, rank the columns from most to fewest missing values,
# then keep only the columns that actually have gaps.
null_counts = combined.isna().sum()
ranked_counts = null_counts.sort_values(ascending=False)
missing = ranked_counts.loc[ranked_counts > 0]
print(missing)

PoolQC          2909
MiscFeature     2814
Alley           2721
Fence           2348
MasVnrType      1766
SalePrice       1459
FireplaceQu     1420
LotFrontage      486
GarageYrBlt      159
GarageCond       159
GarageQual       159
GarageFinish     159
GarageType       157
BsmtExposure      82
BsmtCond          82
BsmtQual          81
BsmtFinType2      80
BsmtFinType1      79
MasVnrArea        23
MSZoning           4
Utilities          2
Functional         2
BsmtFullBath       2
BsmtHalfBath       2
BsmtFinSF2         1
BsmtFinSF1         1
TotalBsmtSF        1
BsmtUnfSF          1
SaleType           1
KitchenQual        1
GarageCars         1
GarageArea         1
Exterior2nd        1
Exterior1st        1
Electrical         1
dtype: int64


In [6]:
# Categorical columns whose gaps are replaced by an explicit "None" level.
# NOTE(review): presumably NaN here means the house lacks the feature
# (no pool, no garage, ...) - confirm against the data description.
none_cols = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtExposure', 'BsmtCond', 'BsmtQual', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrType'
]
combined[none_cols] = combined[none_cols].fillna("None")

# Numeric columns whose gaps are replaced by 0.
zero_cols = [
    'GarageYrBlt', 'GarageCars', 'GarageArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'MasVnrArea'
]
combined[zero_cols] = combined[zero_cols].fillna(0)

# NOTE(review): the mean / median / mode below are computed over train and
# test rows together, so test-set information leaks into the training features.
combined["LotFrontage"] = combined["LotFrontage"].fillna(combined["LotFrontage"].mean())

# Median-impute any numeric column that still has gaps. The target is
# excluded on purpose: SalePrice is NaN on every test row, and filling it
# with the training median would fabricate labels for rows that have none.
numeric_cols = (
    combined.select_dtypes(include=['float64', 'int64'])
    .columns
    .drop('SalePrice', errors='ignore')
)
for col in numeric_cols:
    if combined[col].isnull().any():
        combined[col] = combined[col].fillna(combined[col].median())

# Fill remaining categorical gaps with each column's most frequent value.
cat_cols = combined.select_dtypes(include=['object']).columns
for col in cat_cols:
    if combined[col].isnull().any():
        combined[col] = combined[col].fillna(combined[col].mode()[0])


# 5 Feature Engineering

In [7]:
# Store MSSubClass as strings so that the dtype-based steps further down
# (numeric skew correction, one-hot encoding of object columns) treat it as
# a categorical feature instead of a quantity.
combined = combined.assign(MSSubClass=combined['MSSubClass'].astype('str'))

In [8]:
# Derived features, appended as new columns in this order:
#   TotalSF  - basement + first-floor + second-floor area
#   HouseAge - years between construction and sale
#   RemodAge - years between the recorded remodel year and sale
combined = combined.assign(
    TotalSF=combined['TotalBsmtSF'] + combined['1stFlrSF'] + combined['2ndFlrSF'],
    HouseAge=combined['YrSold'] - combined['YearBuilt'],
    RemodAge=combined['YrSold'] - combined['YearRemodAdd'],
)

In [9]:
# Skewness of every non-object column (pandas skips NaNs when computing it).
numeric_feats = combined.dtypes[combined.dtypes != 'object'].index
skewed_feats = combined[numeric_feats].apply(lambda col: col.skew()).sort_values(ascending=False)

# Keep the features whose absolute skewness exceeds 0.75.
skewed_feats = skewed_feats[skewed_feats.abs() > 0.75]

for feat in skewed_feats.index:
    # The target is transformed separately and must not be touched here.
    if feat == 'SalePrice':
        continue
    # log1p is only defined for values > -1. Differences such as
    # YrSold - YearBuilt are not guaranteed to stay above that, and because
    # warnings are silenced at the top of the notebook an out-of-domain value
    # would silently become NaN / -inf. Leave such a feature untransformed.
    if combined[feat].min() <= -1:
        print("Skipping log1p for", feat, "- contains values <= -1")
        continue
    combined[feat] = np.log1p(combined[feat])

In [10]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each encoded column, so k levels become k-1 indicator columns.
encoded = pd.get_dummies(data=combined, drop_first=True)
combined = encoded
print("Shape after one-hot encoding:", combined.shape)

Shape after one-hot encoding: (2919, 277)


In [11]:
# Cut the stacked frame back into its training and test halves and remove the
# target column. drop() returns new frames, so nothing is mutated through an
# .iloc slice of `combined` (an in-place drop on such a slice is the pattern
# pandas flags with SettingWithCopyWarning).
train_prepared = combined.iloc[:n_train].drop(columns='SalePrice')
test_prepared = combined.iloc[n_train:].drop(columns='SalePrice')

# Target values for the training rows only.
y = combined['SalePrice'].iloc[:n_train].values

# Plain arrays for the estimators; both share the same column order.
X = train_prepared.values
X_test_final = test_prepared.values

print("Training set shape:", X.shape)
print("Test set shape:", X_test_final.shape)

Training set shape: (1460, 276)
Test set shape: (1459, 276)


In [12]:
# Hold out 20% of the training rows for validation; the fixed seed makes the
# split reproducible across runs.
holdout_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = holdout_parts

# 6 Choosing the model

## Lasso (Ridge is not fitted in this section)

In [13]:
# Fit an L1-regularised linear model on the training split.
lasso_model = Lasso(alpha=0.1, random_state=42).fit(X_train, y_train)

# Score it on the held-out rows: root of the mean squared error between the
# validation targets and the model's predictions.
y_val_pred = lasso_model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_val_pred))
print("Lasso rmse", rmse)

Lasso rmse 0.245047323180698


## Random Forest

In [14]:
# Fit a 100-tree random forest on the training split (seeded for
# reproducibility).
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Validation RMSE on the held-out rows.
y_val_pred_rf = rf.predict(X_val)
rmse_rf = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_val_pred_rf))
print("Random Forest RMSE", rmse_rf)

Random Forest RMSE 0.1488072719853375


## XGBoost

In [15]:
# Gradient-boosted trees: up to 1000 rounds, with early stopping after 50
# rounds without improvement of the validation RMSE.
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    eval_metric='rmse',
    early_stopping_rounds=50,
)
xgb_model = xgb.XGBRegressor(**xgb_params)

# NOTE(review): the validation split drives early stopping here and is also
# the split the RMSE below is reported on.
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

y_val_pred_xgb = xgb_model.predict(X_val)
rmse_xgb = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_val_pred_xgb))
print("XGB RMSE", rmse_xgb)

XGB RMSE 0.14064361634433253


## LightGBM

In [16]:
# LightGBM regressor: up to 1000 boosting rounds, stopping early once the
# validation score has not improved for 50 rounds.
lgb_model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    early_stopping_rounds=50,
    verbose=1,
)

# eval_metric is an argument of fit(), not an estimator parameter, so it is
# passed here to have RMSE evaluated on the validation set.
# NOTE(review): the validation split drives early stopping here and is also
# the split the RMSE below is reported on.
lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
)

y_val_pred_lgb = lgb_model.predict(X_val)
rmse_lgb = np.sqrt(mean_squared_error(y_val, y_val_pred_lgb))
print("LightGBM RMSE:", rmse_lgb)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000951 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3592
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 180
[LightGBM] [Info] Start training from score 12.030652
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[239]	valid_0's l2: 0.0193502
LightGBM RMSE: 0.13909993678105068


# 7 Cross-Validation

In [17]:
def rmse_cv(model, X, y, n_splits=5, random_state=42):
    """Return the per-fold RMSE of `model` under shuffled K-fold CV.

    Parameters
    ----------
    model : estimator passed unchanged to `cross_val_score`.
    X, y : features and targets to cross-validate on.
    n_splits : number of folds (default 5).
    random_state : seed for the fold shuffling (default 42).

    Returns
    -------
    Array with one RMSE value per fold.
    """
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # The scorer yields negated MSE; flip the sign before taking the root.
    neg_mse = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=folds)
    return np.sqrt(-neg_mse)

# Cross-validate a Lasso model on the full training set and summarise the
# per-fold RMSE as mean and standard deviation.
model_to_cv = Lasso(alpha=0.01, random_state=42)
cv_scores = rmse_cv(model_to_cv, X, y)
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print("Lasso CV RMSE: {:.4f} ± {:.4f}".format(cv_mean, cv_std))

Lasso CV RMSE: 0.1504 ± 0.0136


In [18]:
# Final model: LightGBM trained on every labelled row.
# NOTE(review): no eval_set / early stopping is configured here, so all 1000
# boosting rounds are requested.
final_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
)
final_model = lgb.LGBMRegressor(**final_params)
final_model.fit(X, y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000931 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3866
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 188
[LightGBM] [Info] Start training from score 12.024051


# 8 Predict on the test set

In [19]:
# The model was trained on np.log(SalePrice), so its outputs are log-prices.
test_preds = final_model.predict(X_test_final)

# Map back to the original price scale. np.exp is the exact inverse of
# np.log; np.expm1 (the inverse of np.log1p) would make every predicted
# price exactly 1 too low.
test_preds = np.exp(test_preds)

In [20]:
# Two-column submission table: the test-set Ids saved before preprocessing,
# paired row by row with the predicted sale prices.
submission = test_ID.to_frame(name='Id').assign(SalePrice=test_preds)

# Write without the index so the file holds only the Id and SalePrice columns.
submission.to_csv('submission.csv', index=False)

print("Submission file created: submission.csv")

Submission file created: submission.csv
