# House Prices Advanced Regression
**Importing Libraries**

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor

**Reading Files**

In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [5]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


*Store the original test ID for reference later*

In [7]:
test_ids = test.index

**Checking for nan values and analysing it.**

*Concatenate the train and test datasets*

*Create a new column 'Source' to distinguish between train and test data*

In [8]:
train['Source'] = 1
test['Source'] = 0

*Concatenate train and test data*

In [9]:
combined = pd.concat([train, test], axis=0, ignore_index=True)

*Check for Null Values and analyze it in combined data*

In [10]:
null_values = combined.isnull().sum()
print("Null Values in each column:\n", null_values[null_values > 0])
print("\nTotal Null Values: ", null_values.sum())

Null Values in each column:
 MSZoning           4
LotFrontage      486
Alley           2721
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType      1766
MasVnrArea        23
BsmtQual          81
BsmtCond          82
BsmtExposure      82
BsmtFinType1      79
BsmtFinSF1         1
BsmtFinType2      80
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
Electrical         1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu     1420
GarageType       157
GarageYrBlt      159
GarageFinish     159
GarageCars         1
GarageArea         1
GarageQual       159
GarageCond       159
PoolQC          2909
Fence           2348
MiscFeature     2814
SaleType           1
SalePrice       1459
dtype: int64

Total Null Values:  17166


*Separating numeric and categorical data columns*

In [11]:
numeric_cols = combined.select_dtypes(include=['number']).columns
categorical_cols = combined.select_dtypes(include=['object']).columns

*Fill missing values for numeric columns with the median*

In [12]:
combined[numeric_cols] = combined[numeric_cols].fillna(combined[numeric_cols].median())

*Fill missing values for categorical columns with the mode (most frequent value)*

In [13]:
for col in categorical_cols:
    mode_val = combined[col].mode()[0]  # Get the most frequent value
    combined[col].fillna(mode_val, inplace=True)

*Drop features with more than 500 null values (this should now be zero since missing values are filled)*

In [14]:
null_values = combined.isnull().sum()
features_to_drop = null_values[null_values > 500].index
print(f"\nDropping features with more than 500 null values: {features_to_drop.tolist()}")
combined.drop(columns=features_to_drop, inplace=True)


Dropping features with more than 500 null values: []


*Update categorical columns after dropping features*

In [15]:
categorical_cols = combined.select_dtypes(include=['object']).columns

**Apply one-hot encoding to categorical variables**

*Apply one-hot encoding to categorical variables*

In [16]:
combined = pd.get_dummies(combined, columns=categorical_cols, drop_first=True)
print("Data after one-hot encoding:\n", combined.head())

Data after one-hot encoding:
    Id  MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
0   1          60         65.0     8450            7            5       2003   
1   2          20         80.0     9600            6            8       1976   
2   3          60         68.0    11250            7            5       2001   
3   4          70         60.0     9550            7            5       1915   
4   5          60         84.0    14260            8            5       2000   

   YearRemodAdd  MasVnrArea  BsmtFinSF1  ...  SaleType_ConLI  SaleType_ConLw  \
0          2003       196.0       706.0  ...           False           False   
1          1976         0.0       978.0  ...           False           False   
2          2002       162.0       486.0  ...           False           False   
3          1970         0.0       216.0  ...           False           False   
4          2000       350.0       655.0  ...           False           False   

   SaleT

**Split combined data back into train and test sets**

In [17]:
train_data = combined[combined['Source'] == 1].drop('Source', axis=1)
test_data = combined[combined['Source'] == 0].drop('Source', axis=1)

*Ensure the columns match exactly between train and test data*

In [18]:
X = train_data.drop(columns=['SalePrice'])
y = train_data['SalePrice']

**Split the training data into train and validation sets**

In [19]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

**Define base models and stacking model**

In [20]:
estimators = [
    ('xgb', XGBRegressor(n_estimators=1000, learning_rate=0.05, max_depth=10, random_state=42)),
    ('lgbm', lgb.LGBMRegressor(n_estimators=1000, learning_rate=0.05, num_leaves=31, random_state=42)),
    ('catboost', CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=10, random_seed=42, verbose=0))
]

*Initialize the stacking model with Ridge as the final estimator*

In [21]:
stacking_model = StackingRegressor(estimators=estimators, final_estimator=Ridge())

*Train the stacking model*

In [22]:
stacking_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000880 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3395
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 155
[LightGBM] [Info] Start training from score 181441.541952
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000770 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3175
[LightGBM] [Info] Number of data points in the train set: 934, number of used features: 147
[LightGBM] [Info] Start training from score 181121.274090
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001176 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is 

**Predict on the test data**

In [23]:
y_val_pred = stacking_model.predict(X_val)

*Calculate the logarithm of the predictions and the actual values*

In [24]:
y_val_log_pred = np.log1p(y_val_pred)
y_val_log_actual = np.log1p(y_val)

**Calculate Logarithmic RMSE and R² score**

In [25]:
log_rmse = np.sqrt(mean_squared_error(y_val_log_actual, y_val_log_pred))
r2 = r2_score(y_val_log_actual, y_val_log_pred)
print(f"\nValidation Logarithmic RMSE: {log_rmse}")
print(f"Validation R² Score: {r2}")


Validation Logarithmic RMSE: 0.1457278547491009
Validation R² Score: 0.8861985148337437


**Predict on the test set using the entire training data**

In [26]:
stacking_model.fit(X, y)  # Re-train on the entire dataset before predicting on the test set
y_test_pred = stacking_model.predict(test_data[X_train.columns])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001034 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3662
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 162
[LightGBM] [Info] Start training from score 180921.195890
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000801 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3380
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 158
[LightGBM] [Info] Start training from score 180717.091610
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000885 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is

*Handle any extremely large or small predictions*

In [27]:
y_test_pred = np.clip(y_test_pred, 0, np.percentile(y_test_pred, 99))

**Save the predictions to a CSV file**

In [28]:
output = pd.DataFrame({'Id': test_ids, 'SalePrice': y_test_pred})
output.to_csv('test_predictions.csv', index=False)
print("\nPredictions saved to test_predictions.csv")


Predictions saved to test_predictions.csv
