In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
import scipy
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, cross_val_score
import optuna
from catboost import CatBoostRegressor
from sklearn.linear_model import BayesianRidge, Ridge, OrthogonalMatchingPursuit
from lightgbm import LGBMRegressor
sns.set_style('darkgrid')

In [2]:
# Display options for Pandas DataFrames
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 90)

In [3]:
# Load datasets
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
sample_submission = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')

## Combine Both Sets

In [4]:
# Combine train and test datasets for preprocessing
target = train['SalePrice']
test_id = test['Id']
train.drop(['Id', 'SalePrice'], axis=1, inplace=True)
test.drop('Id', axis=1, inplace=True)
data1 = pd.concat([train, test], axis=0).reset_index(drop=True)

## Cleaning

## Set proper datatypes for categorical columns

In [5]:
data2 = data1.copy()
data2['MSSubClass'] = data2['MSSubClass'].astype(str)

## Fill missing categorical values

In [6]:
fill_none_columns = [
    'Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC',
    'Fence', 'MiscFeature'
]
for column in fill_none_columns:
    data2[column] = data2[column].fillna('None')

fill_mode_columns = [
    'MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Electrical',
    'KitchenQual', 'Functional', 'SaleType'
]
for column in fill_mode_columns:
    data2[column] = data2[column].fillna(data2[column].mode()[0])

## Define KNN impute function

In [7]:
def knn_impute(df):
    df = df.copy()
    numeric_df = df.select_dtypes(include=np.number)
    
    imputer = KNNImputer(n_neighbors=5, metric='nan_euclidean')
    imputed_array = imputer.fit_transform(numeric_df)
    
    imputed_df = pd.DataFrame(imputed_array, columns=numeric_df.columns, index=df.index)
    
    df.update(imputed_df)
    return df

## Impute missing numerical values

In [8]:
data3 = data2.copy()

In [9]:
impute_columns = [
    'LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageCars', 'GarageArea'
]

In [10]:
data3.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706.0,Unf,0.0,150.0,856.0,GasA,Ex,Y,SBrkr,856,854,0,1710,1.0,0.0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2.0,548.0,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,BrkFace,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978.0,Unf,0.0,284.0,1262.0,GasA,Ex,Y,SBrkr,1262,0,0,1262,0.0,1.0,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2.0,460.0,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486.0,Unf,0.0,434.0,920.0,GasA,Ex,Y,SBrkr,920,866,0,1786,1.0,0.0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2.0,608.0,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,BrkFace,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216.0,Unf,0.0,540.0,756.0,GasA,Gd,Y,SBrkr,961,756,0,1717,1.0,0.0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3.0,642.0,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655.0,Unf,0.0,490.0,1145.0,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1.0,0.0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3.0,836.0,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal


In [11]:
for column in impute_columns:
    data3 = knn_impute(data3)

In [12]:
data3.isnull().sum()

MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
Alley            0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
MasVnrArea       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinSF1       0
BsmtFinType2     0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
KitchenQual 

## Feature Engineering

In [13]:
data4 = data3.copy()

In [14]:
data4["SqFtPerRoom"] = data4["GrLivArea"] / (data4["TotRmsAbvGrd"] + data4["FullBath"] + data4["HalfBath"] + data4["KitchenAbvGr"])
data4['Total_Home_Quality'] = data4['OverallQual'] + data4['OverallCond']
data4['Total_Bathrooms'] = (data4['FullBath'] + (0.5 * data4['HalfBath']) + data4['BsmtFullBath'] + (0.5 * data4['BsmtHalfBath']))
data4["HighQualSF"] = data4["1stFlrSF"] + data4["2ndFlrSF"]

## Feature Transformation

In [15]:
data5 = data4.copy()

In [16]:
skew_df = pd.DataFrame(data5.select_dtypes(np.number).columns, columns=['Feature'])

In [17]:
skew_df['Skew'] = skew_df['Feature'].apply(lambda feature: scipy.stats.skew(data5[feature]))
skew_df['absskew'] = skew_df['Skew'].apply(abs)
skew_df['skewed'] = skew_df['absskew'].apply(lambda x: True if x >= 0.5 else False)

In [18]:
for column in skew_df[skew_df['skewed'] == True]['Feature']:
    data5[column] = np.log1p(data5[column])

In [19]:
#here is column MoSold as 1 and 12 are close so to show this cyclic struct so cosine trasformation as due to its graph

## Cosine Transformation of Cyclical Feature

In [20]:
data5['MoSold'] = -np.cos(0.5236 * data5['MoSold'])  # 0.5236 = 2π/12

## Encoding Categoricals

In [21]:
data6 = pd.get_dummies(data5)

## Scaling

In [22]:
scaler = StandardScaler()
data6 = pd.DataFrame(scaler.fit_transform(data6), index=data6.index, columns=data6.columns)

## Target Transformation

In [23]:
log_target = np.log(target)

## Split the data

In [24]:
train_final = data6.loc[:train.index.max(), :].copy()
test_final = data6.loc[train.index.max() + 1:, :].reset_index(drop=True).copy()

## Hyperparameter optimization

In [25]:
catboost_params = {
    'iterations': 6000,
    'learning_rate': 0.005,
    'depth': 4,
    'l2_leaf_reg': 1,
    'eval_metric': 'RMSE',
    'early_stopping_rounds': 200,
    'random_seed': 42
}

br_params = {
    'n_iter': 341,
    'tol': 9.199334784903163,
    'alpha_1': 2.8795369369129618,
    'alpha_2': 9.210601769507551,
    'lambda_1': 0.0537800588840187,
    'lambda_2': 3.6159116695244037e-07
}

lightgbm_params = {
    'num_leaves': 39,
    'max_depth': 2,
    'learning_rate': 0.13705339989856127,
    'n_estimators': 273
}

ridge_params = {
    'alpha': 631.1412445239156
}

## Train models

In [26]:
models = {
    'catboost': CatBoostRegressor(**catboost_params, verbose=0),
    'br': BayesianRidge(**br_params),
    'lgbm': LGBMRegressor(**lightgbm_params, verbose=0),
    'ridge': Ridge(**ridge_params),
    'omp': OrthogonalMatchingPursuit()
}

In [27]:
for name, model in models.items():
    model.fit(train_final, log_target)
    print(name + " trained")

catboost trained
br trained
lgbm trained
ridge trained
omp trained


## Evaluate models

In [28]:
results = {}
kf = KFold(n_splits=10, shuffle=True, random_state=1)

In [29]:
for name, model in models.items():
    result = np.exp(np.sqrt(-cross_val_score(model, train_final, log_target, scoring='neg_mean_squared_error', cv=kf, n_jobs=-1)))
    results[name] = result
    print(f"{name} cross-validation results: {result}")

catboost cross-validation results: [1.12193653 1.14327411 1.15157027 1.13220589 1.13815561 1.10185895
 1.15006522 1.12080254 1.10919838 1.09087935]
br cross-validation results: [1.13159781 1.14379986 1.14275513 1.2107558  1.13160502 1.10128999
 1.16800455 1.11919336 1.11165754 1.0926607 ]
lgbm cross-validation results: [1.13946397 1.14580443 1.15942734 1.14559497 1.15683775 1.10774515
 1.14725409 1.13463826 1.12077967 1.10400552]
ridge cross-validation results: [1.13297897 1.14131288 1.15195371 1.19697039 1.12962284 1.10458826
 1.16842791 1.11681218 1.1139516  1.09545029]
omp cross-validation results: [1.12971958 1.15541582 1.13934725 1.22694778 1.13442676 1.11322943
 1.17442737 1.12405681 1.11522501 1.09736912]


In [30]:
for name, result in results.items():
    print("----------------\n" + name + "\n---------------")
    print(np.mean(result))
    print(np.std(result))

----------------
catboost
---------------
1.1259946845901148
0.01964614494087774
----------------
br
---------------
1.1353319766214587
0.03272846003135144
----------------
lgbm
---------------
1.1361551148520166
0.018344349095169316
----------------
ridge
---------------
1.1352069023768088
0.029292476283439863
----------------
omp
---------------
1.1410164946009125
0.03537186779181372


## Combine predictions

In [31]:
final_predictions = (
    0.4 * np.exp(models['catboost'].predict(test_final)) +
    0.2 * np.exp(models['br'].predict(test_final)) +
    0.2 * np.exp(models['lgbm'].predict(test_final)) +
    0.1 * np.exp(models['ridge'].predict(test_final)) +
    0.1 * np.exp(models['omp'].predict(test_final))
)

## Make submission

In [32]:
submission = pd.concat([test_id, pd.Series(final_predictions, name='SalePrice')], axis=1)
submission.to_csv('./submission.csv', index=False, header=True)