In [1]:
import pandas as pd
import numpy as np
import os
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as mcolors
warnings.filterwarnings('ignore')

from japanmap import picture

In [2]:
prefecture_code = pd.read_csv("japan_housing_data/prefecture_code.csv")
prefecture_code.head()

Unnamed: 0,Code,JpName,EnName
0,1,北海道,Hokkaido
1,2,青森県,Aomori
2,3,岩手県,Iwate
3,4,宮城県,Miyagi
4,5,秋田県,Akita


In [3]:
df = pd.DataFrame()

data_dir = "japan_housing_data/trade_prices"

for f in os.listdir(data_dir):
    df_prefecture = pd.read_csv(os.path.join(data_dir, f), low_memory=False)
    df_prefecture["Code"] = int(f.split(".")[0])
    df = pd.concat([df, df_prefecture])

df.head()

Unnamed: 0,No,Type,Region,MunicipalityCode,Prefecture,Municipality,DistrictName,NearestStation,TimeToNearestStation,MinTimeToNearestStation,...,Breadth,CityPlanning,CoverageRatio,FloorAreaRatio,Period,Year,Quarter,Renovation,Remarks,Code
0,1,Residential Land(Land Only),Residential Area,40101,Fukuoka Prefecture,"Moji Ward,Kitakyushu City",Aobadai,Moji,30-60minutes,30.0,...,4.0,Category I Exclusively Low-story Residential Zone,50.0,80.0,4th quarter 2017,2017,4,,,40
1,2,Residential Land(Land Only),Residential Area,40101,Fukuoka Prefecture,"Moji Ward,Kitakyushu City",Oaza Ikawa,Komorie,30-60minutes,30.0,...,4.0,Urbanization Control Area,,,1st quarter 2018,2018,1,,,40
2,3,Residential Land(Land Only),Residential Area,40101,Fukuoka Prefecture,"Moji Ward,Kitakyushu City",Oaza Ikawa,Moji,1H-1H30,60.0,...,4.0,Urbanization Control Area,,,4th quarter 2017,2017,4,,,40
3,4,Residential Land(Land Only),Residential Area,40101,Fukuoka Prefecture,"Moji Ward,Kitakyushu City",Oaza Ikawa,Moji,1H-1H30,60.0,...,35.0,Urbanization Control Area,70.0,200.0,4th quarter 2016,2016,4,,,40
4,5,Residential Land(Land Only),Residential Area,40101,Fukuoka Prefecture,"Moji Ward,Kitakyushu City",Oaza Ikawa,Moji,1H-1H30,60.0,...,,Urbanization Control Area,70.0,200.0,3rd quarter 2016,2016,3,,,40


In [4]:
df = pd.merge(left=df, right=prefecture_code, left_on="Code", right_on="Code", how="left")
df.head()

Unnamed: 0,No,Type,Region,MunicipalityCode,Prefecture,Municipality,DistrictName,NearestStation,TimeToNearestStation,MinTimeToNearestStation,...,CoverageRatio,FloorAreaRatio,Period,Year,Quarter,Renovation,Remarks,Code,JpName,EnName
0,1,Residential Land(Land Only),Residential Area,40101,Fukuoka Prefecture,"Moji Ward,Kitakyushu City",Aobadai,Moji,30-60minutes,30.0,...,50.0,80.0,4th quarter 2017,2017,4,,,40,福岡県,Fukuoka
1,2,Residential Land(Land Only),Residential Area,40101,Fukuoka Prefecture,"Moji Ward,Kitakyushu City",Oaza Ikawa,Komorie,30-60minutes,30.0,...,,,1st quarter 2018,2018,1,,,40,福岡県,Fukuoka
2,3,Residential Land(Land Only),Residential Area,40101,Fukuoka Prefecture,"Moji Ward,Kitakyushu City",Oaza Ikawa,Moji,1H-1H30,60.0,...,,,4th quarter 2017,2017,4,,,40,福岡県,Fukuoka
3,4,Residential Land(Land Only),Residential Area,40101,Fukuoka Prefecture,"Moji Ward,Kitakyushu City",Oaza Ikawa,Moji,1H-1H30,60.0,...,70.0,200.0,4th quarter 2016,2016,4,,,40,福岡県,Fukuoka
4,5,Residential Land(Land Only),Residential Area,40101,Fukuoka Prefecture,"Moji Ward,Kitakyushu City",Oaza Ikawa,Moji,1H-1H30,60.0,...,70.0,200.0,3rd quarter 2016,2016,3,,,40,福岡県,Fukuoka


In [5]:
df.describe().apply(lambda s: s.apply('{0:.1f}'.format))  

Unnamed: 0,No,MunicipalityCode,MinTimeToNearestStation,MaxTimeToNearestStation,TradePrice,Area,AreaIsGreaterFlag,UnitPrice,PricePerTsubo,Frontage,TotalFloorArea,TotalFloorAreaIsGreaterFlag,BuildingYear,PrewarBuilding,Breadth,CoverageRatio,FloorAreaRatio,Year,Quarter,Code
count,3906518.0,3906518.0,3315977.0,3228071.0,3906518.0,3906518.0,3906518.0,1384415.0,1384415.0,2537063.0,1317660.0,3906518.0,1869804.0,3906518.0,2685837.0,3191877.0,3191877.0,3906518.0,3906518.0,3906518.0
mean,78354.1,20467.1,22.6,27.4,27605894.6,467.4,0.0,85148.9,281406.5,14.6,175.2,0.0,1995.5,0.0,6.8,61.0,209.0,2012.9,2.5,20.3
std,80173.6,12175.5,23.1,27.7,141745961.4,847.7,0.2,168683.2,557031.6,9.8,256.4,0.1,15.2,0.0,4.6,10.0,107.5,3.7,1.1,12.2
min,1.0,1101.0,0.0,0.0,100.0,10.0,0.0,1.0,2.0,0.1,10.0,0.0,1945.0,0.0,1.0,30.0,50.0,2005.0,1.0,1.0
25%,20780.0,12204.0,9.0,8.0,5000000.0,100.0,0.0,16000.0,55000.0,8.5,95.0,0.0,1984.0,0.0,4.0,60.0,200.0,2010.0,2.0,12.0
50%,48159.0,17210.0,16.0,15.0,14000000.0,185.0,0.0,40000.0,130000.0,12.0,105.0,0.0,1996.0,0.0,6.0,60.0,200.0,2013.0,3.0,17.0
75%,114733.0,28110.0,30.0,29.0,29000000.0,370.0,0.0,91000.0,300000.0,17.0,140.0,0.0,2009.0,0.0,7.0,60.0,200.0,2016.0,4.0,28.0
max,406575.0,47382.0,120.0,120.0,61000000000.0,5000.0,1.0,20000000.0,66000000.0,50.0,2000.0,1.0,2020.0,1.0,99.9,80.0,1300.0,2019.0,4.0,47.0


In [8]:
# df['UnitPrice'] = df['UnitPrice'].fillna(df['TradePrice'] / df['Area'])

In [9]:
# df['EnName'] = df['EnName'].apply(lambda x: x.rstrip())

In [10]:
missing_percentage = (df.isnull().mean() * 100).sort_values(ascending=False)
print("Missing percentage\n")
print(missing_percentage)

Missing percentage

TotalFloorArea             66.270218
BuildingYear               52.136301
Structure                  51.041490
Classification             30.514719
Direction                  28.901646
FloorAreaRatio             18.293555
MaxTimeToNearestStation    17.367052
MinTimeToNearestStation    15.116812
CityPlanning               13.584118
Type                        0.000000
TradePrice                  0.000000
Area                        0.000000
Year                        0.000000
Quarter                     0.000000
Code                        0.000000
dtype: float64


In [11]:
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col].fillna('unkown', inplace=True)

In [12]:
missing_data = df.isnull().sum().sort_values(ascending=False)
missing_data

TotalFloorArea             2588858
BuildingYear               2036714
FloorAreaRatio              714641
MaxTimeToNearestStation     678447
MinTimeToNearestStation     590541
Type                             0
TradePrice                       0
Area                             0
Structure                        0
Direction                        0
Classification                   0
CityPlanning                     0
Year                             0
Quarter                          0
Code                             0
dtype: int64

In [13]:
missing_percentage = (df.isnull().mean() * 100).sort_values(ascending=False)
print("Missing percentage\n")
print(missing_percentage)

Missing percentage

TotalFloorArea             66.270218
BuildingYear               52.136301
FloorAreaRatio             18.293555
MaxTimeToNearestStation    17.367052
MinTimeToNearestStation    15.116812
Type                        0.000000
TradePrice                  0.000000
Area                        0.000000
Structure                   0.000000
Direction                   0.000000
Classification              0.000000
CityPlanning                0.000000
Year                        0.000000
Quarter                     0.000000
Code                        0.000000
dtype: float64


In [14]:
df_copy = df.copy()
df_copy.head()

Unnamed: 0,Type,MinTimeToNearestStation,MaxTimeToNearestStation,TradePrice,Area,TotalFloorArea,BuildingYear,Structure,Direction,Classification,CityPlanning,FloorAreaRatio,Year,Quarter,Code
0,Residential Land(Land Only),30.0,60.0,2500000,1200,,,unkown,Southwest,Road,Category I Exclusively Low-story Residential Zone,80.0,2017,4,40
1,Residential Land(Land Only),30.0,60.0,11000000,1900,,,unkown,Northwest,City Road,Urbanization Control Area,,2018,1,40
2,Residential Land(Land Only),60.0,90.0,3000000,210,,,unkown,East,City Road,Urbanization Control Area,,2017,4,40
3,Residential Land(Land Only),60.0,90.0,890000,125,,,unkown,East,Prefectural Road,Urbanization Control Area,200.0,2016,4,40
4,Residential Land(Land Only),60.0,90.0,340000,30,,,unkown,No facing road,unkown,Urbanization Control Area,200.0,2016,3,40


In [15]:
# test_columns = ['MaxTimeToNearestStation', 'MinTimeToNearestStation',
#                'Type', 'TradePrice', 'Area', 'Structure', 'Direction',
#                'Classification', 'CityPlanning', 'Year', 'Quarter', 'Code']

In [16]:
# df_copy = df_copy[test_columns]
# df_copy.head()

In [17]:
df_copy.head()

Unnamed: 0,Type,MinTimeToNearestStation,MaxTimeToNearestStation,TradePrice,Area,TotalFloorArea,BuildingYear,Structure,Direction,Classification,CityPlanning,FloorAreaRatio,Year,Quarter,Code
0,Residential Land(Land Only),30.0,60.0,2500000,1200,,,unkown,Southwest,Road,Category I Exclusively Low-story Residential Zone,80.0,2017,4,40
1,Residential Land(Land Only),30.0,60.0,11000000,1900,,,unkown,Northwest,City Road,Urbanization Control Area,,2018,1,40
2,Residential Land(Land Only),60.0,90.0,3000000,210,,,unkown,East,City Road,Urbanization Control Area,,2017,4,40
3,Residential Land(Land Only),60.0,90.0,890000,125,,,unkown,East,Prefectural Road,Urbanization Control Area,200.0,2016,4,40
4,Residential Land(Land Only),60.0,90.0,340000,30,,,unkown,No facing road,unkown,Urbanization Control Area,200.0,2016,3,40


In [18]:
critical_columns = ["Type","MinTimeToNearestStation", 
    "MaxTimeToNearestStation", 
    "TotalFloorArea", "BuildingYear",
    "FloorAreaRatio"]
df_cleaned = df_copy.dropna(subset=critical_columns)

# df_cleaned = df_copy.dropna()
print(df_copy.shape)
print(df_cleaned.shape)

(3906518, 15)
(1170634, 15)


In [19]:
df_cleaned.head()

Unnamed: 0,Type,MinTimeToNearestStation,MaxTimeToNearestStation,TradePrice,Area,TotalFloorArea,BuildingYear,Structure,Direction,Classification,CityPlanning,FloorAreaRatio,Year,Quarter,Code
8,Residential Land(Land and Building),20.0,20.0,110000000,680,1300.0,1987.0,RC,Northeast,City Road,Category I Residential Zone,200.0,2019,2,40
10,Residential Land(Land and Building),24.0,24.0,11000000,165,100.0,1985.0,W,Northwest,City Road,Category I Residential Zone,200.0,2015,2,40
14,Residential Land(Land and Building),26.0,26.0,42000000,220,95.0,2019.0,W,South,City Road,Category I Exclusively Medium-high Residential...,200.0,2019,1,40
17,Residential Land(Land and Building),24.0,24.0,29000000,130,100.0,2017.0,W,Northwest,City Road,Category II Exclusively Medium-high Residentia...,200.0,2017,4,40
18,Residential Land(Land and Building),24.0,24.0,31000000,135,100.0,2017.0,W,Southwest,Private Road,Category I Exclusively Medium-high Residential...,200.0,2017,4,40


In [20]:
missing_data = df_cleaned.isnull().sum().sort_values(ascending=False)
missing_data

Type                       0
MinTimeToNearestStation    0
MaxTimeToNearestStation    0
TradePrice                 0
Area                       0
TotalFloorArea             0
BuildingYear               0
Structure                  0
Direction                  0
Classification             0
CityPlanning               0
FloorAreaRatio             0
Year                       0
Quarter                    0
Code                       0
dtype: int64

In [21]:
df_cleaned.head()

Unnamed: 0,Type,MinTimeToNearestStation,MaxTimeToNearestStation,TradePrice,Area,TotalFloorArea,BuildingYear,Structure,Direction,Classification,CityPlanning,FloorAreaRatio,Year,Quarter,Code
8,Residential Land(Land and Building),20.0,20.0,110000000,680,1300.0,1987.0,RC,Northeast,City Road,Category I Residential Zone,200.0,2019,2,40
10,Residential Land(Land and Building),24.0,24.0,11000000,165,100.0,1985.0,W,Northwest,City Road,Category I Residential Zone,200.0,2015,2,40
14,Residential Land(Land and Building),26.0,26.0,42000000,220,95.0,2019.0,W,South,City Road,Category I Exclusively Medium-high Residential...,200.0,2019,1,40
17,Residential Land(Land and Building),24.0,24.0,29000000,130,100.0,2017.0,W,Northwest,City Road,Category II Exclusively Medium-high Residentia...,200.0,2017,4,40
18,Residential Land(Land and Building),24.0,24.0,31000000,135,100.0,2017.0,W,Southwest,Private Road,Category I Exclusively Medium-high Residential...,200.0,2017,4,40


## Train and test split with a ratio 85/15

In [22]:
split_index = int(len(df_cleaned) * 0.85)

df_cleaned = df_cleaned.sort_values(by=['Year', 'Quarter'])

In [23]:
df_cleaned[["Year", "Quarter"]].head(100050)

Unnamed: 0,Year,Quarter
125761,2005,3
130110,2005,3
133622,2005,3
133873,2005,3
628004,2005,3
...,...,...
3618839,2007,4
3618858,2007,4
3618870,2007,4
3618892,2007,4


In [24]:
train_df = df_cleaned.iloc[:split_index]
test_df = df_cleaned.iloc[split_index:]

In [25]:
print(test_df[['Year', 'Quarter']].head(100000))

        Year  Quarter
523286  2017        4
523383  2017        4
523395  2017        4
523466  2017        4
523665  2017        4
...      ...      ...
977916  2018        4
977965  2018        4
978100  2018        4
978442  2018        4
978752  2018        4

[100000 rows x 2 columns]


In [26]:
def cap_outliers(series, lower_percentile=0.01, upper_percentile=0.99):
    lower = series.quantile(lower_percentile)
    upper = series.quantile(upper_percentile)
    return np.clip(series, lower, upper)

In [27]:
columns_to_cap = ["Area", "TotalFloorArea","FloorAreaRatio"]
for col in columns_to_cap:    
    train_df[col] = cap_outliers(train_df[col])

In [28]:
missing_numerical = train_df.select_dtypes(include=['float64', 'int64']).isna().sum().sort_values(ascending=False)
missing_percentage = (train_df.select_dtypes(include=['float64', 'int64']).isnull().mean() * 100).sort_values(ascending=False)
print("Missing values in numerical variables:")
print(missing_numerical[missing_numerical > 0])
print("\n")
print(missing_percentage[missing_numerical > 0])

Missing values in numerical variables:
Series([], dtype: int64)


Series([], dtype: float64)


In [29]:
X_train = train_df.drop(columns=["TradePrice"])
y_train = train_df["TradePrice"]
X_test = test_df.drop(columns=["TradePrice"])
y_test = test_df["TradePrice"]

In [None]:
relevant_features = ['MunicipalityCode', 'TotalFloorArea', 'Breadth',
                     'Area', 'BuildingYear', 'Classification', 'FloorAreaRatio',
                     'Frontage', 'Direction', 'Use', 'MaxTimeToNearestStation', 
                     'Code', 'MinTimeToNearestStation'
]
X_train = X_train[relevant_features]
X_test = X_test[relevant_features]
X_test = X_test[relevant_features]

## XGBoost

In [57]:
import xgboost
print(xgboost.__version__)

2.1.3


In [30]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

categorical_cols = train_df.select_dtypes(include=['object']).columns
numerical_columns = train_df.select_dtypes(include=['number']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_cols)
    ],
    remainder='passthrough'
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler(with_mean=False)),
    ('model', XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42))
])

pipeline.fit(X_train, y_train)

y_pred_xgb = pipeline.predict(X_test)

mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = root_mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print("XGBoost Model Performance:")
print(f"Mean Absolute Error: {mae_xgb:.4f}")
print(f"Mean Squared Error: {mse_xgb:.4f}")
print(f"Root Mean Squared Error: {rmse_xgb:.4f}")
print(f"R-squared: {r2_xgb:.4f}")

In [62]:
categorical_cols = train_df.select_dtypes(include=['object']).columns
numerical_columns = train_df.select_dtypes(include=['number']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_cols)
    ],
    remainder='passthrough'
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler(with_mean=False)),
    ('model', XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42))
])

pipeline.fit(X_train, y_train)

y_pred_xgb = pipeline.predict(X_test)

mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = root_mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print("XGBoost Model Performance:")
print(f"Mean Absolute Error: {mae_xgb:.4f}")
print(f"Mean Squared Error: {mse_xgb:.4f}")
print(f"Root Mean Squared Error: {rmse_xgb:.4f}")
print(f"R-squared: {r2_xgb:.4f}")

XGBoost Model Performance:
Mean Absolute Error: 17656755.2367
Mean Squared Error: 14770734741483424.0000
Root Mean Squared Error: 121534911.6159
R-squared: 0.5236


## Ensemble Method (Stacking Regressor)

In [34]:
print(X_train.shape)
print(y_train.shape)

(995038, 14)
(995038,)


In [35]:
categorical_cols = train_df.select_dtypes(include=['object']).columns
train_df_num = train_df.drop(columns=categorical_cols)
test_df_num = test_df.drop(columns=categorical_cols)
X_train_num = train_df_num.drop(columns=["TradePrice"])
y_train_num = train_df_num["TradePrice"]
X_test_num = test_df_num.drop(columns=["TradePrice"])
y_test_num = test_df_num["TradePrice"]

In [37]:
from sklearn.ensemble import StackingRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

print(X_train_num.shape, y_train_num.shape)
base_models = [
    ('linear', LinearRegression()),          
    ('gb', GradientBoostingRegressor(n_estimators=50, max_depth=3, random_state=42)),
    ('xgboost', XGBRegressor(n_estimators=50, learning_rate=0.1, random_state=42))
]

stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=LinearRegression(),
    n_jobs=-1,  
    passthrough=False  
)

stacking_model.fit(X_train_num, y_train_num)

y_pred_ensemble = stacking_model.predict(X_test_num)

mae_ensemble = mean_absolute_error(y_test_num, y_pred_ensemble)
mse_ensemble = mean_squared_error(y_test_num, y_pred_ensemble)
rmse_ensemble = root_mean_squared_error(y_test_num, y_pred_ensemble)
r2_ensemble = r2_score(y_test_num, y_pred_ensemble)

print("\nEnsemble Model Performance:")
print(f"Mean Absolute Error: {mae_ensemble:.4f}")
print(f"Mean Squared Error: {mse_ensemble:.4f}")
print(f"Root Mean Squared Error: {rmse_ensemble:.4f}")
print(f"R-squared: {r2_ensemble:.4f}")


(995038, 9) (995038,)

Ensemble Model Performance:
Mean Absolute Error: 19256133.3689
Mean Squared Error: 15968662136134026.0000
Root Mean Squared Error: 126367171.9084
R-squared: 0.4849


In [42]:
from sklearn.ensemble import StackingRegressor, GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

print(X_train_num.shape, y_train_num.shape)
base_models = [
    ('linear', LinearRegression(n_jobs=-1)),          
    (('random_forest', RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))),
    ('xgboost', XGBRegressor(n_estimators=150, learning_rate=0.1, random_state=42))
]

stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=LinearRegression(n_jobs=-1),
    n_jobs=-1,  
    passthrough=False  
)

stacking_model.fit(X_train_num, y_train_num)

y_pred_ensemble = stacking_model.predict(X_test_num)

mae_ensemble = mean_absolute_error(y_test_num, y_pred_ensemble)
mse_ensemble = mean_squared_error(y_test_num, y_pred_ensemble)
rmse_ensemble = root_mean_squared_error(y_test_num, y_pred_ensemble)
r2_ensemble = r2_score(y_test_num, y_pred_ensemble)

print("\nEnsemble Model Performance:")
print(f"Mean Absolute Error: {mae_ensemble:.4f}")
print(f"Mean Squared Error: {mse_ensemble:.4f}")
print(f"Root Mean Squared Error: {rmse_ensemble:.4f}")
print(f"R-squared: {r2_ensemble:.4f}")


(995038, 9) (995038,)

Ensemble Model Performance:
Mean Absolute Error: 18758816.5935
Mean Squared Error: 15816978279806144.0000
Root Mean Squared Error: 125765568.7373
R-squared: 0.4898


In [43]:
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression


categorical_cols = train_df.select_dtypes(include=['object']).columns
numerical_columns = train_df.select_dtypes(include=['number']).columns


preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_cols)
    ],
    remainder='passthrough'
)

base_models = [
    ('linear', LinearRegression(n_jobs=-1)),          
    ('random_forest', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
    ('xgboost', XGBRegressor(n_estimators=70, learning_rate=0.1, random_state=42))
]

stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=LinearRegression(n_jobs=-1),
    n_jobs=-1,  
    passthrough=False  
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('stacking', stacking_model)
])

pipeline.fit(X_train, y_train)

y_pred_ensemble = pipeline.predict(X_test)

mae_ensemble = mean_absolute_error(y_test, y_pred_ensemble)
mse_ensemble = mean_squared_error(y_test, y_pred_ensemble)
rmse_ensemble = root_mean_squared_error(y_test, y_pred_ensemble)
r2_ensemble = r2_score(y_test, y_pred_ensemble)

print("\nEnsemble Model Performance:")
print(f"Mean Absolute Error: {mae_ensemble:.4f}")
print(f"Mean Squared Error: {mse_ensemble:.4f}")
print(f"Root Mean Squared Error: {rmse_ensemble:.4f}")
print(f"R-squared: {r2_ensemble:.4f}")


KeyboardInterrupt: 