In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
litvinenko630_real_estate_saint_petersburg_2014_2019_path = kagglehub.dataset_download('litvinenko630/real-estate-saint-petersburg-2014-2019')

print('Data source import complete.')


# Introduction | Real Estate Saint-Petersburg 2014-2019

The dataset was obtained from Kaggle Datasets.  
It contains real estate listings together with various property features. Source:  
https://www.kaggle.com/datasets/litvinenko630/real-estate-saint-petersburg-2014-2019/data


### Goal  

The completed version of this notebook will provide a baseline model that predicts real estate prices.

## Importing libraries, funcs & data

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [None]:
# Load the listings; the file is tab-separated despite its .csv extension.
df = pd.read_csv('data/real_estate_data.csv', delimiter='\t')

## Data exploration

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe().round(decimals=2)

In [None]:


df.isna().sum().sort_values(ascending=False)

In [None]:


df.head()

In [None]:

df.duplicated().sum()

In [None]:
df.nunique().sort_values()

In [None]:



df.dtypes

In [None]:
df.shape

## Data Preprocessing

In [None]:
df['last_price'].sort_values()






There is an anomalously low `last_price` value that has to be corrected.

In [None]:
df.query('last_price==12190.0')

In [None]:
# Correct the single anomalously low price (12190.0, inspected in the query
# above) by scaling it by 100.
# The row is selected by its known value rather than by "the current minimum":
# selecting the minimum makes the cell non-idempotent, because a second run
# would multiply the next (legitimate) minimum price by 100 as well.
# NOTE(review): assumes 12190.0 is the dataset minimum, as the preceding
# query suggests — confirm against the raw data.
anomalous_price = 12190.0
df.loc[df['last_price'] == anomalous_price, 'last_price'] = int(anomalous_price * 100)

In [None]:



df[df['ponds_nearest'].isna()][ 'ponds_around3000'].value_counts()

In [None]:

df[df['parks_nearest'].isna()][ 'parks_around3000'].value_counts()

In [None]:
features_to_compare = ['airports_nearest', 'cityCenters_nearest']

df[features_to_compare] = df[features_to_compare].fillna(0)


In [None]:


from sklearn.metrics import pairwise_distances
def fulfill_by_closest(
        df: pd.DataFrame,
        features: list[str],
        features_to_compare: list[str]
):
    """Fill missing values of each feature from the nearest row that has one.

    For every column in ``features``, each row where the column is NaN receives
    the value of the row (among those where the column is present) that is
    closest in Euclidean distance over the ``features_to_compare`` columns.

    Args:
        df: Frame to fill. It is modified in place.
        features: Columns whose missing values should be filled.
        features_to_compare: Columns used to measure the distance between rows.

    Returns:
        None. The result is written back into ``df``.
    """
    for feature in features:
        # Compute the mask once so that selection and assignment stay in sync.
        missing_mask = df[feature].isna()
        known_values = df[~missing_mask]
        unknown_values = df[missing_mask]

        # Nothing to fill, or nothing to copy from: skip instead of handing an
        # empty matrix to pairwise_distances / argmin, which would raise
        # (e.g. when this cell is re-run after the gaps were already filled).
        if unknown_values.empty or known_values.empty:
            continue

        # Rows = rows to fill, columns = candidate donors.
        distances = pairwise_distances(
            unknown_values[features_to_compare],
            known_values[features_to_compare],
            metric='euclidean'
        )
        nearest_indices = distances.argmin(axis=1)
        df.loc[missing_mask, feature] = known_values.iloc[nearest_indices][feature].values

In [None]:
# Park/pond columns whose gaps are copied from the closest listing, where
# "closest" is measured over the columns listed in features_to_compare.
features_to_fill = [
    'parks_nearest',
    'ponds_nearest',
    'ponds_around3000',
    'parks_around3000',
]
fulfill_by_closest(df, features_to_fill, features_to_compare)

In [None]:

df.dropna(subset=['locality_name'], inplace=True)

In [None]:


df.loc[df['floors_total'].isna(), 'floors_total'] = df.loc[df['floors_total'].isna(), 'floor']

In [None]:

# Keep only listings in buildings with at most 37 floors.
# .copy() makes the filtered frame independent of the original, so the
# column assignments in later cells do not trigger SettingWithCopyWarning.
# NOTE(review): 37 is presumably the tallest plausible building in the data — confirm.
df = df[df['floors_total'] <= 37].copy()

In [None]:
df.loc[df['balcony'].isna()==True,'balcony']=0

In [None]:
grouped = df[df['ceiling_height'].notna()].groupby(['locality_name'])

In [None]:
# Replace missing ceiling heights with the mean height of the same locality.
# Localities without any known height keep NaN.
locality_mean_height = df.groupby('locality_name')['ceiling_height'].transform('mean')
df['ceiling_height'] = df['ceiling_height'].fillna(locality_mean_height)


In [None]:

df.dropna(subset=['ceiling_height'], inplace=True)

In [None]:
df.shape

In [None]:
df.isna().sum().sort_values(ascending=False)

In [None]:
# Impute missing kitchen and living areas with the floored column mean.
for area_column in ('kitchen_area', 'living_area'):
    floored_mean = np.floor(df[area_column].mean())
    df.loc[df[area_column].isna(), area_column] = floored_mean


In [None]:
# Drop listings whose total area is 20 or less.
# .copy() detaches the result from the frame it was sliced from, so the new
# column added in a later cell is not written into a view
# (SettingWithCopyWarning).
df = df[df['total_area'] > 20].copy()

In [None]:
df.shape

In [None]:
df.loc[:,'non_living_area'] = df['total_area']-(df['living_area']+df['kitchen_area'])

In [None]:
# Drop rows where living + kitchen area is not smaller than the total area.
# .copy() keeps the later fill and column assignments from operating on a
# view of the previous frame (SettingWithCopyWarning).
df = df[df['non_living_area'] > 0].copy()

In [None]:
# Fill every remaining NaN with 0. Rebinding the result avoids mutating, in
# place, a frame that was produced by boolean-mask slicing.
df = df.fillna(value=0)

In [None]:
# Parse the listing date, then split it into calendar components.
df['first_day_exposition'] = pd.to_datetime(df['first_day_exposition'])
exposition_date = df['first_day_exposition'].dt
df['exposition_year'] = exposition_date.year
df['exposition_month'] = exposition_date.month
df['exposition_day'] = exposition_date.day
df['exposition_weekday'] = exposition_date.weekday

# Store the flag columns as integers.
for flag_column in ('studio', 'open_plan'):
    df[flag_column] = df[flag_column].astype(int)

In [None]:

df['log_last_price'] = np.log1p(df['last_price'])

## Exploratory Data Analysis (EDA)

In [None]:
import seaborn as sns
def create_matrix_correlation(data):
    """Show an annotated heatmap of the pairwise correlations of ``data``.

    Args:
        data: Object exposing ``.corr()`` (a DataFrame in this notebook).
    """
    correlations = data.corr()
    heatmap_style = dict(annot=True, fmt='.2f', cmap='coolwarm', cbar=True)

    figure = plt.figure(figsize=(10, 10))
    sns.heatmap(correlations, **heatmap_style)
    figure.tight_layout()
    plt.show()

In [None]:
def draw_distribution_graphic(
        x: pd.Series,
        title: str = "Target distribution",
        xlabel: str = "Price - log transformed",
        ylabel: str = "Frequency",
) -> None:
    """Plot a histogram with a KDE overlay for ``x``.

    Args:
        x: Values to plot.
        title: Plot title. Defaults to the label used previously.
        xlabel: X-axis label. The default describes the log-transformed price;
            pass a different label when plotting another series (for example
            the raw price), otherwise the axis is mislabelled.
        ylabel: Y-axis label.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    # Draw on the axes created above explicitly rather than on "the current axes".
    sns.histplot(x, kde=True, ax=ax)
    ax.set_title(title, fontsize=15, pad=10, loc='left')
    ax.set_xlabel(xlabel, fontsize=8)
    ax.set_ylabel(ylabel, fontsize=8)
    plt.show()

In [None]:
df.head()

In [None]:

create_matrix_correlation(df.drop(['locality_name','first_day_exposition'],axis=1))

In [None]:
['total_area', 'rooms', 'living_area', 'is_apartment', 'kitchen_area',
       'airports_nearest', 'parks_nearest', 'non_living_area']

In [None]:
X = df.drop(['last_price','first_day_exposition','locality_name','studio', 'open_plan','log_last_price'], axis=1)

y = df['log_last_price']

In [None]:
draw_distribution_graphic(
    x=df.last_price
)

In [None]:
draw_distribution_graphic(
    x=df.log_last_price
)


## Feature Selection

In [None]:
df.isna().sum().sort_values(ascending=False)

In [None]:

from sklearn.feature_selection import f_classif, f_regression, SelectKBest

# The target (log price) is continuous, so features are scored with the
# regression F-test. f_classif is an ANOVA test for class labels and would
# treat every distinct price as its own class.
# NOTE(review): the selector is fitted on the full data set before the
# train/validation/test split, so the held-out rows influence which features
# are kept — consider fitting it on the training split only.
selector = SelectKBest(
    score_func=f_regression,
    k=12
)
X_new = selector.fit_transform(X, y)

In [None]:

X_new

In [None]:
y

In [None]:

selected_indices = selector.get_support(indices=True)
print("Selected feature indices:", selected_indices)

In [None]:
selected_feature_names = X.columns[selected_indices]
print("Selected feature names:", selected_feature_names)

## Splitting Data into Train, Validation & Test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows. A fixed random_state makes the split (and every
# metric computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    train_size = 0.8,
    test_size = 0.2,
    random_state = 42
)

In [None]:
# Carve a further 10% out of the training rows. A fixed random_state keeps
# the split reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    train_size = 0.9,
    test_size = 0.1,
    random_state = 42
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to every split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled, X_val_scaled = (
    scaler.transform(split) for split in (X_train, X_test, X_val)
)

## Model building & training

In [None]:
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from dataclasses import dataclass
import abc
from typing import Dict, Any, Type
from sklearn.model_selection import GridSearchCV


@dataclass
class ModelBuildingHelper:
    """Fit one estimator (optionally through a grid search) and print metrics.

    Attributes:
        model_class: An already-constructed estimator instance (despite the
            name, not a class). It is fitted directly or handed to
            GridSearchCV as ``estimator``.
        X_train, y_train: Data used to fit the estimator.
        X_val, y_val: Data scored by ``evaluate_model``.
        test_x_data, test_y_data: Data scored by ``get_prediction``.
        params_grid: Optional parameter grid. When truthy, ``train_model`` runs
            a GridSearchCV (cv=5) and keeps its best estimator.
        bagging_params: Optional keyword arguments for a BaggingRegressor built
            around the estimator.
            NOTE(review): the resulting ``bagging_model`` is constructed but
            never fitted or used by any method of this class — confirm whether
            bagging was meant to take part in training.
    """
    model_class: Any
    X_train: pd.DataFrame
    X_val: pd.DataFrame
    y_train: pd.Series
    y_val: pd.Series
    test_x_data: pd.DataFrame
    test_y_data: pd.Series
    params_grid: Dict[str, Any]  = None
    bagging_params: Dict[str, Any] = None

    def __post_init__(self):
        """Create the optional grid-search and bagging wrappers."""
        self.model_instance = self.model_class
        if self.params_grid:
            self.grid_search = GridSearchCV(
                estimator = self.model_instance,
                param_grid =  self.params_grid,
                cv = 5
            )
        else:
            self.grid_search = None

        if self.bagging_params:
            self.bagging_model = BaggingRegressor(
                estimator=self.model_instance,
                **self.bagging_params
            )
        else:
            self.bagging_model = None

    def train_model(self):
        """Fit on the training data; with a grid search, keep its best estimator."""
        # Compare against None explicitly: the truth value of an estimator
        # object is not a reliable "was it configured" signal.
        if self.grid_search is not None:
            self.grid_search.fit(self.X_train, self.y_train)
            self.model_instance = self.grid_search.best_estimator_
        else:
            self.model_instance.fit(self.X_train, self.y_train)

    def evaluate_model(self):
        """Print the metrics of the current model on ``X_val`` / ``y_val``."""
        y_val_pred = self.model_instance.predict(self.X_val)
        self.metrics(y_real=self.y_val,
                     y_pred=y_val_pred)

    def metrics(self, y_real, y_pred):
        """Print MAE, MSE and R2 for the given predictions and return them.

        Returns:
            Dict with keys ``'mae'``, ``'mse'`` and ``'r2'``, so that callers
            can compare models programmatically instead of reading the output.
        """
        mae = mean_absolute_error(y_real,y_pred)
        mse = mean_squared_error(y_real, y_pred)
        r2 = r2_score(y_real, y_pred)
        print(f'metrics:\n'
              f'mae {mae}\n'
              f'mse {mse}\n'
              f'r2 {r2}\n')
        return {'mae': mae, 'mse': mse, 'r2': r2}

    def get_prediction(self):
        """Print the metrics of the current model on ``test_x_data`` / ``test_y_data``."""
        test_pred_data = self.model_instance.predict(self.test_x_data)
        self.metrics(self.test_y_data, test_pred_data)

    def get_best_params(self):
        """Return the best parameters found by the grid search.

        Raises:
            ValueError: If no grid search was configured, or if it has not been
                fitted yet (``best_params_`` only exists after fitting).
        """
        if self.grid_search is None or not hasattr(self.grid_search, 'best_params_'):
            raise ValueError("GridSearchCV hasn't been completed yet")
        return self.grid_search.best_params_

In [None]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

linear_regression = ModelBuildingHelper(
    model_class = model,
    X_train=X_train_scaled,
    X_val=X_test_scaled,
    y_train=y_train,
    y_val=y_test,
    test_x_data=X_val_scaled,
    test_y_data=y_val
)

In [None]:
linear_regression.train_model()

In [None]:
linear_regression.evaluate_model()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so that the grid-search result and the reported metrics are
# reproducible across kernel restarts (42 is the seed used by the other
# seeded models in this notebook).
model = RandomForestRegressor(random_state=42)

# Only min_samples_leaf is actually searched; the other entries are fixed.
parameters = {'n_estimators': [300],
              'max_depth': [20],
              'min_samples_split': [4],
              'min_samples_leaf': [2,4]
              }

randomforest_regressor = ModelBuildingHelper(
    model_class = model,
    X_train=X_train_scaled,
    X_val=X_test_scaled,
    y_train=y_train,
    y_val=y_test,
    test_x_data=X_val_scaled,
    test_y_data=y_val,
    params_grid=parameters,
)

In [None]:
randomforest_regressor.train_model()

In [None]:
randomforest_regressor.evaluate_model()

In [None]:
randomforest_regressor.get_best_params()

In [None]:
from xgboost import XGBRegressor

model = XGBRegressor(
    eval_metric='rmse',
)

# Keyword arguments for the BaggingRegressor created by ModelBuildingHelper.
# NOTE(review): the helper builds that bagging model but its train_model()
# never fits it, so these settings currently have no effect on the results.
bagging_params = dict(
    n_estimators=5,
    random_state=42,
)

parameters = {'n_estimators': [200,400],
              'learning_rate': [0.1],
              'max_depth': [5],
              'subsample': [1.0],
              'colsample_bytree': [0.6, 0.8],
              'alpha': [1.0],
              'lambda': [1.0],
              'min_child_weight': [6,8,10]
              }

xgb_regressor = ModelBuildingHelper(
    model_class = model,
    X_train=X_train_scaled,
    X_val=X_test_scaled,
    y_train=y_train,
    y_val=y_test,
    test_x_data=X_val_scaled,
    test_y_data=y_val,
    params_grid=parameters,
    bagging_params=bagging_params
)


In [None]:
xgb_regressor.train_model()

In [None]:

xgb_regressor.evaluate_model()

In [None]:
xgb_regressor.get_best_params()

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

model = ExtraTreesRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)

In [None]:
extra_trees_regressor = ModelBuildingHelper(
    model_class = model,
    X_train=X_train_scaled,
    X_val=X_test_scaled,
    y_train=y_train,
    y_val=y_test,
    test_x_data=X_val_scaled,
    test_y_data=y_val,
)
extra_trees_regressor.train_model()

In [None]:
extra_trees_regressor.evaluate_model()

In [None]:
extra_trees_regressor.get_prediction()

In [None]:

import lightgbm as lgb
model = lgb.LGBMRegressor(
    objective='regression',
    boosting_type='gbdt',
    random_state=42,
    verbose=-1
)

parameters = {
    'n_estimators': [100, 200, 400],
    'learning_rate': [0.01, 0.1],
    'max_depth': [5, 7],
    'num_leaves': [20, 31],
    'min_child_samples': [10],
    'subsample': [0.6],
    'colsample_bytree': [0.6],
}

lgb_regressor = ModelBuildingHelper(
    model_class=model,
    X_train=X_train_scaled,
    X_val=X_test_scaled,
    y_train=y_train,
    y_val=y_test,
    test_x_data=X_val_scaled,
    test_y_data=y_val,
    params_grid=parameters,
)



In [None]:
lgb_regressor.train_model()

In [None]:
lgb_regressor.evaluate_model()

In [None]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(
    iterations=500,
    depth=6,
    learning_rate=0.1,
    loss_function='RMSE',
    random_state=42,
    verbose=0
)


In [None]:
catboost_regressor = ModelBuildingHelper(
    model_class=model,
    X_train=X_train_scaled,
    X_val=X_test_scaled,
    y_train=y_train,
    y_val=y_test,
    test_x_data=X_val_scaled,
    test_y_data=y_val,
)

In [None]:
catboost_regressor.train_model()

In [None]:
catboost_regressor.evaluate_model()

In [None]:
from sklearn.linear_model import Lasso

model = Lasso()

parameters = {
    'alpha': [0.01, 0.1, 1.0, 10.0],
    'max_iter': [1000]
}

lasso_regressor = ModelBuildingHelper(
    model_class=model,
    X_train=X_train_scaled,
    X_val=X_test_scaled,
    y_train=y_train,
    y_val=y_test,
    test_x_data=X_val_scaled,
    test_y_data=y_val,
    params_grid=parameters
)


In [None]:
lasso_regressor.train_model()

In [None]:
lasso_regressor.evaluate_model()

In [None]:
from sklearn.ensemble import StackingRegressor
base_learners = [
    ('lr', LinearRegression()),
    ('rf', RandomForestRegressor(
        n_estimators=300,
        max_depth=20,
        min_samples_split=4,
        min_samples_leaf=4
    )),
    ('xgb', XGBRegressor(
        n_estimators=400,
        max_depth=5,
        alpha=1.0,
        colsample_bytree=0.6,
        min_child_weight=10,
        subsample=1.0,
        learning_rate=0.1)),
    ('cbr', CatBoostRegressor(
        iterations=500,
        depth=6,
        learning_rate=0.1,
        loss_function='RMSE',
        random_state=42,
        verbose=0
    ))
]

stacking_regressor_4 = StackingRegressor(
    estimators=base_learners,
    final_estimator=RandomForestRegressor(
        n_estimators=400)
)

stacking_model =  ModelBuildingHelper(
    model_class=stacking_regressor_4,
    X_train=X_train_scaled,
    X_val=X_test_scaled,
    y_train=y_train,
    y_val=y_test,
    test_x_data=X_val_scaled,
    test_y_data=y_val,
)

In [None]:
stacking_model.train_model()

In [None]:
stacking_model.evaluate_model()

In [None]:
base_learners = [
    ('lr', LinearRegression()),
    ('rf', RandomForestRegressor(
        n_estimators=300,
        max_depth=20,
        min_samples_split=4,
        min_samples_leaf=4
    )),
]


stacking_regressor_2 = StackingRegressor(
    estimators=base_learners,
    final_estimator=RandomForestRegressor(
        n_estimators=400)
)

stacking_model =  ModelBuildingHelper(
    model_class=stacking_regressor_2,
    X_train=X_train_scaled,
    X_val=X_test_scaled,
    y_train=y_train,
    y_val=y_test,
    test_x_data=X_val_scaled,
    test_y_data=y_val,
)


In [None]:
stacking_model.train_model()

In [None]:
stacking_model.evaluate_model()

## Model comparison

In [None]:
linear_regression.get_prediction()

In [None]:
randomforest_regressor.get_prediction()

In [None]:
lgb_regressor.get_prediction()

In [None]:
xgb_regressor.get_prediction()

In [None]:
catboost_regressor.get_prediction()

In [None]:
lasso_regressor.get_prediction()

In [None]:
stacking_model.get_prediction()


In [None]:
# NOTE(review): this repeats the previous cell. `stacking_model` was rebound to
# the two-learner stack when the second stacking model was built, so the
# four-learner stack is no longer reachable under this name and is not
# evaluated here.
stacking_model.get_prediction()

## Conclusions

The preprocessing involved filling missing values using the nearest neighbors approach and applying  
a log transformation to normalize the last_price feature. Feature selection was done using  
SelectKBest with f_classif, retaining the top 12 features. Data was scaled with   
StandardScaler to ensure uniform contribution from all features.

After training and optimizing various models, it was found that the CatBoost Regressor delivered the best performance.   
The results were summarized as follows:  
CatBoost Regressor  
MAE: 0.160  
MSE: 0.056	  
R_2: 0.865  

CatBoost Regressor achieved the lowest MAE and MSE, and the highest R² score,  
making it the most effective model in this analysis.  
Overall, the preprocessing and feature selection steps contributed significantly  
to the model performance, and stacking models helped in leveraging the strengths of different algorithms.

To further improve the results, additional steps such as more advanced feature  
engineering and more extensive hyperparameter tuning could be considered.