In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from pathlib import Path
from sklearn.model_selection import GridSearchCV, train_test_split

pd.set_option("display.max_columns", 500)

np.random.seed(2137) 


In [6]:
df = pd.read_csv('appartments_train.csv')

df.sample(10)

Unnamed: 0,unit_id,obj_type,dim_m2,n_rooms,floor_no,floor_max,year_built,dist_centre,n_poi,dist_sch,dist_clinic,dist_post,dist_kind,dist_rest,dist_uni,dist_pharma,own_type,build_mat,cond_class,has_park,has_balcony,has_lift,has_sec,has_store,price_z,src_month,loc_code,market_volatility,infrastructure_quality,neighborhood_crime_rate,popularity_index,green_space_ratio,estimated_maintenance_cost,global_economic_index
24270,78a1de0708226437,,48.83,2.0,1.0,3.0,1998.0,7.846,11.0,0.234,0.171,0.623,0.222,0.629,1.739,0.101,12631efb,7ceffe3b,53cced8d,yes,yes,no,no,yes,704287.28,2024-03,693f303c,800711.71,32.67,48.48,62.38,0.999,9.04,105.668549
83142,115a285f1e8bbc20,0d6c4dfc,35.53,2.0,3.0,4.0,1970.0,3.96,8.0,0.294,2.154,0.848,0.286,0.268,0.316,0.255,12631efb,7f8c00f9,53cced8d,no,no,no,no,yes,313238.6,2023-08,8d5a4f0c,258100.5,37.87,26.23,48.0,1.0,3.88,104.927936
33582,a76803b46f21fb29,0d6c4dfc,25.87,2.0,3.0,4.0,1963.0,6.563,23.0,0.267,0.552,0.074,0.329,0.09,1.316,0.061,12631efb,7ceffe3b,,no,yes,no,no,yes,493304.86,2024-04,e0cff11b,409311.56,75.92,17.89,50.33,0.999,12.23,90.167666
1257,6424c0db2a193b6b,0d6c4dfc,58.98,3.0,,3.0,2022.0,4.714,10.0,0.317,2.25,1.087,0.264,0.152,2.146,0.569,12631efb,7ceffe3b,,no,yes,yes,no,no,1991412.59,2023-10,693f303c,2008616.8,1.0,3.01,38.72,1.0,23.9,94.343251
9115,390664e65d2bd159,0c238f18,51.72,2.0,3.0,3.0,,0.593,121.0,0.016,0.108,0.258,0.1,0.026,0.111,0.023,12631efb,7ceffe3b,,no,no,no,no,no,583340.31,2023-08,0ab06839,497593.97,,92.23,35.79,1.0,9.7,104.708779
38195,467a68c5b1e2a638,0c238f18,44.99,2.0,3.0,6.0,1961.0,1.315,81.0,0.133,0.535,0.173,0.425,0.015,0.305,0.452,12631efb,7ceffe3b,a2881958,no,no,yes,no,no,696695.01,2023-11,693f303c,760572.57,78.62,54.11,69.87,1.0,6.46,95.629116
77428,223961fb32c9f378,2a6d5c01,55.91,3.0,5.0,6.0,2023.0,0.251,98.0,0.495,0.728,0.312,0.265,0.04,0.946,0.249,12631efb,7ceffe3b,,no,no,yes,no,no,1372652.41,2024-06,e0cff11b,1552494.92,1.3,53.47,69.91,1.0,7.97,107.830152
66138,d405785608a95062,0d6c4dfc,75.11,4.0,3.0,3.0,2011.0,16.98,2.0,0.758,,0.72,0.273,0.364,,0.686,12631efb,7ceffe3b,a2881958,no,yes,no,yes,no,520878.21,2023-08,693f303c,522689.71,8.24,94.83,64.45,0.998,24.96,105.255044
17399,531952a298de2f42,0d6c4dfc,35.35,1.0,,3.0,1965.0,1.664,9.0,0.278,0.783,0.223,0.778,0.468,1.633,0.381,12631efb,,,no,no,no,no,yes,373127.46,2024-01,e0cff11b,381721.48,81.9,2.16,55.92,1.0,6.67,92.882208
81945,cd4b403eca3297f4,2a6d5c01,77.95,4.0,10.0,12.0,2020.0,4.882,17.0,0.355,0.112,0.135,0.322,0.266,2.365,0.062,12631efb,7ceffe3b,,no,no,yes,no,no,1689740.47,2023-12,693f303c,1812499.92,2.06,20.78,44.19,1.0,14.79,93.899531


In [7]:
df = df.drop(columns=['cond_class', 'build_mat', 'green_space_ratio'])

In [8]:
# setting "obj_type" to 'other" if its missing
df['obj_type'] = df['obj_type'].fillna('other')

# using median value (4) for max floor if its missing
df['floor_max'] = df['floor_max'].fillna(4)

# if there is no floor, choose mid of the building
df['floor_no'] = df['floor_no'].fillna(df['floor_max']/2)

# for all distances I'll use the average of mean and median
dist_columns = ['dist_centre', 'dist_sch','dist_clinic','dist_post', 'dist_kind',
                'dist_rest', 'dist_uni', 'dist_pharma']

for col in dist_columns:
    avg_mean_median = (df[col].mean() + df[col].median()) / 2
    df[col] = df[col].fillna(avg_mean_median)

# managing all "has.." variables. If null happens - we will code it as "no". Then change into boolean
has_columns = ['has_park', 'has_balcony', 'has_lift', 'has_sec', 'has_store']

for col in has_columns:
    df[col] = df[col].fillna('no')
    df[col] = df[col].map({'no':0, 'yes':1})


# dealing with other measures. If NA happen, I will use average of mean and median (to mitigate outliers). 

other_measures = ['market_volatility', 'infrastructure_quality', 'neighborhood_crime_rate', 'popularity_index',
                'estimated_maintenance_cost', 'global_economic_index']

for col in other_measures:
    avg_mean_median = (df[col].mean() + df[col].median()) / 2
    df[col] = df[col].fillna(avg_mean_median)

In [9]:
from sklearn.ensemble import RandomForestRegressor

# Step 1: Identify indices
known_mask = df['year_built'].notna()
missing_mask = df['year_built'].isna()

# Step 2: Define features
features = ['floor_no', 'floor_max', 'n_rooms', 'dim_m2', 'dist_centre', 'obj_type',
            'price_z', 'infrastructure_quality', 'has_lift', 'estimated_maintenance_cost']

# Step 3: Prepare known data
df_known = df.loc[known_mask, features + ['year_built']].copy()
X_known = pd.get_dummies(df_known[features])
y_known = df_known['year_built']

# Step 4: Prepare missing data (use same dummies structure!)
df_missing = df.loc[missing_mask, features].copy()
X_missing = pd.get_dummies(df_missing)

# Step 5: Align columns in case of missing dummy columns in either set
X_missing = X_missing.reindex(columns=X_known.columns, fill_value=0)

# Step 6: Train model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_known, y_known)

# Step 7: Predict and impute
df.loc[missing_mask, 'year_built'] = rf.predict(X_missing).round().astype(int)

In [10]:
df['src_month'] = pd.to_datetime(df['src_month'], format='%Y-%m')
df['src_year'] = df['src_month'].dt.year
df['src_month'] = df['src_month'].dt.month

In [11]:
# checking if apartment is on the last floor
df['last_floor'] = np.where(df['floor_no'] == df['floor_max'], 1, 0)

In [12]:
df['room_size'] = df['dim_m2'] / df['n_rooms']

df['apart_age'] = df['src_year'] - df['year_built']

df['is_old_building'] = np.where((df['apart_age'] >= 100), 1, 0)
df['is_new_building'] = np.where((df['apart_age'] <= 10), 1, 0)

df['has_all_amenities'] = np.where((df['has_park'] == 1) & (df['has_balcony'] == 1) & (df['has_lift'] == 1) & (df['has_sec'] == 1), 1, 0)


In [13]:
from sklearn.preprocessing import PowerTransformer

vars_to_transform = [
    'dist_centre', 'dist_uni', 'infrastructure_quality',
    'estimated_maintenance_cost', 'dist_clinic', 'dist_post', 'dist_sch',
    'dist_pharma', 'dist_kind', 'dist_rest', 'n_poi', 'room_size', 'apart_age'
]

pt = PowerTransformer(method='yeo-johnson', standardize=True)
df[vars_to_transform] = pt.fit_transform(df[vars_to_transform])

In [14]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Define distance-related variables
distance_vars = [
    'dist_centre', 'dist_sch', 'dist_clinic', 'dist_post', 'dist_kind',
    'dist_rest', 'dist_uni', 'dist_pharma', 'n_poi'
]

# Extract data for PCA
X_dist = df[distance_vars]

# Standardize the distance variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_dist)

# Apply PCA
pca = PCA(n_components=1)
accessibility_scores = pca.fit_transform(X_scaled).ravel()

# Assign meta-variable back to the original DataFrame
df['overall_accessibility'] = accessibility_scores

# Show contribution of each feature to the PCA component
print("PCA component loadings (positive → further, negative → closer):")
print(pd.Series(pca.components_[0], index=distance_vars).sort_values())

df = df.drop(distance_vars, axis = 1)

PCA component loadings (positive → further, negative → closer):
n_poi         -0.438512
dist_kind      0.241474
dist_centre    0.296308
dist_uni       0.317427
dist_post      0.318704
dist_clinic    0.321495
dist_sch       0.341227
dist_pharma    0.343904
dist_rest      0.348141
dtype: float64


In [15]:
df['log_price'] = np.log(df['price_z'])

In [16]:
# turing year into categorical and one hot encoding. Dropping first to omit collinearity

bins = [0, 1900, 1920, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020, 2030]
labels = [
    'before_1900', '1900_1920', '1920_1940', '1940_1950', '1950_1960', '1960_1970',
    '1970_1980', '1980_1990', '1990_2000', '2000_2010', '2010_2020', 'after_2020'
]

df['year_built_cat'] = pd.cut(df['year_built'], bins=bins, labels=labels, right=False)

df = pd.get_dummies(df, columns=['year_built_cat'], prefix='', prefix_sep='', drop_first=True)


df = df.drop(columns=['year_built'], axis =1)

In [17]:
# turing obj_type into dummies - only 4 options
df = pd.get_dummies(df, columns=['obj_type'], prefix='obj_type', drop_first=True)

# similar for own_type
df = pd.get_dummies(df, columns=['own_type'], prefix='own_type', drop_first=True)

# and finally for loc_code
df = pd.get_dummies(df, columns=['loc_code'], prefix='loc_code', drop_first=True)

# and for src_year, why not
df = pd.get_dummies(df, columns=['src_year'], prefix='src_year', drop_first=True)

## MODELLING

In [18]:
# Prepare features and target variable
X = df.drop(columns=['price_z', 'log_price'])  # All features except targetget
y = df['price_z']  # Target variable

# Initial split: 70% training, 30% temporary holdout (stratified)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, 
    y, 
    test_size=0.30, 
    random_state=420  # Reproducibility
)

# Split temporary holdout into validation and test sets (50/50 of the 30%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, 
    y_temp, 
    test_size=0.50, 
    random_state=69  # Reproducibility
)

# Print dataset sizes for verification
print(f"Dataset sizes:")
print(f"Train: {X_train.shape}")
print(f"Validation: {X_val.shape}")
print(f"Test: {X_test.shape}")

# Check for missing values in each dataset
print("\nMissing value counts:")
print(f"Training set: {X_train.isna().sum().sum()} missing values")
print(f"Validation set: {X_val.isna().sum().sum()} missing values")
print(f"Test set: {X_test.isna().sum().sum()} missing values")

Dataset sizes:
Train: (109517, 55)
Validation: (23468, 55)
Test: (23469, 55)

Missing value counts:
Training set: 0 missing values
Validation set: 0 missing values
Test set: 0 missing values


In [19]:
# scaling

to_scale = [
    'dim_m2', 'n_rooms', 'floor_no', 'floor_max', 'src_month',
    'market_volatility', 'infrastructure_quality', 'neighborhood_crime_rate',
    'popularity_index', 'estimated_maintenance_cost', 'global_economic_index',
    'overall_accessibility', 'room_size', 'apart_age'
]

from sklearn.preprocessing import StandardScaler

# Step 1: Fit scaler on training data
scaler = StandardScaler()
X_train[to_scale] = scaler.fit_transform(X_train[to_scale])

# Step 2: Transform validation and test data with the same scaler
X_val[to_scale] = scaler.transform(X_val[to_scale])
X_test[to_scale] = scaler.transform(X_test[to_scale])

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
import numpy as np

def train_and_tune_regressor(model, param_grid, X_train, y_train, X_val, y_val, 
                             model_name='Model', cv=5, n_jobs=-1, verbose=1):
    """
    Trains and tunes a regression model using GridSearchCV.
    
    Args:
        model: Base regressor model
        param_grid: Dictionary of hyperparameters to tune
        X_train: Training features
        y_train: Training target
        X_val: Validation features
        y_val: Validation target
        model_name: Name for model identification
        cv: Number of cross-validation folds
        n_jobs: Number of jobs to run in parallel
        verbose: Controls verbosity
        
    Returns:
        best_estimator: The best performing model from GridSearchCV
        results: Dictionary with evaluation metrics
    """
    
    # Create scoring dictionary
    scoring = {
        'RMSE': make_scorer(lambda y, y_pred: np.sqrt(mean_squared_error(y, y_pred)), greater_is_better=False),
        'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
        'R2': 'r2'
    }

    # Create pipeline with only the regressor
    pipeline = Pipeline(steps=[
        ('regressor', model)
    ])

    # Configure GridSearchCV
    grid = GridSearchCV(
        estimator=pipeline,
        param_grid={'regressor__' + k: v for k, v in param_grid.items()},  # Add regressor prefix
        scoring=scoring,
        refit='RMSE',  # Metric to choose best model
        cv=cv,
        n_jobs=n_jobs,
        verbose=verbose
    )

    # Train model with hyperparameter tuning
    grid.fit(X_train, y_train)

    # Evaluate on validation set
    y_val_pred = grid.predict(X_val)
    
    # Calculate metrics
    val_metrics = {
        'RMSE': np.sqrt(mean_squared_error(y_val, y_val_pred)),
        'MAE': mean_absolute_error(y_val, y_val_pred),
        'R2': r2_score(y_val, y_val_pred)
    }

    # Print results
    print(f"\n{model_name} Results")
    print("=" * 50)
    print("Best parameters:", grid.best_params_)
    print(f"Validation RMSE: {val_metrics['RMSE']:.4f}")
    print(f"Validation MAE: {val_metrics['MAE']:.4f}")
    print(f"Validation R2: {val_metrics['R2']:.4f}")
    print("=" * 50)

    return grid.best_estimator_, val_metrics

In [21]:
from sklearn.linear_model import Ridge

ridge_model = Ridge()

param_grid_ridge = {
    'alpha': [1e-05],
    'fit_intercept': [True],
    'solver': ['auto'],
    'positive': [False]
}

best_ridge, metrics_ridge = train_and_tune_regressor(
    model=ridge_model,
    param_grid=param_grid_ridge,
    X_train=X_train,
    y_train=y_train,
    X_val=X_test,
    y_val=y_test,
    model_name='Ridge Regression'
)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/sklearn/linear_model/_ridge.py", line 1167, in fit
    X, y = self._validate_data(
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/sklearn/base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1263, in check_X_y
    X = check_array(
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/sklearn/utils/validation.py", line 921, in check_array
    array = array.astype(new_dtype)
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/pandas/core/generic.py", line 6643, in astype
    new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/pandas/core/internals/managers.py", line 430, in astype
    return self.apply(
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/pandas/core/internals/managers.py", line 363, in apply
    applied = getattr(b, f)(**kwargs)
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/pandas/core/internals/blocks.py", line 758, in astype
    new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/pandas/core/dtypes/astype.py", line 237, in astype_array_safe
    new_values = astype_array(values, dtype, copy=copy)
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/pandas/core/dtypes/astype.py", line 182, in astype_array
    values = _astype_nansafe(values, dtype, copy=copy)
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/pandas/core/dtypes/astype.py", line 133, in _astype_nansafe
    return arr.astype(dtype, copy=True)
ValueError: could not convert string to float: '0c59d09b6459dc36'

--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/sklearn/linear_model/_ridge.py", line 1167, in fit
    X, y = self._validate_data(
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/sklearn/base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1263, in check_X_y
    X = check_array(
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/sklearn/utils/validation.py", line 921, in check_array
    array = array.astype(new_dtype)
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/pandas/core/generic.py", line 6643, in astype
    new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/pandas/core/internals/managers.py", line 430, in astype
    return self.apply(
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/pandas/core/internals/managers.py", line 363, in apply
    applied = getattr(b, f)(**kwargs)
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/pandas/core/internals/blocks.py", line 758, in astype
    new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/pandas/core/dtypes/astype.py", line 237, in astype_array_safe
    new_values = astype_array(values, dtype, copy=copy)
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/pandas/core/dtypes/astype.py", line 182, in astype_array
    values = _astype_nansafe(values, dtype, copy=copy)
  File "/opt/anaconda3/envs/nlp_workshop/lib/python3.9/site-packages/pandas/core/dtypes/astype.py", line 133, in _astype_nansafe
    return arr.astype(dtype, copy=True)
ValueError: could not convert string to float: 'ab72911cefb60f2b'
