In [50]:
from sklearn.model_selection import GridSearchCV, train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from pathlib import Path

# Let pandas display up to 500 columns so the wide one-hot-encoded frame is not truncated.
pd.set_option("display.max_columns", 500)

# Seed NumPy's legacy global RNG. The train/validation/test splits below pass
# their own explicit `random_state`, so they do not depend on this seed.
np.random.seed(2137) 


In [64]:
# Load the modelling table; the path is relative to the notebook's working directory.
df = pd.read_csv('new_train_df.csv')

In [65]:
# Preview the first rows of the loaded frame (DataFrame.head defaults to five rows).
df.head()

Unnamed: 0,dim_m2,n_rooms,floor_no,floor_max,has_park,has_balcony,has_lift,has_sec,has_store,price_z,src_month,market_volatility,infrastructure_quality,neighborhood_crime_rate,popularity_index,estimated_maintenance_cost,global_economic_index,last_floor,overall_accessibility,log_price,1900_1920,1920_1940,1940_1950,1950_1960,1960_1970,1970_1980,1980_1990,1990_2000,2000_2010,2010_2020,after_2020,obj_type_0d6c4dfc,obj_type_2a6d5c01,obj_type_other,own_type_4e625087,own_type_bfb8fe10,loc_code_143768f7,loc_code_378f340c,loc_code_3cb4aaff,loc_code_533f6886,loc_code_570cb745,loc_code_64a58667,loc_code_6900ba06,loc_code_693f303c,loc_code_765f79ed,loc_code_81b10147,loc_code_8d5a4f0c,loc_code_a6d54bd1,loc_code_e0cff11b,loc_code_ece39f3d,src_year_2024
0,45.89,2.0,1.0,4.0,1,1,1,0,1,519626.21,9,501710.76,-0.55305,95.39,44.51,-0.165329,100.291946,0,1.465493,13.160865,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
1,27.64,1.0,1.0,2.0,1,1,0,0,0,162959.26,4,147763.87,1.673214,46.17,56.25,-1.190179,91.315644,0,-0.738866,12.001256,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True
2,62.18,2.0,1.0,2.0,0,1,0,0,1,1167571.51,10,1042847.59,0.13791,18.94,50.36,0.576055,93.681619,0,2.194918,13.970437,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
3,53.68,2.0,2.0,4.0,1,0,1,0,0,907071.16,1,728839.39,-0.973734,11.84,46.69,-1.124044,94.192062,0,0.392663,13.717976,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True
4,70.89,3.0,2.0,3.0,1,1,1,0,0,1080383.19,11,1263171.15,-0.715781,89.64,45.6,-0.982982,96.166051,0,1.428269,13.892826,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False


In [66]:
# List every column name, then the (rows, columns) shape of the frame.
print(df.columns)
print(df.shape)

Index(['dim_m2', 'n_rooms', 'floor_no', 'floor_max', 'has_park', 'has_balcony',
       'has_lift', 'has_sec', 'has_store', 'price_z', 'src_month',
       'market_volatility', 'infrastructure_quality',
       'neighborhood_crime_rate', 'popularity_index',
       'estimated_maintenance_cost', 'global_economic_index', 'last_floor',
       'overall_accessibility', 'log_price', '1900_1920', '1920_1940',
       '1940_1950', '1950_1960', '1960_1970', '1970_1980', '1980_1990',
       '1990_2000', '2000_2010', '2010_2020', 'after_2020',
       'obj_type_0d6c4dfc', 'obj_type_2a6d5c01', 'obj_type_other',
       'own_type_4e625087', 'own_type_bfb8fe10', 'loc_code_143768f7',
       'loc_code_378f340c', 'loc_code_3cb4aaff', 'loc_code_533f6886',
       'loc_code_570cb745', 'loc_code_64a58667', 'loc_code_6900ba06',
       'loc_code_693f303c', 'loc_code_765f79ed', 'loc_code_81b10147',
       'loc_code_8d5a4f0c', 'loc_code_a6d54bd1', 'loc_code_e0cff11b',
       'loc_code_ece39f3d', 'src_year_2024'],
   

In [67]:
# Prepare features and target variable.
# `log_price` is dropped together with the target: in the df.head() rows it is
# the natural logarithm of `price_z`, so keeping it as a feature would leak the target.
X = df.drop(columns=['price_z', 'log_price'])  # All features except the target
y = df['price_z']  # Target variable
# NOTE(review): in the df.head() rows `market_volatility` stays within roughly
# 20% of `price_z` on every row shown. Confirm it is not derived from the target,
# otherwise it leaks price information into X.

# Initial split: 70% training, 30% temporary holdout.
# This is a plain random split: no `stratify` argument is passed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=420  # Reproducibility
)

# Split the temporary holdout into validation and test sets (50/50 of the 30%).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    random_state=69  # Reproducibility
)

# Sanity check: the three splits must partition the original rows exactly.
assert len(X_train) + len(X_val) + len(X_test) == len(X)

# Print dataset sizes for verification
print("Dataset sizes:")
print(f"Train: {X_train.shape}")
print(f"Validation: {X_val.shape}")
print(f"Test: {X_test.shape}")

# Check for missing values in each dataset
print("\nMissing value counts:")
print(f"Training set: {X_train.isna().sum().sum()} missing values")
print(f"Validation set: {X_val.isna().sum().sum()} missing values")
print(f"Test set: {X_test.isna().sum().sum()} missing values")

Dataset sizes:
Train: (109517, 49)
Validation: (23468, 49)
Test: (23469, 49)

Missing value counts:
Training set: 0 missing values
Validation set: 0 missing values
Test set: 0 missing values


In [68]:
print(X.columns)

Index(['dim_m2', 'n_rooms', 'floor_no', 'floor_max', 'has_park', 'has_balcony',
       'has_lift', 'has_sec', 'has_store', 'src_month', 'market_volatility',
       'infrastructure_quality', 'neighborhood_crime_rate', 'popularity_index',
       'estimated_maintenance_cost', 'global_economic_index', 'last_floor',
       'overall_accessibility', '1900_1920', '1920_1940', '1940_1950',
       '1950_1960', '1960_1970', '1970_1980', '1980_1990', '1990_2000',
       '2000_2010', '2010_2020', 'after_2020', 'obj_type_0d6c4dfc',
       'obj_type_2a6d5c01', 'obj_type_other', 'own_type_4e625087',
       'own_type_bfb8fe10', 'loc_code_143768f7', 'loc_code_378f340c',
       'loc_code_3cb4aaff', 'loc_code_533f6886', 'loc_code_570cb745',
       'loc_code_64a58667', 'loc_code_6900ba06', 'loc_code_693f303c',
       'loc_code_765f79ed', 'loc_code_81b10147', 'loc_code_8d5a4f0c',
       'loc_code_a6d54bd1', 'loc_code_e0cff11b', 'loc_code_ece39f3d',
       'src_year_2024'],
      dtype='object')


In [69]:
# Standardise the continuous features. The 0/1 flags and the one-hot encoded
# columns are not listed here and therefore keep their original values.

from sklearn.preprocessing import StandardScaler

to_scale = [
    'dim_m2',
    'n_rooms',
    'floor_no',
    'floor_max',
    'src_month',
    'market_volatility',
    'infrastructure_quality',
    'neighborhood_crime_rate',
    'popularity_index',
    'estimated_maintenance_cost',
    'global_economic_index',
    'overall_accessibility',
]

# Learn the per-column mean and standard deviation from the training split only,
# so no statistics from the validation/test rows enter the scaler.
scaler = StandardScaler().fit(X_train[to_scale])

# Apply that single fitted scaler to every split, overwriting the columns in place.
for split_frame in (X_train, X_val, X_test):
    split_frame[to_scale] = scaler.transform(split_frame[to_scale])

In [70]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
import numpy as np

def train_and_tune_regressor(model, param_grid, X_train, y_train, X_val, y_val, 
                             model_name='Model', cv=5, n_jobs=-1, verbose=1,
                             eval_label='Validation'):
    """
    Tune a regressor with GridSearchCV and score the refit winner on a hold-out set.

    The model is wrapped in a single-step Pipeline named ``'regressor'``, so the
    keys of ``param_grid`` are given WITHOUT any prefix; the ``regressor__``
    prefix is added here. The candidate with the lowest cross-validated RMSE is
    refit on all of ``X_train`` and then evaluated on ``X_val`` / ``y_val``.

    Args:
        model: Base regressor model (unfitted estimator).
        param_grid: Dictionary mapping the regressor's hyperparameter names to
            the lists of values to try.
        X_train: Training features.
        y_train: Training target.
        X_val: Hold-out features used for the reported metrics.
        y_val: Hold-out target used for the reported metrics.
        model_name: Name printed in the results header.
        cv: Number of cross-validation folds (forwarded to GridSearchCV).
        n_jobs: Number of jobs to run in parallel (forwarded to GridSearchCV).
        verbose: Controls verbosity (forwarded to GridSearchCV).
        eval_label: Label printed in front of each hold-out metric. Defaults to
            ``'Validation'``; pass e.g. ``'Test'`` when ``X_val`` / ``y_val``
            hold the test split so the printed report is labelled correctly.

    Returns:
        best_estimator: The refit Pipeline with the best cross-validated RMSE.
        val_metrics: Dictionary with keys ``'RMSE'``, ``'MAE'`` and ``'R2'``
            computed on ``X_val`` / ``y_val``.
    """

    # Built-in scorer names replace hand-written make_scorer wrappers. The
    # "neg_" scorers return the negated error, so "greater is better" holds
    # for all three metrics and refit='RMSE' selects the lowest RMSE.
    scoring = {
        'RMSE': 'neg_root_mean_squared_error',
        'MAE': 'neg_mean_absolute_error',
        'R2': 'r2'
    }

    # Pipeline with only the regressor step; the step name defines the
    # parameter prefix used below and in `grid.best_params_`.
    pipeline = Pipeline(steps=[
        ('regressor', model)
    ])

    # Configure GridSearchCV
    grid = GridSearchCV(
        estimator=pipeline,
        param_grid={'regressor__' + name: values for name, values in param_grid.items()},
        scoring=scoring,
        refit='RMSE',  # Metric used to choose (and refit) the best candidate
        cv=cv,
        n_jobs=n_jobs,
        verbose=verbose
    )

    # Train model with hyperparameter tuning
    grid.fit(X_train, y_train)

    # Predict on the hold-out set with the refit best estimator
    y_val_pred = grid.predict(X_val)

    # Hold-out metrics, reported on the original (non-negated) scale
    val_metrics = {
        'RMSE': np.sqrt(mean_squared_error(y_val, y_val_pred)),
        'MAE': mean_absolute_error(y_val, y_val_pred),
        'R2': r2_score(y_val, y_val_pred)
    }

    # Print results
    print(f"\n{model_name} Results")
    print("=" * 50)
    print("Best parameters:", grid.best_params_)
    print(f"{eval_label} RMSE: {val_metrics['RMSE']:.4f}")
    print(f"{eval_label} MAE: {val_metrics['MAE']:.4f}")
    print(f"{eval_label} R2: {val_metrics['R2']:.4f}")
    print("=" * 50)

    return grid.best_estimator_, val_metrics

In [71]:
from sklearn.linear_model import LinearRegression

# 1. Baseline model: ordinary least squares
lr_model = LinearRegression()

# 2. Tuning grid. LinearRegression has almost nothing to tune, so the grid
#    holds a single candidate (fit_intercept could be varied as well).
param_grid_lr = dict(
    fit_intercept=[True],
    positive=[False],
)

# 3. Run the shared tuning/evaluation helper on the validation split
best_lr, metrics_lr = train_and_tune_regressor(
    model=lr_model, param_grid=param_grid_lr,
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    model_name='Linear Regression',
)


Fitting 5 folds for each of 1 candidates, totalling 5 fits

Linear Regression Results
Best parameters: {'regressor__fit_intercept': True, 'regressor__positive': False}
Validation RMSE: 95184.4110
Validation MAE: 71349.0466
Validation R2: 0.9507


In [72]:
from sklearn.linear_model import Ridge

# L2-regularised linear model
ridge_model = Ridge()

# Two penalty strengths are compared: a near-zero alpha and alpha = 1.
param_grid_ridge = dict(
    alpha=[0.00001, 1.0],
    fit_intercept=[True],
    solver=['auto'],
    positive=[False],
)

# Tune with cross-validation on the training split, report on the validation split
best_ridge, metrics_ridge = train_and_tune_regressor(
    model=ridge_model, param_grid=param_grid_ridge,
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    model_name='Ridge Regression',
)

Fitting 5 folds for each of 2 candidates, totalling 10 fits

Ridge Regression Results
Best parameters: {'regressor__alpha': 1.0, 'regressor__fit_intercept': True, 'regressor__positive': False, 'regressor__solver': 'auto'}
Validation RMSE: 95184.3651
Validation MAE: 71348.8084
Validation R2: 0.9507


In [73]:
from sklearn.linear_model import Lasso

# L1-regularised linear model; max_iter is raised to 5000 iterations
lasso_model = Lasso(max_iter=5000)

# Two penalty strengths are compared: a near-zero alpha and alpha = 1.
param_grid_lasso = dict(
    alpha=[0.00001, 1.0],
    fit_intercept=[True],
    positive=[False],
)

# Tune with cross-validation on the training split, report on the validation split
best_lasso, metrics_lasso = train_and_tune_regressor(
    model=lasso_model, param_grid=param_grid_lasso,
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    model_name='Lasso Regression',
)


Fitting 5 folds for each of 2 candidates, totalling 10 fits

Lasso Regression Results
Best parameters: {'regressor__alpha': 1.0, 'regressor__fit_intercept': True, 'regressor__positive': False}
Validation RMSE: 95184.3345
Validation MAE: 71349.5858
Validation R2: 0.9507


In [74]:
from sklearn.linear_model import ElasticNet

# Combined L1/L2-regularised linear model; max_iter is raised to 5000 iterations
elastic_model = ElasticNet(max_iter=5000)

# 2 x 2 grid over penalty strength and L1/L2 mix.
# l1_ratio=1.0 is a pure L1 penalty, i.e. the same objective as Lasso.
param_grid_elastic = dict(
    alpha=[0.00001, 1.0],
    l1_ratio=[0.01, 1.0],
    fit_intercept=[True],
)

# Tune with cross-validation on the training split, report on the validation split
best_elastic, metrics_elastic = train_and_tune_regressor(
    model=elastic_model, param_grid=param_grid_elastic,
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    model_name='ElasticNet',
)


Fitting 5 folds for each of 4 candidates, totalling 20 fits

ElasticNet Results
Best parameters: {'regressor__alpha': 1.0, 'regressor__fit_intercept': True, 'regressor__l1_ratio': 1.0}
Validation RMSE: 95184.3345
Validation MAE: 71349.5858
Validation R2: 0.9507


In [75]:
from sklearn.linear_model import Ridge

# Final check of the Ridge configuration on the held-out test split.
ridge_model = Ridge()

# Single candidate: alpha = 1.0 only, so the grid search is just a 5-fold CV fit plus a refit.
param_grid_ridge = dict(
    alpha=[1.0],
    fit_intercept=[True],
    solver=['auto'],
    positive=[False],
)

# NOTE: the TEST split is passed through the X_val / y_val arguments, so the
# "Validation ..." lines printed by the helper are test-set metrics here.
# NOTE(review): this rebinds `best_ridge` and `metrics_ridge`; any earlier
# values under these names (the validation-split results) are lost.
best_ridge, metrics_ridge = train_and_tune_regressor(
    model=ridge_model, param_grid=param_grid_ridge,
    X_train=X_train, y_train=y_train,
    X_val=X_test, y_val=y_test,
    model_name='Ridge Regression',
)


Fitting 5 folds for each of 1 candidates, totalling 5 fits

Ridge Regression Results
Best parameters: {'regressor__alpha': 1.0, 'regressor__fit_intercept': True, 'regressor__positive': False, 'regressor__solver': 'auto'}
Validation RMSE: 95551.3720
Validation MAE: 71304.1161
Validation R2: 0.9507
