In [16]:
import logging
import pandas as pd
import numpy as np
import json
import zipfile
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import optuna
from optuna.samplers import TPESampler

def add_engineered_features(df):
    # Handle zero division
    df['guests_per_room'] = df['guests'] / df['rooms'].replace(0, np.nan)

    # Create a binary feature for "apartment" in name
    if "name" in df.columns:
        df["name"] = df["name"].astype(str).fillna("")
        df["is_studio"] = df["name"].str.lower().str.contains("studio|wifi|residence|cosy|courtyard|train").astype(int)

    # Generate interaction term between lat and lon
    if "lat" in df.columns and "lon" in df.columns:
        df["lat_lon_interaction"] = df["lat"] * df["lon"]
        city_center_lat, city_center_lon = df["lat"].mean(), df["lon"].mean()

    return df

def baseline():
    logging.info("Reading train and test files")
    train = pd.read_json("train.json", orient='records')
    test = pd.read_json("test.json", orient='records')
    train, valid = train_test_split(train, test_size=1/3, random_state=123)

    # Apply feature engineering
    train = add_engineered_features(train)
    valid = add_engineered_features(valid)
    test = add_engineered_features(test)

    preprocess = ColumnTransformer(
        transformers=[
        ("lat", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["lat"]),
        ("lon", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["lon"]),
        ("lat_lon_interaction", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["lat_lon_interaction"]),
        ("min_nights", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["min_nights"]),
        ("num_reviews", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["num_reviews"]),
        ("guests", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["guests"]),
        ("cancellation", Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ]), ["cancellation"]),
        ("guests_per_room", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["guests_per_room"]),
        ("rating", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["rating"]),
        ("bathrooms", Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]), ["bathrooms"]),
        ("listing_type", Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), ["listing_type"]),
        ("room_type", Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore'))
            ]), ["room_type"]),
        ("is_studio", Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]), ["is_studio"]),

    ],
    remainder='drop'
    )

    label = 'revenue'
    
    # Preprocess the data once
    X_train = preprocess.fit_transform(train.drop([label], axis=1))
    y_train = np.log1p(train[label].values)
    X_valid = preprocess.transform(valid.drop([label], axis=1))
    y_valid = np.log1p(valid[label].values)
    X_test = preprocess.transform(test)

    # Optuna hyperparameter tuning for XGBoost
    logging.info("Starting Optuna tuning for XGBoost")
    
    def objective_xgb(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 300),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'objective': 'reg:squarederror',
            'random_state': 123,
            'verbosity': 0
        }
        
        model = xgb.XGBRegressor(**params)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
        
        pred = np.expm1(model.predict(X_valid))
        mae = mean_absolute_error(valid[label], pred)
        return mae
    
    study_xgb = optuna.create_study(direction='minimize', sampler=TPESampler(seed=123))
    study_xgb.optimize(objective_xgb, n_trials=50, show_progress_bar=True)
    
    logging.info(f"Best XGBoost MAE: {study_xgb.best_value:.3f}")
    logging.info(f"Best XGBoost params: {study_xgb.best_params}")
    
    # Optuna hyperparameter tuning for CatBoost
    logging.info("Starting Optuna tuning for CatBoost")
    
    def objective_catboost(trial):
        params = {
            'iterations': trial.suggest_int('iterations', 100, 500),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'depth': trial.suggest_int('depth', 4, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
            'min_child_samples': trial.suggest_int('min_child_samples', 1, 10),
            'random_state': 123,
            'verbose': False
        }
        
        model = CatBoostRegressor(**params)
        model.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose=False)
        
        pred = np.expm1(model.predict(X_valid))
        mae = mean_absolute_error(valid[label], pred)
        return mae
    
    study_catboost = optuna.create_study(direction='minimize', sampler=TPESampler(seed=123))
    study_catboost.optimize(objective_catboost, n_trials=50, show_progress_bar=True)
    
    logging.info(f"Best CatBoost MAE: {study_catboost.best_value:.3f}")
    logging.info(f"Best CatBoost params: {study_catboost.best_params}")
    
    # Train best models
    logging.info("Training best models on full training data")
    
    best_xgb_params = study_xgb.best_params.copy()
    best_xgb_params.update({'objective': 'reg:squarederror', 'random_state': 123, 'verbosity': 0})
    best_xgb_model = xgb.XGBRegressor(**best_xgb_params)
    best_xgb_model.fit(X_train, y_train)
    
    best_catboost_params = study_catboost.best_params.copy()
    best_catboost_params.update({'random_state': 123, 'verbose': False})
    best_catboost_model = CatBoostRegressor(**best_catboost_params)
    best_catboost_model.fit(X_train, y_train)
    
    # Generate predictions
    xgb_pred_valid = np.expm1(best_xgb_model.predict(X_valid))
    catboost_pred_valid = np.expm1(best_catboost_model.predict(X_valid))
    
    xgb_pred_test = np.expm1(best_xgb_model.predict(X_test))
    catboost_pred_test = np.expm1(best_catboost_model.predict(X_test))
    
    # Test different ensemble weights
    logging.info("\n" + "="*60)
    logging.info("Testing different ensemble strategies")
    logging.info("="*60)
    
    ensemble_strategies = {
        'XGBoost only': (1.0, 0.0),
        'CatBoost only': (0.0, 1.0),
        'XGBoost dominant (0.7/0.3)': (0.7, 0.3),
        'Balanced (0.5/0.5)': (0.5, 0.5),
        'CatBoost dominant (0.3/0.7)': (0.3, 0.7)
    }
    
    results = []
    best_strategy = None
    best_mae = float('inf')
    
    for strategy_name, (xgb_weight, catboost_weight) in ensemble_strategies.items():
        ensemble_pred_valid = xgb_weight * xgb_pred_valid + catboost_weight * catboost_pred_valid
        
        mae = mean_absolute_error(valid[label], ensemble_pred_valid)
        r2 = r2_score(valid[label], ensemble_pred_valid)
        rmse = np.sqrt(mean_squared_error(valid[label], ensemble_pred_valid))
        
        results.append({
            'strategy': strategy_name,
            'xgb_weight': xgb_weight,
            'catboost_weight': catboost_weight,
            'mae': mae,
            'r2': r2,
            'rmse': rmse
        })
        
        logging.info(f"\n{strategy_name}:")
        logging.info(f"  XGB weight: {xgb_weight:.1f}, CatBoost weight: {catboost_weight:.1f}")
        logging.info(f"  MAE: {mae:.3f}")
        logging.info(f"  R²: {r2:.4f}")
        logging.info(f"  RMSE: {rmse:.2f}")
        
        if mae < best_mae:
            best_mae = mae
            best_strategy = strategy_name
            best_weights = (xgb_weight, catboost_weight)
    
    logging.info("\n" + "="*60)
    logging.info(f"Best strategy: {best_strategy} with MAE: {best_mae:.3f}")
    logging.info("="*60)
    
    # Use best ensemble for final predictions
    best_xgb_weight, best_catboost_weight = best_weights
    final_pred_test = best_xgb_weight * xgb_pred_test + best_catboost_weight * catboost_pred_test
    
    test[label] = final_pred_test
    predicted = test[['revenue']].to_dict(orient='records')

    with zipfile.ZipFile("baseline.zip", "w", zipfile.ZIP_DEFLATED) as zipf:
        zipf.writestr("predicted.json", json.dumps(predicted, indent=2))
    
    # Save results summary
    results_df = pd.DataFrame(results)
    results_df.to_csv("ensemble_comparison.csv", index=False)
    logging.info("\nEnsemble comparison saved to 'ensemble_comparison.csv'")
    
    print(f"\nFinal Results (Best Strategy: {best_strategy}):")
    print(f"MAE: {best_mae:.3f}")
    print(f"R²: {results_df[results_df['strategy'] == best_strategy]['r2'].values[0]:.4f}")
    print(f"RMSE: {results_df[results_df['strategy'] == best_strategy]['rmse'].values[0]:.2f}")

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    baseline()

INFO:root:Reading train and test files


INFO:root:Starting Optuna tuning for XGBoost
[I 2025-12-02 17:24:57,751] A new study created in memory with name: no-name-f5b7b739-bb1d-4628-9c18-6f6896572adc


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-12-02 17:24:59,085] Trial 0 finished with value: 9532.935546875 and parameters: {'n_estimators': 239, 'learning_rate': 0.02646442689360642, 'max_depth': 4, 'subsample': 0.7756573845414456, 'colsample_bytree': 0.8597344848927815, 'min_child_weight': 3, 'gamma': 4.9038209919230775}. Best is trial 0 with value: 9532.935546875.
[I 2025-12-02 17:25:00,339] Trial 1 finished with value: 8667.4912109375 and parameters: {'n_estimators': 237, 'learning_rate': 0.051332773974241, 'max_depth': 6, 'subsample': 0.6715890080754348, 'colsample_bytree': 0.8645248536920208, 'min_child_weight': 4, 'gamma': 0.29838948304784174}. Best is trial 1 with value: 8667.4912109375.
[I 2025-12-02 17:25:01,114] Trial 2 finished with value: 8990.8515625 and parameters: {'n_estimators': 180, 'learning_rate': 0.12305767336794743, 'max_depth': 4, 'subsample': 0.5877258780737462, 'colsample_bytree': 0.7657756869209191, 'min_child_weight': 4, 'gamma': 3.1720047927566055}. Best is trial 1 with value: 8667.4912109375

INFO:root:Best XGBoost MAE: 8570.290
INFO:root:Best XGBoost params: {'n_estimators': 284, 'learning_rate': 0.07921839266514279, 'max_depth': 6, 'subsample': 0.8416129574553262, 'colsample_bytree': 0.5889950837451522, 'min_child_weight': 5, 'gamma': 0.49613522706236707}
INFO:root:Starting Optuna tuning for CatBoost
[I 2025-12-02 17:26:00,384] A new study created in memory with name: no-name-736213b4-49b4-4b92-9f53-5aee08728d6f


[I 2025-12-02 17:26:00,372] Trial 49 finished with value: 8610.32421875 and parameters: {'n_estimators': 290, 'learning_rate': 0.05852953011445315, 'max_depth': 5, 'subsample': 0.8883180101123993, 'colsample_bytree': 0.5845202678248725, 'min_child_weight': 4, 'gamma': 0.6101437009053172}. Best is trial 45 with value: 8570.2900390625.


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-12-02 17:26:04,311] Trial 0 finished with value: 9341.91245836899 and parameters: {'iterations': 379, 'learning_rate': 0.02646442689360642, 'depth': 5, 'subsample': 0.7756573845414456, 'colsample_bylevel': 0.8597344848927815, 'min_child_samples': 5}. Best is trial 0 with value: 9341.91245836899.
[I 2025-12-02 17:26:10,970] Trial 1 finished with value: 8702.730810960478 and parameters: {'iterations': 493, 'learning_rate': 0.10270144703457969, 'depth': 7, 'subsample': 0.6960587590970753, 'colsample_bylevel': 0.6715890080754348, 'min_child_samples': 8}. Best is trial 1 with value: 8702.730810960478.
[I 2025-12-02 17:26:13,846] Trial 2 finished with value: 10304.853250374787 and parameters: {'iterations': 275, 'learning_rate': 0.012250434413464737, 'depth': 6, 'subsample': 0.8689977028660179, 'colsample_bylevel': 0.59124586522675, 'min_child_samples': 2}. Best is trial 1 with value: 8702.730810960478.
[I 2025-12-02 17:26:17,881] Trial 3 finished with value: 8730.465542687189 and pa

INFO:root:Best CatBoost MAE: 8607.846
INFO:root:Best CatBoost params: {'iterations': 477, 'learning_rate': 0.19654093203091005, 'depth': 5, 'subsample': 0.8369584963585432, 'colsample_bylevel': 0.8893461550583386, 'min_child_samples': 4}
INFO:root:Training best models on full training data


[I 2025-12-02 17:29:54,077] Trial 49 finished with value: 8842.948266704387 and parameters: {'iterations': 161, 'learning_rate': 0.25260953947271864, 'depth': 4, 'subsample': 0.8190487749340074, 'colsample_bylevel': 0.8986161892330974, 'min_child_samples': 3}. Best is trial 47 with value: 8607.84617188617.


INFO:root:
INFO:root:Testing different ensemble strategies
INFO:root:
XGBoost only:
INFO:root:  XGB weight: 1.0, CatBoost weight: 0.0
INFO:root:  MAE: 8570.291
INFO:root:  R²: 0.6133
INFO:root:  RMSE: 15590.82
INFO:root:
CatBoost only:
INFO:root:  XGB weight: 0.0, CatBoost weight: 1.0
INFO:root:  MAE: 8592.763
INFO:root:  R²: 0.6254
INFO:root:  RMSE: 15344.42
INFO:root:
XGBoost dominant (0.7/0.3):
INFO:root:  XGB weight: 0.7, CatBoost weight: 0.3
INFO:root:  MAE: 8500.458
INFO:root:  R²: 0.6244
INFO:root:  RMSE: 15365.57
INFO:root:
Balanced (0.5/0.5):
INFO:root:  XGB weight: 0.5, CatBoost weight: 0.5
INFO:root:  MAE: 8492.625
INFO:root:  R²: 0.6282
INFO:root:  RMSE: 15286.72
INFO:root:
CatBoost dominant (0.3/0.7):
INFO:root:  XGB weight: 0.3, CatBoost weight: 0.7
INFO:root:  MAE: 8515.900
INFO:root:  R²: 0.6292
INFO:root:  RMSE: 15266.04
INFO:root:
INFO:root:Best strategy: Balanced (0.5/0.5) with MAE: 8492.625
INFO:root:
Ensemble comparison saved to 'ensemble_comparison.csv'



Final Results (Best Strategy: Balanced (0.5/0.5)):
MAE: 8492.625
R²: 0.6282
RMSE: 15286.72


In [11]:
#best
import logging
import pandas as pd
import numpy as np
import json
import zipfile
import xgboost as xgb # Import XGBoost
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import numpy as np

def add_engineered_features(df):
    # Handle zero division
    df['guests_per_room'] = df['guests'] / df['rooms'].replace(0, np.nan)

    # Create a binary feature for "apartment" in name
    if "name" in df.columns:
        df["name"] = df["name"].astype(str).fillna("")
        df["is_studio"] = df["name"].str.lower().str.contains("studio|wifi|residence|cosy|courtyard|train").astype(int)
    
    # Generate interaction term between lat and lon
    if "lat" in df.columns and "lon" in df.columns:
        df["lat_lon_interaction"] = df["lat"] * df["lon"]
        city_center_lat, city_center_lon = df["lat"].mean(), df["lon"].mean()

    return df

def baseline():
    logging.info("Reading train and test files")
    train = pd.read_json("train.json", orient='records')
    test = pd.read_json("test.json", orient='records')
    train, valid = train_test_split(train, test_size=1/3, random_state=123)

    # Apply feature engineering
    train = add_engineered_features(train)
    valid = add_engineered_features(valid)
    test = add_engineered_features(test)

    preprocess = ColumnTransformer(
        transformers=[
        ("lat", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["lat"]),
        ("lon", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["lon"]),
        ("lat_lon_interaction", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["lat_lon_interaction"]),
        ("min_nights", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["min_nights"]),
        ("num_reviews", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["num_reviews"]),
        ("guests", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["guests"]),
        ("cancellation", Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ]), ["cancellation"]),
        ("guests_per_room", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["guests_per_room"]),
        ("rating", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["rating"]),
        ("bathrooms", Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]), ["bathrooms"]),
        ("listing_type", Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), ["listing_type"]),
        ("room_type", Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore'))
            ]), ["room_type"]),
        ("is_studio", Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]), ["is_studio"]),
        
    ],
    remainder='drop'
    )

    dummy = make_pipeline(preprocess, DummyRegressor())
    xgb_model = make_pipeline(preprocess, xgb.XGBRegressor(
        n_estimators=285, learning_rate=0.1, max_depth=5, subsample=0.6, colsample_bytree=0.7,
        objective='reg:squarederror', random_state=123))

    label = 'revenue'
    for model_name, model in [("mean", dummy), ("xgboost", xgb_model)]:
        logging.info(f"Fitting model {model_name}")
        model.fit(train.drop([label], axis=1), np.log1p(train[label].values))

        for split_name, split in [("train", train), ("valid", valid)]:
            pred = np.expm1(model.predict(split.drop([label], axis=1)))
            mae = mean_absolute_error(split[label], pred)
            logging.info(f"{model_name} {split_name} {mae:.3f}")

    pred_test = np.expm1(xgb_model.predict(test))
    test[label] = pred_test
    predicted = test[['revenue']].to_dict(orient='records')

    with zipfile.ZipFile("baseline.zip", "w", zipfile.ZIP_DEFLATED) as zipf:
        zipf.writestr("predicted.json", json.dumps(predicted, indent=2))

    r2 = r2_score(valid["revenue"], pred)
    rmse = mean_squared_error(valid["revenue"], pred)
    mae = mean_absolute_error(valid["revenue"], pred)

    print(f"R² Score: {r2:.4f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    baseline()

INFO:root:Reading train and test files


INFO:root:Fitting model mean
INFO:root:mean train 14798.557
INFO:root:mean valid 14397.861
INFO:root:Fitting model xgboost
INFO:root:xgboost train 7331.454
INFO:root:xgboost valid 8483.982


R² Score: 0.6323
RMSE: 231149520.00
MAE: 8483.98


In [17]:
import logging
import pandas as pd
import numpy as np
import json
import zipfile
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import optuna
from optuna.samplers import TPESampler

def add_engineered_features(df):
    # Handle zero division
    df['guests_per_room'] = df['guests'] / df['rooms'].replace(0, np.nan)

    # Create a binary feature for "apartment" in name
    if "name" in df.columns:
        df["name"] = df["name"].astype(str).fillna("")
        df["is_studio"] = df["name"].str.lower().str.contains("studio|wifi|residence|cosy|courtyard|train").astype(int)
    
    # Generate interaction term between lat and lon
    if "lat" in df.columns and "lon" in df.columns:
        df["lat_lon_interaction"] = df["lat"] * df["lon"]
        city_center_lat, city_center_lon = df["lat"].mean(), df["lon"].mean()

    return df

def baseline():
    logging.info("Reading train and test files")
    train = pd.read_json("train.json", orient='records')
    test = pd.read_json("test.json", orient='records')
    train, valid = train_test_split(train, test_size=1/3, random_state=123)

    # Apply feature engineering
    train = add_engineered_features(train)
    valid = add_engineered_features(valid)
    test = add_engineered_features(test)

    preprocess = ColumnTransformer(
        transformers=[
        ("lat", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["lat"]),
        ("lon", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["lon"]),
        ("lat_lon_interaction", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["lat_lon_interaction"]),
        ("min_nights", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["min_nights"]),
        ("num_reviews", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["num_reviews"]),
        ("guests", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["guests"]),
        ("cancellation", Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ]), ["cancellation"]),
        ("guests_per_room", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["guests_per_room"]),
        ("rating", Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))]), ["rating"]),
        ("bathrooms", Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]), ["bathrooms"]),
        ("listing_type", Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), ["listing_type"]),
        ("room_type", Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore'))
            ]), ["room_type"]),
        ("is_studio", Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]), ["is_studio"]),
        
    ],
    remainder='drop'
    )

    label = 'revenue'
    
    # Train best XGBoost model (with your optimal parameters)
    logging.info("Training XGBoost with best parameters")
    xgb_model = make_pipeline(preprocess, xgb.XGBRegressor(
        n_estimators=285, learning_rate=0.1, max_depth=5, subsample=0.6, colsample_bytree=0.7,
        objective='reg:squarederror', random_state=123))
    
    xgb_model.fit(train.drop([label], axis=1), np.log1p(train[label].values))
    
    xgb_pred_train = np.expm1(xgb_model.predict(train.drop([label], axis=1)))
    xgb_pred_valid = np.expm1(xgb_model.predict(valid.drop([label], axis=1)))
    
    xgb_mae_train = mean_absolute_error(train[label], xgb_pred_train)
    xgb_mae_valid = mean_absolute_error(valid[label], xgb_pred_valid)
    logging.info(f"XGBoost train MAE: {xgb_mae_train:.3f}")
    logging.info(f"XGBoost valid MAE: {xgb_mae_valid:.3f}")
    
    # Preprocess data for CatBoost tuning
    X_train = preprocess.transform(train.drop([label], axis=1))
    y_train = np.log1p(train[label].values)
    X_valid = preprocess.transform(valid.drop([label], axis=1))
    y_valid = np.log1p(valid[label].values)
    
    # Optuna hyperparameter tuning for CatBoost
    logging.info("\nStarting Optuna tuning for CatBoost")
    
    def objective_catboost(trial):
        params = {
            'iterations': trial.suggest_int('iterations', 100, 500),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'depth': trial.suggest_int('depth', 4, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
            'min_child_samples': trial.suggest_int('min_child_samples', 1, 10),
            'random_state': 123,
            'verbose': False
        }
        
        model = CatBoostRegressor(**params)
        model.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose=False)
        
        pred = np.expm1(model.predict(X_valid))
        mae = mean_absolute_error(valid[label], pred)
        return mae
    
    study_catboost = optuna.create_study(direction='minimize', sampler=TPESampler(seed=123))
    study_catboost.optimize(objective_catboost, n_trials=50, show_progress_bar=True)
    
    logging.info(f"\nBest CatBoost MAE: {study_catboost.best_value:.3f}")
    logging.info(f"Best CatBoost params: {study_catboost.best_params}")
    
    # Train best CatBoost model
    logging.info("\nTraining CatBoost with best parameters")
    best_catboost_params = study_catboost.best_params.copy()
    best_catboost_params.update({'random_state': 123, 'verbose': False})
    
    catboost_model = make_pipeline(preprocess, CatBoostRegressor(**best_catboost_params))
    catboost_model.fit(train.drop([label], axis=1), np.log1p(train[label].values))
    
    catboost_pred_train = np.expm1(catboost_model.predict(train.drop([label], axis=1)))
    catboost_pred_valid = np.expm1(catboost_model.predict(valid.drop([label], axis=1)))
    
    catboost_mae_train = mean_absolute_error(train[label], catboost_pred_train)
    catboost_mae_valid = mean_absolute_error(valid[label], catboost_pred_valid)
    logging.info(f"CatBoost train MAE: {catboost_mae_train:.3f}")
    logging.info(f"CatBoost valid MAE: {catboost_mae_valid:.3f}")
    
    # Test different ensemble weights
    logging.info("\n" + "="*60)
    logging.info("Testing different ensemble strategies")
    logging.info("="*60)
    
    ensemble_strategies = {
        'XGBoost only': (1.0, 0.0),
        'CatBoost only': (0.0, 1.0),
        'XGBoost dominant (0.7/0.3)': (0.7, 0.3),
        'Balanced (0.5/0.5)': (0.5, 0.5),
        'CatBoost dominant (0.3/0.7)': (0.3, 0.7)
    }
    
    results = []
    best_strategy = None
    best_mae = float('inf')
    
    for strategy_name, (xgb_weight, catboost_weight) in ensemble_strategies.items():
        ensemble_pred_valid = xgb_weight * xgb_pred_valid + catboost_weight * catboost_pred_valid
        
        mae = mean_absolute_error(valid[label], ensemble_pred_valid)
        r2 = r2_score(valid[label], ensemble_pred_valid)
        rmse = np.sqrt(mean_squared_error(valid[label], ensemble_pred_valid))
        
        results.append({
            'strategy': strategy_name,
            'xgb_weight': xgb_weight,
            'catboost_weight': catboost_weight,
            'mae': mae,
            'r2': r2,
            'rmse': rmse
        })
        
        logging.info(f"\n{strategy_name}:")
        logging.info(f"  XGB weight: {xgb_weight:.1f}, CatBoost weight: {catboost_weight:.1f}")
        logging.info(f"  MAE: {mae:.3f}")
        logging.info(f"  R²: {r2:.4f}")
        logging.info(f"  RMSE: {rmse:.2f}")
        
        if mae < best_mae:
            best_mae = mae
            best_strategy = strategy_name
            best_weights = (xgb_weight, catboost_weight)
    
    logging.info("\n" + "="*60)
    logging.info(f"Best strategy: {best_strategy} with MAE: {best_mae:.3f}")
    logging.info("="*60)
    
    # Use best ensemble for final predictions
    best_xgb_weight, best_catboost_weight = best_weights
    xgb_pred_test = np.expm1(xgb_model.predict(test))
    catboost_pred_test = np.expm1(catboost_model.predict(test))
    final_pred_test = best_xgb_weight * xgb_pred_test + best_catboost_weight * catboost_pred_test
    
    test[label] = final_pred_test
    predicted = test[['revenue']].to_dict(orient='records')

    with zipfile.ZipFile("baseline.zip", "w", zipfile.ZIP_DEFLATED) as zipf:
        zipf.writestr("predicted.json", json.dumps(predicted, indent=2))
    
    # Save results summary
    results_df = pd.DataFrame(results)
    results_df.to_csv("ensemble_comparison.csv", index=False)
    logging.info("\nEnsemble comparison saved to 'ensemble_comparison.csv'")
    
    print(f"\n{'='*60}")
    print(f"Final Results (Best Strategy: {best_strategy})")
    print(f"{'='*60}")
    print(f"MAE: {best_mae:.3f}")
    print(f"R²: {results_df[results_df['strategy'] == best_strategy]['r2'].values[0]:.4f}")
    print(f"RMSE: {results_df[results_df['strategy'] == best_strategy]['rmse'].values[0]:.2f}")

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    baseline()

INFO:root:Reading train and test files
INFO:root:Training XGBoost with best parameters
INFO:root:XGBoost train MAE: 7331.454
INFO:root:XGBoost valid MAE: 8483.982
INFO:root:
Starting Optuna tuning for CatBoost
[I 2025-12-02 17:36:24,034] A new study created in memory with name: no-name-ba0802d9-ae84-44c1-94c3-6d361ed1d993


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-12-02 17:36:27,535] Trial 0 finished with value: 9341.91245836899 and parameters: {'iterations': 379, 'learning_rate': 0.02646442689360642, 'depth': 5, 'subsample': 0.7756573845414456, 'colsample_bylevel': 0.8597344848927815, 'min_child_samples': 5}. Best is trial 0 with value: 9341.91245836899.
[I 2025-12-02 17:36:33,991] Trial 1 finished with value: 8702.730810960478 and parameters: {'iterations': 493, 'learning_rate': 0.10270144703457969, 'depth': 7, 'subsample': 0.6960587590970753, 'colsample_bylevel': 0.6715890080754348, 'min_child_samples': 8}. Best is trial 1 with value: 8702.730810960478.
[I 2025-12-02 17:36:37,277] Trial 2 finished with value: 10304.853250374787 and parameters: {'iterations': 275, 'learning_rate': 0.012250434413464737, 'depth': 6, 'subsample': 0.8689977028660179, 'colsample_bylevel': 0.59124586522675, 'min_child_samples': 2}. Best is trial 1 with value: 8702.730810960478.
[I 2025-12-02 17:36:42,695] Trial 3 finished with value: 8730.465542687189 and pa

INFO:root:
Best CatBoost MAE: 8607.846
INFO:root:Best CatBoost params: {'iterations': 477, 'learning_rate': 0.19654093203091005, 'depth': 5, 'subsample': 0.8369584963585432, 'colsample_bylevel': 0.8893461550583386, 'min_child_samples': 4}
INFO:root:
Training CatBoost with best parameters


[I 2025-12-02 17:40:49,968] Trial 49 finished with value: 8842.948266704387 and parameters: {'iterations': 161, 'learning_rate': 0.25260953947271864, 'depth': 4, 'subsample': 0.8190487749340074, 'colsample_bylevel': 0.8986161892330974, 'min_child_samples': 3}. Best is trial 47 with value: 8607.84617188617.


INFO:root:CatBoost train MAE: 7644.097
INFO:root:CatBoost valid MAE: 8592.763
INFO:root:
INFO:root:Testing different ensemble strategies
INFO:root:
XGBoost only:
INFO:root:  XGB weight: 1.0, CatBoost weight: 0.0
INFO:root:  MAE: 8483.982
INFO:root:  R²: 0.6323
INFO:root:  RMSE: 15203.60
INFO:root:
CatBoost only:
INFO:root:  XGB weight: 0.0, CatBoost weight: 1.0
INFO:root:  MAE: 8592.763
INFO:root:  R²: 0.6254
INFO:root:  RMSE: 15344.42
INFO:root:
XGBoost dominant (0.7/0.3):
INFO:root:  XGB weight: 0.7, CatBoost weight: 0.3
INFO:root:  MAE: 8435.298
INFO:root:  R²: 0.6372
INFO:root:  RMSE: 15101.33
INFO:root:
Balanced (0.5/0.5):
INFO:root:  XGB weight: 0.5, CatBoost weight: 0.5
INFO:root:  MAE: 8443.511
INFO:root:  R²: 0.6372
INFO:root:  RMSE: 15102.13
INFO:root:
CatBoost dominant (0.3/0.7):
INFO:root:  XGB weight: 0.3, CatBoost weight: 0.7
INFO:root:  MAE: 8482.263
INFO:root:  R²: 0.6345
INFO:root:  RMSE: 15158.19
INFO:root:
INFO:root:Best strategy: XGBoost dominant (0.7/0.3) with MAE:


Final Results (Best Strategy: XGBoost dominant (0.7/0.3))
MAE: 8435.298
R²: 0.6372
RMSE: 15101.33
