In [45]:
import pandas as pd
import numpy as np
import statsmodels.api as sm


In [46]:
# Load each city's listings and tag every row with its source city.
# The assignment sets "city" unconditionally, overwriting the column if the CSV already has one.
cph = pd.read_csv("copenhagen_imputed.csv").assign(city="Copenhagen")
oslo = pd.read_csv("oslo_imputed.csv").assign(city="Oslo")

# Stack both cities into one frame with a fresh 0..n-1 index.
df = pd.concat([cph, oslo], axis=0, ignore_index=True)


In [47]:
# Regression target: natural log of the listing price.
df["log_price"] = np.log(df["price_local"])

# "t"/"f" string flags -> 0/1 indicators.
bool_map = {
    "t": 1,
    "f": 0,
}

binary_cols = [
    "host_is_superhost",
    "host_identity_verified",
    "instant_bookable",
    "has_availability",
    "has_reviews"
]


def encode_binary(col):
    """Return `col` as a 0/1 indicator without destroying already-encoded data.

    - bool columns are cast to int;
    - columns that are already numeric are returned unchanged;
    - anything else is mapped through `bool_map` ("t" -> 1, "f" -> 0); values
      that are not keys of `bool_map` become NaN.

    Mapping an already-numeric column through `bool_map` would turn every
    value into NaN, because 0/1 are not keys of the dict. That previously
    wiped out `has_reviews`, which was cast to int before being mapped, and
    made re-running this cell erase every binary column.
    """
    if pd.api.types.is_bool_dtype(col):
        return col.astype(int)
    if pd.api.types.is_numeric_dtype(col):
        return col
    return col.map(bool_map)


for col in binary_cols:
    df[col] = encode_binary(df[col])

In [48]:
# Columns to expand into 0/1 indicator columns.
categorical_cols = ["room_type", "property_type_grouped", "property_privacy_level"]

# drop_first=True omits the first level of each categorical so the remaining
# indicators are measured against that omitted level.
# The original categorical columns are replaced by the indicators.
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype=int)


In [49]:
# Candidate regressor sets, keyed by model name.
# The first five specifications are nested: each one is the previous list
# plus a new group of regressors. `+` builds a new list every time, so no
# two entries of MODELS share the same list object.
_STRUCTURAL = ["accommodates", "bedrooms", "bathrooms", "beds"]
_DISTANCE = _STRUCTURAL + ["distance_to_center_km"]
_LISTING_TYPE = _DISTANCE + ["room_type_Private room", "room_type_Shared room"]
_HOST_FEATURES = _LISTING_TYPE + [
    "host_is_superhost",
    "host_total_listings_count",
    "calculated_host_listings_count",
]
_AVAILABILITY_DEMAND = _HOST_FEATURES + [
    "availability_30",
    "availability_60",
    "estimated_occupancy_l365d",
]

MODELS = {
    "structural": _STRUCTURAL,
    "distance": _DISTANCE,
    "listing_type": _LISTING_TYPE,
    "host_features": _HOST_FEATURES,
    "availability_demand": _AVAILABILITY_DEMAND,
    # Two stand-alone (non-nested) specifications.
    # NOTE(review): presumably the variables selected by a per-city Lasso run
    # that is not part of this notebook - confirm the provenance.
    "Lasso CPH": [
        "estimated_revenue_l365d",
        "estimated_occupancy_l365d",
        "minimum_minimum_nights",
    ],
    "Lasso Oslo": [
        "maximum_maximum_nights",
        "maximum_nights_avg_ntm",
        "minimum_maximum_nights",
        "host_total_listings_count",
    ],
}


In [50]:
def fit_bic(df, features):
    """Fit an OLS regression of `log_price` on `features` and report its BIC.

    Parameters
    ----------
    df : DataFrame holding a "log_price" column and every column in `features`.
    features : list of regressor column names.

    Returns
    -------
    (bic, results) : the fitted model's BIC and the fitted results object.

    A constant term is added through `sm.add_constant`, and the model is
    built with `missing="drop"`.
    """
    design = sm.add_constant(df[features])
    results = sm.OLS(df["log_price"], design, missing="drop").fit()
    return results.bic, results


In [51]:
def bic_table_city(df, city):
    """Fit every specification in MODELS on one city's rows and rank them by BIC.

    Parameters
    ----------
    df : DataFrame with a "city" column, "log_price", and all MODELS regressors.
    city : value of the "city" column selecting the rows to use.

    Returns
    -------
    DataFrame with one row per model and columns "city", "model",
    "n_params" (number of estimated coefficients) and "bic", sorted by
    ascending BIC.

    Raises
    ------
    ValueError
        If no rows match `city`, or if no row is complete across all columns
        used by the candidate models.
    """
    df_city = df[df["city"] == city]
    if df_city.empty:
        raise ValueError(f"no rows found for city {city!r}")

    # BIC is only comparable between models estimated on the same
    # observations. fit_bic drops rows with missing values per model, so
    # restrict to rows that are complete for the target and for every
    # regressor used by any candidate before fitting.
    used_cols = ["log_price"]
    for features in MODELS.values():
        used_cols.extend(f for f in features if f not in used_cols)
    df_city = df_city.dropna(subset=used_cols)
    if df_city.empty:
        raise ValueError(
            f"no complete rows for city {city!r} across the model columns"
        )

    rows = []

    for name, features in MODELS.items():
        bic, model = fit_bic(df_city, features)
        rows.append({
            "city": city,
            "model": name,
            "n_params": len(model.params),
            "bic": bic
        })

    return pd.DataFrame(rows).sort_values("bic")


In [52]:
# Rank all candidate models by BIC, separately for each city.
bic_cph = bic_table_city(df, "Copenhagen")
bic_oslo = bic_table_city(df, "Oslo")

# The leading newline on the second heading leaves a blank line between the tables.
for heading, table in (("Copenhagen", bic_cph), ("\nOslo", bic_oslo)):
    print(heading)
    print(table)


Copenhagen
         city                model  n_params           bic
4  Copenhagen  availability_demand        14   6080.578162
3  Copenhagen        host_features        11   8024.225275
2  Copenhagen         listing_type         8   8284.085028
1  Copenhagen             distance         6  10068.665082
0  Copenhagen           structural         5  12127.122847
5  Copenhagen            Lasso CPH         4  13392.334943
6  Copenhagen           Lasso Oslo         5  16558.909592

Oslo
   city                model  n_params           bic
4  Oslo  availability_demand        14   5342.955679
3  Oslo        host_features        11   6037.417605
2  Oslo         listing_type         8   6272.788284
1  Oslo             distance         6   7298.184006
0  Oslo           structural         5   7815.009736
5  Oslo            Lasso CPH         4   9115.343725
6  Oslo           Lasso Oslo         5  11230.754164


In [53]:
# Take the model name from the first row of each BIC table.
best_cph_model = bic_cph["model"].iloc[0]
best_oslo_model = bic_oslo["model"].iloc[0]

print(f"Best CPH model: {best_cph_model}")
print(f"Best Oslo model: {best_oslo_model}")



Best CPH model: availability_demand
Best Oslo model: availability_demand


In [54]:
# Refit the selected specification on the Copenhagen rows and show the full
# regression summary; the BIC returned alongside the model is discarded.
_, model_cph = fit_bic(df.loc[df["city"] == "Copenhagen"], MODELS[best_cph_model])

print(model_cph.summary())


                            OLS Regression Results                            
Dep. Variable:              log_price   R-squared:                       0.614
Model:                            OLS   Adj. R-squared:                  0.614
Method:                 Least Squares   F-statistic:                     1408.
Date:                Sun, 28 Dec 2025   Prob (F-statistic):               0.00
Time:                        21:27:57   Log-Likelihood:                -2974.8
No. Observations:               11503   AIC:                             5978.
Df Residuals:                   11489   BIC:                             6081.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const       

In [55]:
# Refit the selected specification on the Oslo rows and show the full
# regression summary; the BIC returned alongside the model is discarded.
_, model_oslo = fit_bic(df.loc[df["city"] == "Oslo"], MODELS[best_oslo_model])

print(model_oslo.summary())

                            OLS Regression Results                            
Dep. Variable:              log_price   R-squared:                       0.592
Model:                            OLS   Adj. R-squared:                  0.591
Method:                 Least Squares   F-statistic:                     771.6
Date:                Sun, 28 Dec 2025   Prob (F-statistic):               0.00
Time:                        21:31:21   Log-Likelihood:                -2609.6
No. Observations:                6930   AIC:                             5247.
Df Residuals:                    6916   BIC:                             5343.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const       