In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

In [23]:
# Load the two city CSVs and tag every row with its city.
# The "city" column is always (re)assigned here -- any value already stored
# in the file is overwritten -- so rows can be split by city again after the
# two tables have been stacked.
cph = pd.read_csv("copenhagen_imputed.csv").assign(city="Copenhagen")
oslo = pd.read_csv("oslo_imputed.csv").assign(city="Oslo")

# Stack Copenhagen on top of Oslo and renumber the rows 0..n-1.
df = pd.concat((cph, oslo), ignore_index=True)


In [None]:
# Data preparation
#
# This cell rebinds `df`. Every step below is guarded so that running the
# cell a second time on the already-prepared frame leaves it unchanged
# instead of raising or wiping values.

# Binary flags: convert "t"/"f" strings to 1/0.
binary_cols = [
    "host_is_superhost", "host_identity_verified",
    "has_availability", "instant_bookable", "has_reviews"
]

# Map element-wise instead of using DataFrame.replace: replace depends on
# silent object->int downcasting, which pandas has deprecated (it emits a
# FutureWarning). Values other than "t"/"f" (flags that are already numeric,
# NaN) pass through unchanged.
flag_map = {"t": 1, "f": 0}
for c in binary_cols:
    df[c] = df[c].map(lambda v: flag_map.get(v, v))

# Percentages: strip the trailing "%" and store as float.
pct_cols = ["host_response_rate", "host_acceptance_rate"]

for c in pct_cols:
    # A numeric column has no .str accessor; it is already converted, skip it.
    if pd.api.types.is_numeric_dtype(df[c]):
        continue
    df[c] = (
        df[c]
        .str.replace("%", "", regex=False)
        .astype(float)
    )

# Response time encoded as ordered values (higher = faster response).
response_map = {
    "within an hour": 4,
    "within a few hours": 3,
    "within a day": 2,
    "a few days or more": 1
}

# Series.map turns every value that is not a key of the mapping into NaN, so
# mapping an already-encoded (numeric) column would erase it. Only map while
# the column still holds the raw text labels.
if not pd.api.types.is_numeric_dtype(df["host_response_time"]):
    df["host_response_time"] = df["host_response_time"].map(response_map)

# Categorical columns to one-hot encode. Encoding happens on the merged
# frame, so both cities end up with the same set of dummy columns.
categorical_cols = [
    "room_type",
    "property_type_grouped",
    "property_privacy_level",
    "neighbourhood_cleansed",
    "bathrooms_text",
    "property_type",
    "property_category"
]

missing_categoricals = [c for c in categorical_cols if c not in df.columns]
if not missing_categoricals:
    df = pd.get_dummies(
        df,
        columns=categorical_cols,
        drop_first=True,
        dtype=int
    )
elif len(missing_categoricals) < len(categorical_cols):
    # Only some columns are absent: that is an unexpected schema (or a typo in
    # the list above), not a re-run, so fail loudly as get_dummies would.
    raise KeyError(f"Categorical columns missing from df: {missing_categoricals}")
# Otherwise every categorical column is absent, which is the state this cell
# itself leaves behind; the frame is treated as already encoded.


  df[binary_cols] = df[binary_cols].replace({"t": 1, "f": 0})


In [38]:
# Write the prepared, merged feature table to disk without the row index.
df.to_csv(path_or_buf="model_input_merged.csv", index=False)

In [26]:
# Split the prepared frame back into one subset per city, using the "city"
# label that was attached when the data was loaded.
df_oslo = df.loc[df["city"] == "Oslo"]
df_cph = df.loc[df["city"] == "Copenhagen"]

In [28]:
def run_models(df_city, target="price_local", drop_cols=("city",),
               ridge_alphas=None, lasso_alphas=None, cv=5):
    """Fit cross-validated Ridge and Lasso regressions on one city's listings.

    Every column of ``df_city`` other than ``target`` and ``drop_cols`` is used
    as a predictor. Each model sits behind its own ``StandardScaler``, so the
    fitted coefficients refer to standardized features.

    Parameters
    ----------
    df_city : pandas.DataFrame
        Prepared listings for a single city.
    target : str, default "price_local"
        Name of the column to predict.
    drop_cols : iterable of str, default ("city",)
        Additional columns to exclude from the predictors.
    ridge_alphas : array-like, optional
        Regularization strengths searched by ``RidgeCV``.
        Defaults to ``np.logspace(-3, 3, 50)``.
    lasso_alphas : array-like, optional
        Regularization strengths searched by ``LassoCV``.
        Defaults to ``np.logspace(-3, 1, 50)``.
    cv : int or cross-validation splitter, default 5
        Passed unchanged to both ``RidgeCV`` and ``LassoCV``.

    Returns
    -------
    tuple
        ``(ridge, lasso, feature_names)``: the two fitted pipelines and the
        ``Index`` of predictor names, in the same order as each model's
        ``coef_``.

    Raises
    ------
    KeyError
        If ``target`` or one of ``drop_cols`` is not a column of ``df_city``.

    Warns
    -----
    UserWarning
        If a model selects the smallest or largest alpha of its grid, which
        suggests the grid should be widened.
    """
    import warnings  # local import: keeps this cell independent of the import cell

    if ridge_alphas is None:
        ridge_alphas = np.logspace(-3, 3, 50)
    if lasso_alphas is None:
        lasso_alphas = np.logspace(-3, 1, 50)

    # NOTE(review): all remaining columns become predictors, including ones
    # such as estimated_revenue_l365d that may themselves be derived from the
    # price -- confirm there is no target leakage before interpreting results.
    X = df_city.drop(columns=[target, *drop_cols])
    y = df_city[target]

    ridge = Pipeline([
        ("scaler", StandardScaler()),
        ("model", RidgeCV(alphas=ridge_alphas, cv=cv))
    ])

    lasso = Pipeline([
        ("scaler", StandardScaler()),
        ("model", LassoCV(alphas=lasso_alphas, cv=cv, max_iter=20000))
    ])

    ridge.fit(X, y)
    lasso.fit(X, y)

    # An alpha chosen at the edge of the grid means the search never bracketed
    # the cross-validated optimum, so the resulting coefficients may be
    # over- or under-regularized.
    for label, fitted, grid in (("Ridge", ridge, ridge_alphas),
                                ("Lasso", lasso, lasso_alphas)):
        grid = np.asarray(grid, dtype=float)
        chosen = fitted.named_steps["model"].alpha_
        at_edge = np.isclose(chosen, grid.min()) or np.isclose(chosen, grid.max())
        if grid.size > 1 and at_edge:
            warnings.warn(
                f"{label}CV selected alpha={chosen:.4g}, an edge of the search "
                f"grid [{grid.min():.4g}, {grid.max():.4g}]; consider widening it.",
                stacklevel=2,
            )

    return ridge, lasso, X.columns

ridge_oslo, lasso_oslo, feat_oslo = run_models(df_oslo)
ridge_cph,  lasso_cph,  feat_cph  = run_models(df_cph)

In [33]:
# For each city, report how many coefficients each fitted model left at a
# value other than exactly zero.
for heading, ridge_fit, lasso_fit in (
    ("Oslo:", ridge_oslo, lasso_oslo),
    ("Copenhagen:", ridge_cph, lasso_cph),
):
    print(heading)
    print("Ridge non-zero:", np.sum(ridge_fit.named_steps["model"].coef_ != 0))
    print("Lasso non-zero:", np.sum(lasso_fit.named_steps["model"].coef_ != 0))

Oslo:
Ridge non-zero: 99
Lasso non-zero: 61
Copenhagen:
Ridge non-zero: 95
Lasso non-zero: 46


In [35]:
# Oslo Lasso coefficients, labelled with the feature each one belongs to.
lasso_oslo_coef = pd.Series(
    data=lasso_oslo.named_steps["model"].coef_,
    index=feat_oslo,
)

# Keep the features whose coefficient is not exactly zero, ordered from the
# largest to the smallest absolute value.
lasso_oslo_selected = (
    lasso_oslo_coef
    .loc[lasso_oslo_coef.ne(0)]
    .sort_values(key=lambda coefs: coefs.abs(), ascending=False)
)

lasso_oslo_selected.head(30)

host_listings_count                             5413.259560
host_total_listings_count                      -2474.640746
calculated_host_listings_count                 -2284.786031
estimated_revenue_l365d                          803.695025
estimated_occupancy_l365d                       -699.548507
minimum_maximum_nights                          -400.224724
maximum_maximum_nights                           257.676619
bathrooms_text_4.5 baths                         238.898227
bedrooms                                         183.781339
property_category_Special                        139.704157
bathrooms_text_2 baths                           124.022502
bathrooms_text_1.5 shared baths                  121.991468
maximum_nights_avg_ntm                           118.400771
bathrooms_text_2.5 baths                         116.481548
has_reviews                                     -101.191141
distance_to_center_km                            -91.798860
availability_30                         

In [36]:
# Copenhagen Lasso coefficients, labelled with the feature each one belongs to.
lasso_cph_coef = pd.Series(
    data=lasso_cph.named_steps["model"].coef_,
    index=feat_cph,
)

# Keep the features whose coefficient is not exactly zero, ordered from the
# largest to the smallest absolute value.
lasso_cph_selected = (
    lasso_cph_coef
    .loc[lasso_cph_coef.ne(0)]
    .sort_values(key=lambda coefs: coefs.abs(), ascending=False)
)

lasso_cph_selected.head(30)

estimated_revenue_l365d                        707.935856
estimated_occupancy_l365d                     -498.012756
minimum_maximum_nights                         179.443304
maximum_maximum_nights                        -149.097876
bedrooms                                       143.846540
host_listings_count                            110.534446
accommodates                                    96.166426
availability_90                                 91.072150
distance_to_center_km                          -87.281942
calculated_host_listings_count                  85.588149
bathrooms                                       72.810136
availability_60                                 69.293656
neighbourhood_cleansed_Indre By                 67.551204
property_type_grouped_Hotel                    -46.169651
neighbourhood_cleansed_Østerbro                 39.102481
review_scores_cleanliness                       38.590529
bathrooms_text_4 baths                          37.886317
host_acceptanc

In [37]:
# Names of the features that each city's Lasso kept (non-zero coefficient).
oslo_feats = {*lasso_oslo_selected.index}
cph_feats = {*lasso_cph_selected.index}

# Overlap and city-specific selections.
common = oslo_feats.intersection(cph_feats)
only_oslo = oslo_feats.difference(cph_feats)
only_cph = cph_feats.difference(oslo_feats)

for caption, names in (
    ("Common features:", common),
    ("Only Oslo:", only_oslo),
    ("Only Copenhagen:", only_cph),
):
    print(caption, names)


Common features: {'estimated_revenue_l365d', 'estimated_occupancy_l365d', 'property_type_grouped_Hotel', 'host_response_rate', 'review_scores_cleanliness', 'availability_365', 'host_acceptance_rate', 'maximum_maximum_nights', 'maximum_minimum_nights', 'property_category_Hotel', 'host_identity_verified', 'bathrooms_text_4 shared baths', 'bathrooms', 'bathrooms_text_2 baths', 'accommodates', 'calculated_host_listings_count', 'bathrooms_text_3 baths', 'bedrooms', 'distance_to_center_km', 'host_is_superhost', 'bathrooms_text_2.5 baths', 'minimum_maximum_nights', 'maximum_nights', 'host_listings_count', 'property_type_Hotel', 'room_type_Shared room'}
Only Oslo: {'neighbourhood_cleansed_Grünerløkka', 'beds', 'neighbourhood_cleansed_Nordstrand', 'property_category_Special', 'bathrooms_text_3.5 baths', 'number_of_reviews', 'property_type_Private Standard', 'maximum_nights_avg_ntm', 'neighbourhood_cleansed_Grorud', 'neighbourhood_cleansed_Ullern', 'neighbourhood_cleansed_Bjerke', 'review_scores

In [41]:
def coef_table(lasso_model, ridge_model, feature_names, city):
    """Build a per-feature table of Lasso and Ridge coefficients for one city.

    Parameters
    ----------
    lasso_model, ridge_model
        Fitted pipelines whose ``named_steps["model"]`` step exposes ``coef_``.
    feature_names
        Feature labels, in the same order as the coefficients.
    city
        Label repeated on every row of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature``, ``city``, ``lasso_coef`` and ``ridge_coef``,
        one row per feature.
    """
    lasso_weights = lasso_model.named_steps["model"].coef_
    ridge_weights = ridge_model.named_steps["model"].coef_

    columns = {
        "feature": feature_names,
        "city": city,
        "lasso_coef": lasso_weights,
        "ridge_coef": ridge_weights,
    }
    return pd.DataFrame(columns)

# One coefficient table per city.
coef_oslo = coef_table(
    lasso_model=lasso_oslo,
    ridge_model=ridge_oslo,
    feature_names=feat_oslo,
    city="Oslo",
)
coef_cph = coef_table(
    lasso_model=lasso_cph,
    ridge_model=ridge_cph,
    feature_names=feat_cph,
    city="Copenhagen",
)

# Stack both tables (Oslo first) into one long frame with a fresh row index;
# the "city" column tells the rows apart.
coef_all = pd.concat((coef_oslo, coef_cph), ignore_index=True)

In [43]:
# Write the combined Lasso/Ridge coefficient table to disk without the row index.
coef_all.to_csv(path_or_buf="lasso_ridge_results.csv", index=False)