In [44]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

In [45]:
cph = pd.read_csv("copenhagen_imputed.csv")
oslo = pd.read_csv("oslo_imputed.csv")

# Safety check: ensure city column exists
cph["city"] = "Copenhagen"
oslo["city"] = "Oslo"

df = pd.concat([cph, oslo], axis=0, ignore_index=True)


In [46]:
#Data Preparation

#Binary data
binary_cols = [
    "host_is_superhost", "host_identity_verified",
    "has_availability", "instant_bookable", "has_reviews"
]

df[binary_cols] = df[binary_cols].replace({"t": 1, "f": 0})

#Removing %
pct_cols = ["host_response_rate", "host_acceptance_rate"]

for c in pct_cols:
    df[c] = (
        df[c]
        .str.replace("%", "", regex=False)
        .astype(float)
    )

#Response time encoded as ordered values
response_map = {
    "within an hour": 4,
    "within a few hours": 3,
    "within a day": 2,
    "a few days or more": 1
}

df["host_response_time"] = df["host_response_time"].map(response_map)

categorical_cols = [
    "room_type",
    "property_type_grouped",
    "property_privacy_level",
    "neighbourhood_cleansed",
    "bathrooms_text",
    "property_type",
    "property_category"
]

df = pd.get_dummies(
    df,
    columns=categorical_cols,
    drop_first=True,
    dtype=int
)


  df[binary_cols] = df[binary_cols].replace({"t": 1, "f": 0})


In [38]:
df.to_csv("model_input_merged.csv", index=False)

In [47]:
df_oslo = df[df['city'] == 'Oslo']
df_cph  = df[df['city'] == 'Copenhagen']

In [48]:
def run_models(df_city):
    X = df_city.drop(columns=["price_local", "city"])
    y = np.log(df_city["price_local"])

    ridge = Pipeline([
        ("scaler", StandardScaler()),
        ("model", RidgeCV(alphas=np.logspace(-3, 3, 50), cv=5))
    ])

    lasso = Pipeline([
        ("scaler", StandardScaler()),
        ("model", LassoCV(alphas=np.logspace(-3, 1, 50), cv=5, max_iter=20000))
    ])

    ridge.fit(X, y)
    lasso.fit(X, y)

    return ridge, lasso, X.columns

ridge_oslo, lasso_oslo, feat_oslo = run_models(df_oslo)
ridge_cph,  lasso_cph,  feat_cph  = run_models(df_cph)

In [49]:
print("Oslo:")
print("Ridge non-zero:",np.sum(ridge_oslo.named_steps["model"].coef_ != 0))
print("Lasso non-zero:",np.sum(lasso_oslo.named_steps["model"].coef_ != 0))

print("Copenhagen:")
print("Ridge non-zero:",np.sum(ridge_cph.named_steps["model"].coef_ != 0))
print("Lasso non-zero:", np.sum(lasso_cph.named_steps["model"].coef_ != 0))

Oslo:
Ridge non-zero: 99
Lasso non-zero: 74
Copenhagen:
Ridge non-zero: 95
Lasso non-zero: 73


In [50]:
lasso_oslo_coef = pd.Series(lasso_oslo.named_steps["model"].coef_, index=feat_oslo)

lasso_oslo_selected = (lasso_oslo_coef[lasso_oslo_coef != 0].sort_values(key=np.abs, ascending=False))

lasso_oslo_selected.head(30)

host_listings_count                             0.211189
estimated_occupancy_l365d                      -0.201713
estimated_revenue_l365d                         0.186291
room_type_Private room                         -0.147566
bedrooms                                        0.122292
accommodates                                    0.122195
distance_to_center_km                          -0.104390
calculated_host_listings_count                 -0.095093
host_total_listings_count                      -0.091005
availability_30                                 0.076219
minimum_maximum_nights                         -0.076090
maximum_maximum_nights                          0.066000
room_type_Shared room                          -0.057949
neighbourhood_cleansed_Frogner                  0.037871
availability_365                                0.037745
bathrooms_text_2 baths                          0.035188
number_of_reviews_l30d                         -0.033742
host_response_rate             

In [51]:
lasso_cph_coef = pd.Series(lasso_cph.named_steps["model"].coef_,index=feat_cph)

lasso_cph_selected = (lasso_cph_coef[lasso_cph_coef != 0].sort_values(key=np.abs, ascending=False))

lasso_cph_selected.head(30)

host_total_listings_count               -0.199463
host_listings_count                      0.196844
estimated_revenue_l365d                  0.137205
estimated_occupancy_l365d               -0.123676
accommodates                             0.108347
bedrooms                                 0.104954
room_type_Private room                  -0.089970
distance_to_center_km                   -0.085969
availability_90                          0.073938
calculated_host_listings_count           0.068090
neighbourhood_cleansed_Indre By          0.062859
availability_60                          0.051277
room_type_Shared room                   -0.041945
property_type_grouped_Hotel             -0.039986
host_acceptance_rate                     0.029782
neighbourhood_cleansed_Østerbro          0.028737
host_response_rate                       0.025924
review_scores_cleanliness                0.024918
review_scores_rating                     0.023393
bathrooms_text_2 baths                   0.022521


In [52]:
oslo_feats = set(lasso_oslo_selected.index)
cph_feats  = set(lasso_cph_selected.index)

common = oslo_feats & cph_feats
only_oslo = oslo_feats - cph_feats
only_cph  = cph_feats - oslo_feats

print("Common features:", common)
print("Only Oslo:", only_oslo)
print("Only Copenhagen:", only_cph)


Common features: {'estimated_revenue_l365d', 'estimated_occupancy_l365d', 'bathrooms_text_4 baths', 'host_response_rate', 'bathrooms_text_1 private bath', 'bathrooms_text_3.5 baths', 'property_category_Special', 'number_of_reviews_ltm', 'review_scores_cleanliness', 'availability_365', 'property_type_Private Standard', 'host_acceptance_rate', 'review_scores_rating', 'review_scores_value', 'minimum_minimum_nights', 'bathrooms_text_4 shared baths', 'bathrooms', 'property_type_grouped_Entire Luxury', 'bathrooms_text_2 baths', 'calculated_host_listings_count_shared_rooms', 'accommodates', 'calculated_host_listings_count', 'bathrooms_text_3 baths', 'bathrooms_text_1.5 shared baths', 'bathrooms_text_3 shared baths', 'bedrooms', 'review_scores_checkin', 'distance_to_center_km', 'bathrooms_text_Private half-bath', 'has_reviews', 'bathrooms_text_Half-bath', 'host_is_superhost', 'property_type_grouped_Private Standard', 'host_total_listings_count', 'bathrooms_text_2.5 baths', 'minimum_maximum_nig

In [56]:
lasso_model_cph = lasso_cph.named_steps["model"]

# Get coefficients
coef_cph = pd.Series(
    lasso_model_cph.coef_,
    index=feat_cph
)
lasso_cph_zero = coef_cph[coef_cph == 0]
print("\nFeatures set to zero for Copenhagen:")
print(lasso_cph_zero.index.tolist())


Features set to zero for Copenhagen:
['host_response_time', 'beds', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'has_availability', 'availability_30', 'number_of_reviews_l30d', 'number_of_reviews_ly', 'calculated_host_listings_count_private_rooms', 'property_type_grouped_Entire Standard', 'property_privacy_level_Entire', 'property_privacy_level_Private', 'neighbourhood_cleansed_Amager Øst', 'neighbourhood_cleansed_Bjerke', 'neighbourhood_cleansed_Frogner', 'neighbourhood_cleansed_Gamle Oslo', 'neighbourhood_cleansed_Grorud', 'neighbourhood_cleansed_Grünerløkka', 'neighbourhood_cleansed_Marka', 'neighbourhood_cleansed_Nordre Aker', 'neighbourhood_cleansed_Nordstrand', 'neighbourhood_cleansed_Sagene', 'neighbourhood_cleansed_Sentrum', 'neighbourhood_cleansed_St. Hanshaugen', 'neighbourhood_cleansed_Stovner', 'neighbourhood_cleansed_Søndre Nordstrand', 'neighbourhood_cleansed_Ullern', 'neighbourhood_cleansed_Vesterbro-Kongens Enghave', 'neighbourhood_cle

In [53]:
def coef_table(lasso_model, ridge_model, feature_names, city):
    return pd.DataFrame({
        "feature": feature_names,
        "city": city,
        "lasso_coef": lasso_model.named_steps["model"].coef_,
        "ridge_coef": ridge_model.named_steps["model"].coef_,
    })

# Oslo coefficients
coef_oslo = coef_table(
    lasso_oslo,
    ridge_oslo,
    feat_oslo,
    "Oslo"
)

# Copenhagen coefficients
coef_cph = coef_table(
    lasso_cph,
    ridge_cph,
    feat_cph,
    "Copenhagen"
)

# Combine into one DataFrame
coef_all = pd.concat([coef_oslo, coef_cph], ignore_index=True)

In [54]:
coef_all.to_csv("lasso_ridge_results.csv", index=False)