In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler

# 1. LOAD AND PREPARE DATA
df = pd.read_csv("merged.csv", low_memory=False)

# Filter for the two cities only; .copy() gives an independent frame.
df_sub = df[df['city'].isin(['Copenhagen', 'Oslo'])].copy()

# 2. FEATURE CLEANING
# Keep numeric features, drop target-related columns
features = df_sub.select_dtypes(include=[np.number]).columns.tolist()
if 'price_local' in features: features.remove('price_local')
if 'id' in features: features.remove('id')

# Impute missing values with each column's median. A column that is entirely
# NaN has a NaN median, so it stays NaN and is removed by dropna(how='all').
X = df_sub[features].apply(pd.to_numeric, errors='coerce')
X = X.fillna(X.median()).dropna(axis=1, how='all')

# Target: 1 for Oslo, 0 for Copenhagen
y = (df_sub['city'] == 'Oslo').astype(int)

# 3. SCALING (Crucial for Logistic Regression and L1 penalty)
# NOTE: the scaler is fit on every row before cross-validation, so each CV fold
# is standardised with statistics that include its own held-out rows.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 4. LOGISTIC REGRESSION WITH L1 PENALTY (Feature Selection)
# We use 'liblinear' solver because it supports L1
# NOTE: geographic columns (e.g. latitude) are still part of X in this cell.
clf = LogisticRegressionCV(cv=5, penalty='l1', solver='liblinear', random_state=42)
clf.fit(X_scaled, y)

# 5. RESULTS: EXTRACT COEFFICIENTS
coefs = pd.Series(clf.coef_[0], index=X.columns).sort_values()

# The L1 penalty can shrink coefficients to exactly zero. A zero coefficient
# points to neither city, so only strictly signed coefficients are reported;
# otherwise head()/tail() would list zeros under the city headings.
copenhagen_coefs = coefs[coefs < 0].head(10)
oslo_coefs = coefs[coefs > 0].tail(10)

print("\n--- FEATURES DISTINGUISHING THE MARKETS ---")
print("\nTop Features identifying COPENHAGEN (Negative Coefficients):")
print(copenhagen_coefs if not copenhagen_coefs.empty
      else "(none: no feature kept a negative coefficient)")

print("\nTop Features identifying OSLO (Positive Coefficients):")
print(oslo_coefs if not oslo_coefs.empty
      else "(none: no feature kept a positive coefficient)")

# scores_[1] is a grid with one row per CV fold and one column per candidate C.
# Averaging the whole grid would mix in the scores of every rejected C, so
# average over folds first and report the best C's cross-validated accuracy
# (the C that the refit model uses).
mean_accuracy_per_C = clf.scores_[1].mean(axis=0)
best_cv_accuracy = mean_accuracy_per_C.max()

print(f"\nModel Accuracy (How different are these cities structurally?): {best_cv_accuracy:.2%}")


--- FEATURES DISTINGUISHING THE MARKETS ---

Top Features identifying COPENHAGEN (Negative Coefficients):
host_id                        0.0
number_of_reviews_ltm          0.0
number_of_reviews_l30d         0.0
number_of_reviews_ly           0.0
estimated_occupancy_l365d      0.0
estimated_revenue_l365d        0.0
review_scores_rating           0.0
review_scores_cleanliness      0.0
review_scores_checkin          0.0
review_scores_communication    0.0
dtype: float64

Top Features identifying OSLO (Positive Coefficients):
maximum_nights            0.00000
maximum_minimum_nights    0.00000
minimum_maximum_nights    0.00000
maximum_maximum_nights    0.00000
minimum_nights_avg_ntm    0.00000
maximum_nights_avg_ntm    0.00000
availability_30           0.00000
minimum_minimum_nights    0.00000
distance_to_center_km     0.00000
latitude                  0.81537
dtype: float64

Model Accuracy (How different are these cities structurally?): 100.00%


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler

# 1. LOAD AND PREPARE DATA
df = pd.read_csv("merged.csv", low_memory=False)

# Filter for the two cities only; .copy() gives an independent frame.
df_sub = df[df['city'].isin(['Copenhagen', 'Oslo'])].copy()

# 2. REFINED FEATURE SELECTION
# We drop geography-based features because they are "trivial" predictors of city
# NOTE(review): other identifier-like or location-derived numeric columns
# (e.g. host_id, distance_to_center_km) are still kept as features — confirm
# that this is intended.
drop_from_features = [
    'price_local', 'latitude', 'longitude', 'id'
]

features = df_sub.select_dtypes(include=[np.number]).columns.tolist()
features = [f for f in features if f not in drop_from_features]

# Impute missing values with each column's median. A column that is entirely
# NaN has a NaN median, so it stays NaN and is removed by dropna(how='all').
X = df_sub[features].apply(pd.to_numeric, errors='coerce')
X = X.fillna(X.median()).dropna(axis=1, how='all')

# Target: 1 for Oslo, 0 for Copenhagen
y = (df_sub['city'] == 'Oslo').astype(int)

# 3. SCALING
# NOTE: the scaler is fit on every row before cross-validation, so each CV fold
# is standardised with statistics that include its own held-out rows.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 4. LOGISTIC REGRESSION (L1 penalty for clean feature selection)
clf = LogisticRegressionCV(cv=5, penalty='l1', solver='liblinear', random_state=42)
clf.fit(X_scaled, y)

# 5. RESULTS
coefs = pd.Series(clf.coef_[0], index=X.columns).sort_values()

# The L1 penalty can shrink coefficients to exactly zero. A zero coefficient
# points to neither city, so only strictly signed coefficients are reported.
copenhagen_coefs = coefs[coefs < 0].head(5)
oslo_coefs = coefs[coefs > 0].tail(5)

print("\n--- STRUCTURAL MARKET DISTINCTIONS (Non-Geographic) ---")
print("\nTop 5 Features identifying COPENHAGEN:")
print(copenhagen_coefs if not copenhagen_coefs.empty
      else "(none: no feature kept a negative coefficient)")

print("\nTop 5 Features identifying OSLO:")
print(oslo_coefs if not oslo_coefs.empty
      else "(none: no feature kept a positive coefficient)")

# scores_[1] is a grid with one row per CV fold and one column per candidate C.
# Averaging the whole grid would mix in the scores of every rejected C, so
# average over folds first and report the best C's cross-validated accuracy
# (the C that the refit model uses).
mean_accuracy_per_C = clf.scores_[1].mean(axis=0)
best_cv_accuracy = mean_accuracy_per_C.max()

print(f"\nModel Accuracy (Structural Difference): {best_cv_accuracy:.2%}")


--- STRUCTURAL MARKET DISTINCTIONS (Non-Geographic) ---

Top 5 Features identifying COPENHAGEN:
availability_30             -1.006017
number_of_reviews_l30d      -0.425385
number_of_reviews_ly        -0.174181
review_scores_checkin       -0.156301
host_total_listings_count   -0.104368
dtype: float64

Top 5 Features identifying OSLO:
accommodates           0.068164
review_scores_value    0.330995
availability_90        0.626843
reviews_per_month      0.676428
availability_60        0.885887
dtype: float64

Model Accuracy (Structural Difference): 71.19%
