In [1]:
"""
K-Means Clustering for Airbnb Price Segmentation
"""
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

In [2]:
# Read the preprocessed train and test splits (train is read first, then test)
train, test = map(
    pd.read_csv,
    ('airbnb_train_processed.csv', 'airbnb_test_processed.csv'),
)

In [3]:
# Build the feature matrix by removing the target column.
# 'price_cluster' is appended to `train` in place by a later cell, so it is
# dropped here too when present: re-running this cell out of order must not
# leak the cluster label into the features K-Means is fitted on.
X = train.drop(columns=['price', 'price_cluster'], errors='ignore')
# Target values, kept separately from the features (a missing 'price' column
# still fails here with a KeyError)
prices = train['price']

In [4]:
# Cluster the training listings into 5 segments based on their features
# (X excludes the price column; prices are only summarised per cluster later).
# n_init is set explicitly: scikit-learn changed its default from 10 to 'auto',
# so relying on the default makes the segments depend on the installed version.
# NOTE(review): K-Means is distance-based; this assumes the processed features
# are already on comparable scales - confirm against the preprocessing step.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
# Fit on the training features and keep each training row's cluster label
clusters = kmeans.fit_predict(X)

In [5]:
# Store each training row's K-Means label as a new column on `train`
train['price_cluster'] = pd.Series(clusters, index=train.index)

In [6]:
# Average training price within each cluster, indexed by cluster label
cluster_means = train['price'].groupby(train['price_cluster']).mean()
# Header and table are printed on separate lines
print('K-Means Price Segments:', cluster_means, sep='\n')

K-Means Price Segments:
price_cluster
0    458.588710
1    190.500819
2    305.631211
3    161.912121
4    180.196145
Name: price, dtype: float64


In [7]:
# Select exactly the columns K-Means was fitted on, in the training order.
# Unlike dropping 'price' alone, this stays correct when the cell is re-run
# after 'price_cluster' / 'predicted_price' have been added to `test`, and it
# fails loudly (KeyError) if a training feature is missing from the test set.
X_test = test[X.columns]

# Predict clusters for test data
test_clusters = kmeans.predict(X_test)

# Attach cluster labels to the test DataFrame
test['price_cluster'] = test_clusters

# Predicted price = mean training price of the assigned cluster.
# The label-based lookup is vectorised and still raises KeyError for a label
# that has no training mean; .to_numpy() keeps the assignment positional so it
# does not depend on the index of `test`.
test['predicted_price'] = cluster_means.loc[test_clusters].to_numpy()

In [8]:
# Compare the cluster-mean predictions with the actual test prices
y_true, y_pred = test['price'], test['predicted_price']

# Coefficient of determination (R²)
r2 = r2_score(y_true, y_pred)

# Root mean squared error: square root of the MSE
rmse = np.sqrt(mean_squared_error(y_true, y_pred))

# Preview the first rows of actual vs. predicted prices with their cluster
preview_cols = ['price', 'predicted_price', 'price_cluster']
print('Sample predictions:', test[preview_cols].head(), sep='\n')

# Report both metrics, one per line
print(f'R² score: {r2:.4f}', f'RMSE: {rmse:.2f}', sep='\n')

Sample predictions:
    price  predicted_price  price_cluster
0   240.0       458.588710              0
1   466.0       190.500819              1
2    62.0       190.500819              1
3  1026.0       458.588710              0
4   972.0       305.631211              2
R² score: 0.0874
RMSE: 285.44
