# Airbnb prijsvoorspelling

Gebaseerd op: [Airbnb : price prediction using XGBoost 🔥](https://www.kaggle.com/code/noufalmalki/airbnb-price-prediction-using-xgboost/notebook)

## Probleemdefinitie
Airbnb gebruikt machine learning om optimale prijsvoorstellen te doen aan hosts. Een goed gekalibreerd prijsmodel helpt hosts hun omzet te maximaliseren en tegelijkertijd concurrerend te blijven.

### Taak, Ervaring
De bedoeling is om numerieke waarden te voorspellen (prijs). Dit gaat over een _regressietaak_. We willen trainen aan de hand van effectieve prijzen. We hebben dus te maken met _gesuperviseerd_ leren.

## _Data collection_
We gebruiken een dataset van Airbnb die beschikbaar is op [Kaggle](https://www.kaggle.com/datasets/stevezhenghp/airbnb-price-prediction) met prijzen uit verschillende grote Amerikaanse steden. We richten ons op de volgende variabelen:
- **Numeriek**: bedrooms, bathrooms, review scores, etc.
- **Categorisch**: property type, room type, city
- **Tekst**: amenities
- **Geografisch**: latitude, longitude, neighborhood
- **Target variable**: `log_price`

In [None]:
import os
import re

import kagglehub
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import xgboost as xgb
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import OneHotEncoder


In [None]:
# Download the Kaggle dataset via kagglehub; `path` is used below as the
# directory that contains "train.csv"
path = kagglehub.dataset_download("stevezhenghp/airbnb-price-prediction")

In [None]:
# Load the listings from train.csv inside the downloaded dataset directory
df = pd.read_csv(os.path.join(path, "train.csv"))

In [None]:
# Preview the first rows of the raw data
df.head()


## _Data exploration_

In [None]:
# Dataset overview: column schema, summary of the target, and missing-value counts
print("Dataset Information:", "=" * 50, sep="\n")
df.info()

print("\nTarget Variable Statistics:", "=" * 50, sep="\n")
print(df["log_price"].describe())

# Keep only the columns that actually have missing values, largest count first
print("\nMissing Values:", "=" * 50, sep="\n")
missing_info = df.isnull().sum().loc[lambda counts: counts > 0].sort_values(ascending=False)
print(missing_info)

### Verdeling van de target variabele `log_price`
De log-transformatie zorgt voor een meer symmetrische verdeling. Bij lagere prijzen zijn verschillen van een bepaalde grootte belangrijker dan in de hogere regionen. De log-transformatie vertaalt dit naar de numerieke schaal van de target variabele.

In [None]:
# Distribution of the target: histogram of log prices with a box plot on top
fig = px.histogram(
    df,
    x="log_price",
    nbins=50,
    opacity=0.7,
    marginal="box",
    title="Distribution of Log-Transformed Airbnb Prices",
    labels={"log_price": "Log Price", "count": "Number of Listings"},
).update_layout(showlegend=False, height=500)
fig.show()

# Invert the log transformation and summarise the resulting prices
df["price"] = np.exp(df["log_price"])
print(
    "Actual Price Statistics:",
    f"Mean: ${df['price'].mean():.2f}",
    f"Median: ${df['price'].median():.2f}",
    f"Min: ${df['price'].min():.2f}",
    f"Max: ${df['price'].max():.2f}",
    sep="\n",
)

In [None]:
# Same histogram as for log_price, but on the back-transformed `price` column
fig = px.histogram(
    df,
    x="price",
    nbins=50,
    opacity=0.7,
    marginal="box",
    title="Distribution of Actual Airbnb Prices",
    labels={"price": "Actual Price", "count": "Number of Listings"},
).update_layout(showlegend=False, height=500)
fig.show()

### Categorische features


In [None]:
# Number of listings per room type
room_type_counts = df["room_type"].value_counts()

fig = px.bar(
    x=room_type_counts.index,
    y=room_type_counts.values,
    color=room_type_counts.index,
    text=room_type_counts.values,
    title="Distribution of Room Types",
    labels={"x": "Room Type", "y": "Count"},
)
fig.update_traces(texttemplate="%{text:.0f}", textposition="outside").update_layout(
    showlegend=False, height=500
)
fig.show()

# Number of listings for the ten most frequent property types
property_type_counts = df["property_type"].value_counts().iloc[:10]

fig = px.bar(
    x=property_type_counts.index,
    y=property_type_counts.values,
    color=property_type_counts.index,
    text=property_type_counts.values,
    title="Top 10 Property Types",
    labels={"x": "Property Type", "y": "Count"},
)
fig.update_traces(texttemplate="%{text:.0f}", textposition="outside").update_layout(
    showlegend=False, height=500
)
# Tilt the category labels on the x-axis
fig.update_xaxes(tickangle=45)
fig.show()

In [None]:
# Number of listings per city
city_counts = df["city"].value_counts()

fig = px.bar(
    x=city_counts.index,
    y=city_counts.values,
    color=city_counts.index,
    text=city_counts.values,
    title="Distribution of Listings by City",
    labels={"x": "City", "y": "Number of Listings"},
)
fig.update_traces(texttemplate="%{text:.0f}", textposition="outside").update_layout(
    showlegend=False, height=500
)
fig.show()

# Mean log price per city, highest first
city_avg_price = df.groupby("city")["log_price"].mean().sort_values(ascending=False)

fig = px.bar(
    x=city_avg_price.index,
    y=city_avg_price.values,
    color=city_avg_price.values,
    color_continuous_scale="Viridis",
    text=[format(avg, ".3f") for avg in city_avg_price.values],
    title="Average Log Price by City",
    labels={"x": "City", "y": "Average Log Price"},
)
fig.update_traces(texttemplate="%{text}", textposition="outside").update_layout(
    showlegend=False, height=500
)
fig.show()

### Geografische analyse

In [None]:
def create_price_map(city_name, df_sample):
    """Create an interactive map showing Airbnb prices for a specific city.

    Args:
        city_name: Value of the ``city`` column to plot.
        df_sample: Listings DataFrame. The columns ``city``, ``latitude``,
            ``longitude``, ``log_price``, ``room_type``, ``bedrooms`` and
            ``neighbourhood`` are read.

    Returns:
        The Plotly figure with one marker per sampled listing, coloured by
        ``log_price``.
    """
    # Sample data for performance: NYC and LA are plotted at a smaller fraction
    sample_frac = 0.3 if city_name in ("NYC", "LA") else 0.8

    city_data = df_sample[df_sample["city"] == city_name].sample(frac=sample_frac, random_state=42)

    # Create the map
    fig = px.scatter_map(
        city_data,
        lat="latitude",
        lon="longitude",
        color="log_price",
        color_continuous_scale="Viridis",
        # Use the range of the frame that was passed in (not a global), so all
        # cities drawn from the same frame share one colour scale
        range_color=[df_sample["log_price"].min(), df_sample["log_price"].max()],
        hover_data={
            "log_price": ":.3f",
            "room_type": True,
            "bedrooms": True,
            "neighbourhood": True,
        },
        title=f"Airbnb Prices in {city_name}",
        labels={"log_price": "Log Price", "room_type": "Room Type"},
        zoom=10,
        height=600,
    )

    fig.update_layout(
        # scatter_map draws on the `map` subplot, so the base-map style key is
        # `map_style`; `mapbox_style` only affects the legacy mapbox subplot
        map_style="open-street-map",
        coloraxis_colorbar={
            "title": "Log Price",
            "thicknessmode": "pixels",
            "thickness": 30,
            "lenmode": "fraction",
            "len": 0.8,
        },
    )

    return fig


# Draw one price map per major city, skipping cities absent from the data
cities_to_visualize = ["NYC", "LA", "Chicago", "Boston"]

for city in cities_to_visualize:
    if not (df["city"] == city).any():
        continue
    fig = create_price_map(city, df)
    fig.show()

### Numerieke features: Correlatieanalyse

In [None]:
# Numerical columns (target included) that go into the correlation analysis
numerical_cols = [
    "log_price",
    "bedrooms",
    "bathrooms",
    "review_scores_rating",
    "number_of_reviews",
    "beds",
]

# Pairwise correlations between those columns
corr_matrix = df[numerical_cols].corr()

# Heatmap of the full matrix on a fixed [-1, 1] colour range
fig = px.imshow(
    corr_matrix,
    text_auto=True,
    aspect="auto",
    zmin=-1,
    zmax=1,
    color_continuous_scale="RdBu_r",
    title="Correlation Heatmap of Numerical Features",
)
fig.update_layout(height=800, width=800)
fig.show()

# Correlation of every feature with the target, strongest positive first
target_corr = corr_matrix.loc[:, "log_price"].drop(index="log_price").sort_values(ascending=False)

fig = px.bar(
    x=target_corr.index,
    y=target_corr.values,
    color=target_corr.values,
    color_continuous_scale="RdBu_r",
    text=[format(coef, ".3f") for coef in target_corr.values],
    title="Feature Correlation with Log Price",
    labels={"x": "Feature", "y": "Correlation Coefficient"},
)
fig.update_traces(texttemplate="%{text}", textposition="outside").update_layout(
    showlegend=False, height=500
)
fig.show()

## _Data preparation_

### _Missing values_

In [None]:
# Impute missing values column by column, printing the missing count before
# and after each step.
# NOTE(review): all fill values are derived from / applied to the full dataset,
# i.e. before the train/test split further down.

# 1. Bathrooms - fill with the constant 1.0 (noted as the most common value)
print(f"Bathrooms missing: {df['bathrooms'].isnull().sum()}")
print(f"Bathrooms distribution:\n{df['bathrooms'].value_counts()}")
df["bathrooms"] = df["bathrooms"].fillna(1.0)
print(f"After filling: {df['bathrooms'].isnull().sum()} missing\n")

# 2. Review scores - fill with 0 (indicates no reviews)
print(f"Review scores missing: {df['review_scores_rating'].isnull().sum()}")
print(f"Review scores distribution:\n{df['review_scores_rating'].value_counts().head()}")
df["review_scores_rating"] = df["review_scores_rating"].fillna(0)
print(f"After filling: {df['review_scores_rating'].isnull().sum()} missing\n")

# 3. Bedrooms - fill with the constant 1.0 (noted as the most common value)
print(f"Bedrooms missing: {df['bedrooms'].isnull().sum()}")
df["bedrooms"] = df["bedrooms"].fillna(1.0)
print(f"After filling: {df['bedrooms'].isnull().sum()} missing\n")

# 4. Beds - fill with the constant 1.0 (noted as the most common value)
print(f"Beds missing: {df['beds'].isnull().sum()}")
df["beds"] = df["beds"].fillna(1.0)
print(f"After filling: {df['beds'].isnull().sum()} missing\n")

# 5. Host response rate - convert to a numeric fraction, then fill with mean
print(f"Host response rate missing: {df['host_response_rate'].isnull().sum()}")
# Convert percentage strings ("NN%") to fractions. This runs unconditionally:
# the column is used as a numeric feature later, so it must be converted even
# when nothing is missing. Non-string values (NaN, already numeric) pass through.
df["host_response_rate"] = df["host_response_rate"].apply(
    lambda x: float(x.rstrip("%")) / 100 if isinstance(x, str) else x
)
if df["host_response_rate"].isnull().sum() > 0:
    mean_response_rate = df["host_response_rate"].mean()
    df["host_response_rate"] = df["host_response_rate"].fillna(mean_response_rate)
    print(f"After filling: {df['host_response_rate'].isnull().sum()} missing\n")

# 6. Host has profile pic - fill with mode
print(f"Host has profile pic missing: {df['host_has_profile_pic'].isnull().sum()}")
if df["host_has_profile_pic"].isnull().sum() > 0:
    mode_profile_pic = df["host_has_profile_pic"].mode()[0]
    df["host_has_profile_pic"] = df["host_has_profile_pic"].fillna(mode_profile_pic)
    print(f"After filling: {df['host_has_profile_pic'].isnull().sum()} missing\n")

# 7. Host identity verified - fill with mode
print(f"Host identity verified missing: {df['host_identity_verified'].isnull().sum()}")
if df["host_identity_verified"].isnull().sum() > 0:
    mode_identity_verified = df["host_identity_verified"].mode()[0]
    df["host_identity_verified"] = df["host_identity_verified"].fillna(mode_identity_verified)
    print(f"After filling: {df['host_identity_verified'].isnull().sum()} missing\n")

# Show the ten columns with the most remaining missing values
print("Final missing values check:")
print(df.isnull().sum().sort_values(ascending=False).head(10))

### Data types

In [None]:
# Columns holding 't'/'f' strings or booleans that become 1/0 integers
boolean_columns = [
    "cleaning_fee",
    "instant_bookable",
    "host_has_profile_pic",
    "host_identity_verified",
]
flag_values = {"t": 1, "f": 0, True: 1, False: 0}

for col in boolean_columns:
    print(f"Converting {col}:")
    print(f"Before: {df[col].unique()}")
    # astype(int) fails if any value was not covered by the mapping
    df[col] = df[col].map(flag_values).astype(int)
    print(f"After: {df[col].unique()}")
    print(f"Data type: {df[col].dtype}\n")

# Rescale review scores by dividing by 100
ratings = df["review_scores_rating"]
print(f"Review scores range before: {ratings.min()} - {ratings.max()}")
df["review_scores_rating"] = ratings / 100
print(
    f"Review scores range after: {df['review_scores_rating'].min()} - {df['review_scores_rating'].max()}"
)

# Scale the number of reviews by its maximum, so the largest value becomes 1
max_reviews = df["number_of_reviews"].max()
print(f"Max reviews: {max_reviews}")
df["number_of_reviews"] = df["number_of_reviews"] / max_reviews
print(
    f"Normalized reviews range: {df['number_of_reviews'].min()} - {df['number_of_reviews'].max()}"
)

## Feature engineering

### Amenities
Deze variabele bestaat uit een reeks van voorzieningen en is in die vorm niet bruikbaar. We zetten de meest frequente voorzieningen om naar binaire features (aan-/afwezig).

In [None]:
# Inspect the raw amenities strings of the first three listings
df.amenities.values[:3]

In [None]:
def extract_amenities(amenities_str):
    """Split a raw amenities string into a list of cleaned amenity names.

    Double quotes and curly braces are removed, the remainder is split on
    commas and each piece is stripped of surrounding whitespace. Empty pieces
    and pieces containing "translation missing" are dropped. A missing value
    (per ``pd.isna``) yields an empty list.
    """
    if pd.isna(amenities_str):
        return []

    names = []
    for raw_item in re.sub(r'["{}]', "", amenities_str).split(","):
        name = raw_item.strip()
        if not name or "translation missing" in raw_item:
            continue
        names.append(name)
    return names


# Count amenity occurrences in a single pass over the dataset (each amenities
# string is parsed once instead of twice)
amenity_counts = {}
for amenities_str in df["amenities"]:
    for amenity in extract_amenities(amenities_str):
        amenity_counts[amenity] = amenity_counts.get(amenity, 0) + 1

# The unique amenities across the dataset are exactly the counted keys
all_amenities = set(amenity_counts)

print(f"Total unique amenities found: {len(all_amenities)}")
print("\nTop 20 most common amenities:")

# Sort amenities by count and display top 20
top_amenities = sorted(amenity_counts.items(), key=lambda x: x[1], reverse=True)[:20]
for amenity, count in top_amenities:
    print(f"- {amenity}: {count} listings ({count / len(df) * 100:.1f}%)")

In [None]:
# Create binary features for top amenities
top_amenities_list = [amenity for amenity, count in top_amenities[:15]]  # Top 15 amenities

# Parse every listing once into the set of its exact amenity names. Matching on
# parsed names instead of substrings of the raw string keeps e.g. "Internet"
# from also firing for listings that only offer "Wireless Internet".
listing_amenities = [set(extract_amenities(raw)) for raw in df["amenities"]]

for amenity in top_amenities_list:
    # Column name: lower-cased amenity with spaces, slashes and dashes as "_"
    feature_name = (
        f"amenity_{amenity.lower().replace(' ', '_').replace('/', '_').replace('-', '_')}"
    )
    df[feature_name] = [int(amenity in names) for names in listing_amenities]

print(f"Added {len(top_amenities_list)} amenity features")
print("New columns:", [col for col in df.columns if col.startswith("amenity_")][:10])

### Categorische encodering

In [None]:
# Property type: show the distribution, then collapse rare categories
print("Property type distribution:")
property_counts = df["property_type"].value_counts()
print(property_counts)

# Every type with fewer than `threshold` listings is relabelled "Other"
threshold = 300
rare_types = property_counts[property_counts < threshold].index
df["property_type"] = df["property_type"].mask(df["property_type"].isin(rare_types), "Other")

In [None]:
# One-hot encode the nominal categorical columns
categorical_columns = ["cancellation_policy", "city", "property_type", "room_type"]

encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_categorical = encoder.fit_transform(df[categorical_columns])

# Get feature names
feature_names = encoder.get_feature_names_out(categorical_columns)

# Wrap the encoded array in a DataFrame. Reusing df's index keeps the encoded
# rows aligned with their source rows when both frames are later concatenated
# along axis=1 (a fresh RangeIndex would only line up by coincidence).
df_cat = pd.DataFrame(encoded_categorical, columns=feature_names, index=df.index)

In [None]:
# Bed type encoding (simplified): two levels, 2 for a real bed and 1 otherwise
print("Bed type distribution:")
print(df["bed_type"].value_counts())

bed_type_codes = {
    "Real Bed": 2,
    **dict.fromkeys(("Futon", "Pull-out Sofa", "Airbed", "Couch"), 1),
}
# Values not listed in the mapping also fall back to 1
df["bed_type"] = df["bed_type"].map(bed_type_codes).fillna(1)

print("\nConverted bed_type to ordinal scale (1-2):")
print(df["bed_type"].value_counts())

In [None]:
# Preview the one-hot encoded categorical columns
df_cat.head()

### Neighborhood pricing
Nieuwe feature met ordinale buurt-specifieke prijs-niveaus

In [None]:
# Log price divided by the number of bedrooms; infinite results (division by
# zero bedrooms) are turned into NaN
df["price_per_bedroom"] = (df["log_price"] / df["bedrooms"]).replace(
    [np.inf, -np.inf], np.nan
)

# Mean of that ratio per neighbourhood; infinities are cleared again and
# neighbourhoods without a usable value get the mean over all neighbourhoods
neighborhood_avg = (
    df.groupby("neighbourhood")["price_per_bedroom"].mean().replace([np.inf, -np.inf], np.nan)
)
neighborhood_avg = neighborhood_avg.fillna(neighborhood_avg.mean())

print("Top 10 most expensive neighborhoods:")
print(neighborhood_avg.sort_values(ascending=False).head(10))
print("\nTop 10 least expensive neighborhoods:")
print(neighborhood_avg.sort_values().head(10))

In [None]:
# Create neighborhood price level categories
def categorize_neighborhood(price_per_bedroom, percentiles=None):
    """Categorize a price-per-bedroom value into a price level (1, 2 or 3).

    Args:
        price_per_bedroom: Value to categorize; a missing value gives the
            middle category.
        percentiles: Optional mapping/Series with the keys ``0.25`` and
            ``0.75`` holding the category boundaries. When omitted, they are
            computed from the module-level ``neighborhood_avg`` on every call,
            so pass precomputed boundaries when calling this many times.

    Returns:
        1 for values up to the 25th percentile, 2 for values up to the 75th
        percentile (and for missing input), 3 otherwise.
    """
    if pd.isna(price_per_bedroom):
        return 2  # Default to middle category

    if percentiles is None:
        percentiles = neighborhood_avg.quantile([0.25, 0.75])

    if price_per_bedroom <= percentiles[0.25]:
        return 1  # Low price area
    if price_per_bedroom <= percentiles[0.75]:
        return 2  # Medium price area
    return 3  # High price area


# Apply categorization per neighbourhood: categorize each neighbourhood's
# average once, then look the level up for every listing. Categorizing the
# listing's own price_per_bedroom would encode that row's own target value in
# the feature instead of a property of its neighbourhood.
# NOTE(review): neighborhood_avg is still computed from log_price over the
# full dataset (before the train/test split) - consider fitting it on the
# training rows only.
neighborhood_levels = neighborhood_avg.map(categorize_neighborhood)
df["neighborhood_price_level"] = (
    df["neighbourhood"]
    .map(neighborhood_levels)
    .fillna(2)  # unknown/missing neighbourhood -> middle category
    .astype(int)
)

print("Neighborhood price level distribution:")
print(df["neighborhood_price_level"].value_counts())

# Clean up - remove temporary columns
df = df.drop(["price_per_bedroom", "neighbourhood"], axis=1)

### Finale data

In [None]:
# Binary amenity features created earlier, listed by their suffix
amenity_cols = [
    f"amenity_{suffix}"
    for suffix in (
        "wireless_internet",
        "kitchen",
        "heating",
        "essentials",
        "smoke_detector",
        "air_conditioning",
        "tv",
        "shampoo",
        "hangers",
        "carbon_monoxide_detector",
        "internet",
        "laptop_friendly_workspace",
        "washer",
        "hair_dryer",
        "dryer",
    )
]

# Target, plain features, amenity flags and the neighbourhood price level
cols = [
    "log_price",
    "accommodates",
    "bathrooms",
    "bed_type",
    "city",  # keep city in for now, to be used for stratified sampling (see below)
    "cleaning_fee",
    "host_has_profile_pic",
    "host_identity_verified",
    "host_response_rate",
    "instant_bookable",
    "number_of_reviews",
    "review_scores_rating",
    "bedrooms",
    "beds",
    *amenity_cols,
    "neighborhood_price_level",
]

# Final modelling frame: selected columns plus the one-hot encoded categoricals
df = pd.concat([df[cols], df_cat], axis=1)

df.info()

In [None]:
# Hold out 20% of the listings as a test set; stratifying on city keeps the
# city proportions balanced across both parts
train_df, test_df = train_test_split(
    df,
    stratify=df["city"],
    test_size=0.2,
    random_state=42,
)

print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")
print(f"\nSplit verification: {len(train_df) + len(test_df)} total samples")

In [None]:
# `city` was only kept for stratification: drop it, then separate the target
# (`log_price`) from the features for both parts of the split
train_df = train_df.drop(columns="city")
y_train = train_df["log_price"]
X_train = train_df.drop(columns="log_price")
print(f"Training features shape: {X_train.shape}")
print(f"Training target shape: {y_train.shape}")

test_df = test_df.drop(columns="city")
y_test = test_df["log_price"]
X_test = test_df.drop(columns="log_price")
print(f"Test features shape: {X_test.shape}")
print(f"Test target shape: {y_test.shape}")

## Model definition

### Lineaire regressie

In [None]:
# Baseline: linear regression scored with 5-fold cross-validated R² on the
# training data
linear_model = LinearRegression()
cv_scores = cross_val_score(linear_model, X_train, y_train, scoring="r2", cv=5)

print(f"Linear Regression Cross-Validation R² Scores: {cv_scores}")
# The spread is reported as twice the standard deviation of the fold scores
print(f"Mean R²: {cv_scores.mean():.4f} (+/- {2 * cv_scores.std():.4f})")

### XGBoost Regression

In [None]:
# XGBoost regressor, scored with the same 5-fold cross-validated R²
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)
cv_scores_xgb = cross_val_score(xgb_model, X_train, y_train, scoring="r2", cv=5)

print(f"XGBoost Cross-Validation R² Scores: {cv_scores_xgb}")
print(f"Mean R²: {cv_scores_xgb.mean():.4f} (+/- {2 * cv_scores_xgb.std():.4f})")

# Mean cross-validated R² of both models side by side
print("\nFinal Model Comparison:")
print(f"Linear Regression: {cv_scores.mean():.4f}")
print(f"XGBoost:          {cv_scores_xgb.mean():.4f}")

# Fit the final XGBoost model on the complete training set
xgb_model.fit(X_train, y_train)

#### Scoring

In [None]:
# Make predictions on the held-out test set
test_predictions = xgb_model.predict(X_test)

# Calculate metrics on the log-price scale
mae = mean_absolute_error(y_test, test_predictions)
mse = mean_squared_error(y_test, test_predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, test_predictions)

# These metrics are computed on X_test / y_test, so they are labelled as test
# set performance (there is no separate validation set in this notebook)
print("Test Set Performance:")
print(f"MAE:  {mae:.4f}")
print(f"MSE:  {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R²:   {r2:.4f}")

# Convert back to actual prices for interpretation (inverse of the log
# transform). The variable names are kept for compatibility.
y_val_actual = np.exp(y_test)
val_predictions_actual = np.exp(test_predictions)
mae_actual = mean_absolute_error(y_val_actual, val_predictions_actual)

print("\nIn actual dollars:")
print(f"Mean actual price: ${y_val_actual.mean():.2f}")
print(f"MAE in dollars: ${mae_actual:.2f}")
print(f"This means our model is off by about ${mae_actual:.2f} on average")

In [None]:
# Feature importance from XGBoost, most important first
feature_importance = pd.DataFrame(
    {"feature": X_train.columns, "importance": xgb_model.feature_importances_}
).sort_values("importance", ascending=False)

# Plot top 15 most important features
top_features = feature_importance.head(15)

fig = px.bar(
    x=top_features["importance"],
    y=top_features["feature"],
    orientation="h",
    title="Top 15 Most Important Features (XGBoost)",
    labels={"x": "Feature Importance", "y": "Feature"},
    text=[f"{val:.4f}" for val in top_features["importance"]],
)

fig.update_traces(texttemplate="%{text}", textposition="outside")
fig.update_layout(height=600, yaxis={"categoryorder": "total ascending"})
fig.show()

print("\nBusiness Insights from Feature Importance:")
# Number the rows by rank. After sort_values the index labels are still the
# original column positions, so they cannot be used as a ranking.
for rank, row in enumerate(top_features.head(10).itertuples(index=False), start=1):
    print(f"{rank}. {row.feature}: {row.importance:.4f}")

In [None]:
# Predicted versus actual log prices on the test set, with an OLS trendline
fig = px.scatter(
    x=y_test,
    y=test_predictions,
    trendline="ols",
    opacity=0.6,
    title="Predicted vs Actual Log Prices (Test Set)",
    labels={"x": "Actual Log Price", "y": "Predicted Log Price"},
)

# Reference diagonal on which predicted equals actual
actual_range = [y_test.min(), y_test.max()]
fig.add_trace(
    go.Scatter(
        x=actual_range,
        y=actual_range,
        mode="lines",
        name="Perfect Prediction",
        line={"color": "red", "dash": "dash"},
    )
)
fig.update_layout(height=500, width=800)
fig.show()

# Residuals (actual minus predicted) plotted against the predictions
residuals = y_test - test_predictions

fig = px.scatter(
    x=test_predictions,
    y=residuals,
    opacity=0.6,
    title="Residual Plot (Test Set)",
    labels={"x": "Predicted Log Price", "y": "Residual (Actual - Predicted)"},
)

# Horizontal reference line at a residual of zero
fig.add_hline(y=0, line_dash="dash", line_color="red")
fig.update_layout(height=500, width=800)
fig.show()