### Import Libraries

In [5]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from xgboost import XGBRegressor
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

### For Colab Env

In [2]:
# Mount Google Drive only when the notebook is running inside Colab.
# google.colab is not importable elsewhere, so the import is guarded to let the
# rest of the notebook run against a locally accessible copy of the data.
try:
    from google.colab import drive
except ImportError:
    print("google.colab not available - skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Location of the input CSV. Defaults to the mounted Google Drive path; set the
# AIRBNB_DATA_PATH environment variable to point at a different copy.
DATA_PATH = os.environ.get(
    "AIRBNB_DATA_PATH",
    "/content/drive/MyDrive/UNI-STUFF/SC1015/airbnb_with_landmark_metro_features.csv",
)
df = pd.read_csv(DATA_PATH)

### Format and Clean Base Data

In [8]:
# Cleans the loaded frame `df` in place and derives two modelling frames:
#   clean_df - base listing features plus the `price` target
#   eng_df   - clean_df plus the landmark / metro proximity columns

# Drop irrelevant columns
# errors='ignore' silently skips any listed column that is not present.
columns_to_drop = [
    'thumbnail_url', 'amenities', 'description', 'host_has_profile_pic',
    'host_identity_verified', 'host_response_rate', 'host_since', 'name',
    'neighbourhood', 'zipcode'
]
df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Drop rows with missing values
# Applies to every remaining column, and runs before the review dates are
# coerced below, so NaT values produced by that coercion are not removed here.
df.dropna(inplace=True)

# Condense room_type
# Two buckets: entire homes become "Private"; private rooms and shared rooms
# are both labelled "Shared".
df['room_type'] = df['room_type'].replace({
    'Entire home/apt': "Private",
    'Private room': "Shared",
    'Shared room': "Shared"
})

# Condense property_type
# count / 50 < 5 is the same as count < 250: property types with fewer than
# 250 remaining rows are merged into 'Other'.
property_typ_vc = df["property_type"].value_counts() / 50
others = property_typ_vc[property_typ_vc < 5].index
df['property_type'] = df['property_type'].replace(others, 'Other')

# Ensure review dates are in datetime format
# errors='coerce' turns unparseable values into NaT instead of raising.
df['first_review'] = pd.to_datetime(df['first_review'], errors='coerce')
df['last_review'] = pd.to_datetime(df['last_review'], errors='coerce')

# Create a review duration feature
# NOTE(review): review_duration is added to df only; it is not selected into
# clean_df or temp_df below, so neither modelling frame contains it.
df['review_duration'] = (df['last_review'] - df['first_review']).dt.days

# Prepare a new cleaned DataFrame
# .copy() so the column assignments below do not write back into df.
clean_df = df[[
    'property_type', 'room_type', 'accommodates', 'bathrooms', 'bed_type',
    'cancellation_policy', 'cleaning_fee', 'city', 'instant_bookable',
    'latitude', 'longitude', 'number_of_reviews', 'review_scores_rating',
    'bedrooms', 'beds'
]].copy()

# Add transformed or derived features
# price is the modelling target; assumes log_price is a natural log - TODO confirm.
# Series.map leaves NaN for any instant_bookable value other than 't' / 'f'.
clean_df['price'] = np.exp(df['log_price'])  # Transform log_price back to price
clean_df['instant_bookable'] = clean_df['instant_bookable'].map({'t': True, 'f': False})  # Map to boolean
clean_df['total_review_score'] = clean_df['number_of_reviews'] * clean_df['review_scores_rating']  # Derived feature

# Create separate dataframe with engineered features
# temp_df and clean_df are both sliced from df, so the column-wise concat
# aligns them on the same row index.
temp_df = df[['landmarks_within_500m', 'landmarks_within_1000m',
                'landmarks_within_2000m', 'avg_distance_to_landmarks',
                'metro_within_500m', 'metro_within_1000m', 'metro_within_5000m',
                'shortest_distance_to_metro']].copy()
eng_df = pd.concat([clean_df, temp_df], axis=1)

## Step 1: Define Utility Functions
Create reusable functions for:
- Preparing features and target variables dynamically.
- Evaluating models and printing metrics.
- Running hyperparameter tuning using a successive-halving grid search (`HalvingGridSearchCV`).

In [16]:
# Step 1: Define Utility Functions
def prepare_data(df, target_column, *, test_size=0.2, random_state=42):
    """
    Prepare features and target variables for a dataset.

    The target column is separated from the features, categorical feature
    columns are one-hot encoded, and the result is split into train and
    test sets.

    Args:
        df (pd.DataFrame): The input dataset.
        target_column (str): The name of the target column.
        test_size (float): Fraction of rows held out for testing.
            Defaults to 0.2.
        random_state (int): Seed for the train/test shuffle. Defaults to 42.

    Returns:
        X_train, X_test, y_train, y_test: Train-test split datasets.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    # Separate features and target
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # One-hot encode categorical variables. drop_first=True removes one level
    # per categorical column so the dummy columns are not redundant.
    X = pd.get_dummies(X, drop_first=True)

    # Split into training and testing sets
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

def evaluate(model, X_train, X_test, y_train, y_test):
    """
    Train ``model`` on the training split and score it on the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: Features and target used for fitting.
        X_test, y_test: Features and target used for scoring.

    Returns:
        dict: ``{"r2": ..., "mse": ...}`` computed on the test split.
    """
    # fit() trains the object that was passed in, so the caller's model is
    # fitted once this returns.
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    scores = {}
    scores["r2"] = r2_score(y_test, predictions)
    scores["mse"] = mean_squared_error(y_test, predictions)
    return scores

def print_metrics(model, X_train, X_test, y_train, y_test):
    """
    Fit and score ``model`` via ``evaluate`` and print the results.

    Prints the test-set R² and the root mean squared error (the square root
    of the MSE returned by ``evaluate``). Returns nothing.
    """
    scores = evaluate(model, X_train, X_test, y_train, y_test)
    rmse = scores['mse'] ** 0.5
    print(f"R² Score: {scores['r2']}")
    print(f"Root Mean Squared Error: {rmse}")

def hyperparameter_tuning(X_train, y_train, random_state=42):
    """
    Perform hyperparameter tuning using HalvingGridSearchCV.

    Searches learning rate, tree depth and L2 regularization for an
    XGBRegressor with 3-fold cross-validation, scoring by R².

    Args:
        X_train, y_train: Training data.
        random_state (int): Seed used for both the XGBRegressor and the
            halving search, so repeated runs select the same model.
            Defaults to 42.

    Returns:
        best_model: The best model from grid search.
    """
    # "eta" and "lambda" are XGBoost's native parameter names (aliases of
    # learning_rate and reg_lambda).
    param_grid = {
        "device": ["cpu"],  # Train on CPU; single value, so not searched over
        "eta": np.arange(0.01, 0.51, 0.05),  # Learning rate: 0.01, 0.06, ..., 0.46
        "max_depth": range(3, 8),  # Tree depth: 3..7
        "lambda": range(0, 5)  # L2 regularization strength: 0..4
    }

    grid_search = HalvingGridSearchCV(
        estimator=XGBRegressor(random_state=random_state),
        param_grid=param_grid,
        cv=3,
        factor=2,  # keep roughly half of the candidates after each round
        scoring="r2",
        # Early rounds score candidates on a subsample of the training rows;
        # seeding the search makes that subsample, and so the result, repeatable.
        random_state=random_state,
        verbose=1
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

## Step 2: Prepare Data Dynamically
Apply the data preparation function to both `clean_df` and `eng_df` so that the baseline and engineered feature sets are one-hot encoded and split into train/test sets in exactly the same way.

In [11]:
# Step 2: Prepare Data Dynamically
# Both frames go through the same helper, with "price" as the target.

# Baseline feature set (clean_df)
(
    X_train_clean,
    X_test_clean,
    y_train_clean,
    y_test_clean,
) = prepare_data(clean_df, target_column="price")

# Baseline plus landmark / metro features (eng_df)
(
    X_train_eng,
    X_test_eng,
    y_train_eng,
    y_test_eng,
) = prepare_data(eng_df, target_column="price")

## Step 3: Train and Evaluate Models
Train models on both datasets and evaluate their performance.

In [14]:
# Step 3: Train and Evaluate Models
# Two identically configured baseline regressors, one per feature set.
model_clean = XGBRegressor(random_state=42, device="cpu")
model_eng = XGBRegressor(random_state=42, device="cpu")

# (heading, model, (X_train, X_test, y_train, y_test)) for each run, in print order.
_baseline_runs = (
    ("Evaluation for clean_df:", model_clean,
     (X_train_clean, X_test_clean, y_train_clean, y_test_clean)),
    ("Evaluation for eng_df:", model_eng,
     (X_train_eng, X_test_eng, y_train_eng, y_test_eng)),
)
for _heading, _model, _split in _baseline_runs:
    print(_heading)
    print_metrics(_model, *_split)

Evaluation for clean_df:
R² Score: 0.6524236320093406
Root Mean Squared Error: 79.1567399482531
Evaluation for eng_df:
R² Score: 0.6662380750111618
Root Mean Squared Error: 77.56774659113088


## Step 4: Hyperparameter Tuning
Perform hyperparameter tuning for XGBoost on both datasets and evaluate the best models.

In [17]:
# Step 4: Hyperparameter Tuning
# For each feature set: run the search on the training split, then report
# test-set metrics for the selected model.

# Baseline feature set (clean_df)
print("Hyperparameter Tuning for clean_df:")
best_model_clean = hyperparameter_tuning(X_train=X_train_clean, y_train=y_train_clean)
print_metrics(
    model=best_model_clean,
    X_train=X_train_clean,
    X_test=X_test_clean,
    y_train=y_train_clean,
    y_test=y_test_clean,
)

# Baseline plus landmark / metro features (eng_df)
print("Hyperparameter Tuning for eng_df:")
best_model_eng = hyperparameter_tuning(X_train=X_train_eng, y_train=y_train_eng)
print_metrics(
    model=best_model_eng,
    X_train=X_train_eng,
    X_test=X_test_eng,
    y_train=y_train_eng,
    y_test=y_test_eng,
)

Hyperparameter Tuning for clean_df:
n_iterations: 8
n_required_iterations: 8
n_possible_iterations: 8
min_resources_: 357
max_resources_: 45702
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 250
n_resources: 357
Fitting 3 folds for each of 250 candidates, totalling 750 fits
----------
iter: 1
n_candidates: 125
n_resources: 714
Fitting 3 folds for each of 125 candidates, totalling 375 fits
----------
iter: 2
n_candidates: 63
n_resources: 1428
Fitting 3 folds for each of 63 candidates, totalling 189 fits
----------
iter: 3
n_candidates: 32
n_resources: 2856
Fitting 3 folds for each of 32 candidates, totalling 96 fits
----------
iter: 4
n_candidates: 16
n_resources: 5712
Fitting 3 folds for each of 16 candidates, totalling 48 fits
----------
iter: 5
n_candidates: 8
n_resources: 11424
Fitting 3 folds for each of 8 candidates, totalling 24 fits
----------
iter: 6
n_candidates: 4
n_resources: 22848
Fitting 3 folds for each of 4 candidates, totalling 12 fits
--------

### Key Findings
1. Engineered Features Help:
  - Incorporating engineered features related to landmark and metro station proximity slightly improved the model’s ability to predict Airbnb prices (baseline XGBoost: R² 0.652 → 0.666, RMSE 79.2 → 77.6). This is consistent with the hypothesis that geography, specifically proximity to landmarks and metro stations, influences pricing.
2. Overfitting Risk:
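  - The decline in performance after hyperparameter tuning suggests the potential for overfitting or a mismatch in parameter settings for the given data. Note that the halving search scores its first round of 250 candidates on only 357 training samples, so configurations that would perform well on the full training set can be eliminated early.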
3. Room for Improvement:
  - An R² of ~0.65 indicates that there are other important factors (e.g., seasonal trends, detailed property descriptions, host quality) that the model has not accounted for.