In [3]:
# ==============================
# STEP 1: DATA UNDERSTANDING
# ==============================

import pandas as pd
import numpy as np

# Load March datasets
march_listings = pd.read_csv("March_23_2025_listings.csv")
march_calendar = pd.read_csv("March_23_2025_calendar.csv")
march_reviews = pd.read_csv("March_23_2025_reviews.csv")
march_neigh = pd.read_csv("March_23_2025_neighbourhoods.csv")

# Load December datasets
dec_listings = pd.read_csv("December_29_2024_listings.csv")
dec_calendar = pd.read_csv("December_29_2024_calendar.csv")
dec_reviews = pd.read_csv("December_29_2024_reviews.csv")
dec_neigh = pd.read_csv("December_29_2024_neighbourhoods.csv")

# Quick overview
print("March Listings Shape:", march_listings.shape)
print("December Listings Shape:", dec_listings.shape)
print("March Calendar Shape:", march_calendar.shape)
print("December Calendar Shape:", dec_calendar.shape)
print("March Reviews Shape:", march_reviews.shape)
print("December Reviews Shape:", dec_reviews.shape)

# Check first rows to understand columns
print(march_listings.head())
print(march_calendar.head())

FileNotFoundError: [Errno 2] No such file or directory: 'March_23_2025_listings.csv'

In [32]:
# ==============================
# STEP 2: DATA PREPARATION
# ==============================

# ---------- CLEAN PRICE COLUMN ----------
def clean_price(price_series):
    return (price_series
            .astype(str)
            .str.replace(r'[\$,]', '', regex=True)
            .str.strip()
            .replace('', np.nan)
            .astype(float))

# Clean prices
march_calendar['price'] = clean_price(march_calendar['price'])
dec_calendar['price'] = clean_price(dec_calendar['price'])
march_listings['price'] = clean_price(march_listings['price'])
dec_listings['price'] = clean_price(dec_listings['price'])

# ---------- CALCULATE AVAILABILITY ----------
for cal in [march_calendar, dec_calendar]:
    cal['available_flag'] = cal['available'].map({'t':1,'f':0})

march_avail = march_calendar.groupby('listing_id')['available_flag'].mean().reset_index()
march_avail.rename(columns={'available_flag':'availability_norm'}, inplace=True)

dec_avail = dec_calendar.groupby('listing_id')['available_flag'].mean().reset_index()
dec_avail.rename(columns={'available_flag':'availability_norm'}, inplace=True)

# Add month column
march_listings['month'] = 3
dec_listings['month'] = 12

# Merge availability
march_df = march_listings.merge(march_avail, left_on='id', right_on='listing_id', how='left')
dec_df = dec_listings.merge(dec_avail, left_on='id', right_on='listing_id', how='left')

# Combine months
price_df = pd.concat([march_df, dec_df], axis=0).reset_index(drop=True)

# ---------- HANDLE MISSING VALUES ----------
numeric_cols = ['accommodates','bedrooms','beds','bathrooms','number_of_reviews',
                'review_scores_rating','reviews_per_month','minimum_nights']
for col in numeric_cols:
    if col in price_df.columns:
        price_df[col] = price_df[col].fillna(price_df[col].median())

categorical_cols = ['property_type','room_type','neighbourhood_cleansed','host_is_superhost','instant_bookable']
for col in categorical_cols:
    price_df[col] = price_df[col].fillna('Unknown')

# ---------- FEATURE ENGINEERING ----------
price_df['amenities_count'] = price_df['amenities'].apply(lambda x: len(x.strip('{}').split(',')))

# Remove extreme high prices (top 1%)
price_df = price_df[price_df['price'] < price_df['price'].quantile(0.99)]

# Group rare neighborhoods (<10 listings) into 'Other'
neigh_counts = price_df['neighbourhood_cleansed'].value_counts()
rare_neigh = neigh_counts[neigh_counts < 10].index
price_df['neighbourhood_cleansed'] = price_df['neighbourhood_cleansed'].replace(rare_neigh, 'Other')

# Log-transform price for XGBoost stability
price_df['price_log'] = np.log1p(price_df['price'])

# Logit transform availability
epsilon = 1e-3
price_df['availability_logit'] = np.log((price_df['availability_norm'] + epsilon) /
                                        (1 - price_df['availability_norm'] + epsilon))

# Average historical availability
price_df['avg_availability'] = price_df.groupby('id')['availability_norm'].transform('mean')

# ---------- ENCODE CATEGORICAL FEATURES ----------
from sklearn.preprocessing import LabelEncoder
encoders = {}
for col in ['property_type','room_type','neighbourhood_cleansed','host_is_superhost','instant_bookable']:
    le = LabelEncoder()
    price_df[col] = le.fit_transform(price_df[col])
    encoders[col] = le

# ---------- DEFINE FEATURES ----------
price_features = ['property_type','room_type','neighbourhood_cleansed','accommodates',
                  'bedrooms','beds','bathrooms','amenities_count','host_is_superhost',
                  'instant_bookable','month','reviews_per_month','review_scores_rating','minimum_nights','number_of_reviews']

avail_features = ['property_type','room_type','neighbourhood_cleansed','accommodates',
                  'bedrooms','beds','bathrooms','amenities_count','host_is_superhost',
                  'instant_bookable','month','price_log','avg_availability','reviews_per_month']

# Drop rows with NaNs in targets
price_df = price_df.dropna(subset=['price_log','availability_logit'])

In [35]:
# ==============================
# STEP 3: MODELING + EVALUATION
# ==============================

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# ----- PRICE MODEL (XGBoost) -----
X_price = price_df[price_features]
y_price = price_df['price_log']

X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(X_price, y_price, test_size=0.2, random_state=42)

price_model = XGBRegressor(
    n_estimators=1000,
    max_depth=10,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)
price_model.fit(X_train_p, y_train_p)

# Predict
y_test_pred_p = price_model.predict(X_test_p)
y_train_pred_p = price_model.predict(X_train_p)

# Back-transform
y_test_pred_p_real = np.expm1(y_test_pred_p)
y_train_pred_p_real = np.expm1(y_train_pred_p)

# Evaluation
mae_price = mean_absolute_error(np.expm1(y_test_p), y_test_pred_p_real)
rmse_price = np.sqrt(mean_squared_error(np.expm1(y_test_p), y_test_pred_p_real))
r2_price = r2_score(np.expm1(y_test_p), y_test_pred_p_real)

print("PRICE MODEL EVALUATION (XGBoost)")
print(f"R² Score: {r2_price:.4f}")
print(f"MAE: ${mae_price:.2f}")
print(f"RMSE: ${rmse_price:.2f}\n")

# ----- AVAILABILITY MODEL (Random Forest, same as before) -----
X_avail = price_df[avail_features]
y_avail = price_df['availability_logit']

X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(X_avail, y_avail, test_size=0.2, random_state=42)

availability_model = RandomForestRegressor(
    n_estimators=800,
    max_depth=30,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)
availability_model.fit(X_train_a, y_train_a)

# Predict
y_test_pred_a = availability_model.predict(X_test_a)
y_train_pred_a = availability_model.predict(X_train_a)

# Back-transform
def inverse_logit(x):
    return 1 / (1 + np.exp(-x))

y_test_pred_a_real = inverse_logit(y_test_pred_a)
y_train_pred_a_real = inverse_logit(y_train_pred_a)

# Evaluation
mae_avail = mean_absolute_error(inverse_logit(y_test_a), y_test_pred_a_real)
rmse_avail = np.sqrt(mean_squared_error(inverse_logit(y_test_a), y_test_pred_a_real))
r2_avail = r2_score(inverse_logit(y_test_a), y_test_pred_a_real)

print("AVAILABILITY MODEL EVALUATION (Random Forest)")
print(f"R² Score: {r2_avail:.4f}")
print(f"MAE: {mae_avail*100:.2f}%")
print(f"RMSE: {rmse_avail*100:.2f}%")

PRICE MODEL EVALUATION (XGBoost)
R² Score: 0.7229
MAE: $27.62
RMSE: $44.06

AVAILABILITY MODEL EVALUATION (Random Forest)
R² Score: 0.9017
MAE: 5.99%
RMSE: 9.36%


In [41]:
import joblib

# Save XGBoost Price Model
joblib.dump(price_model, "price_model_xgb.pkl")

joblib.dump(encoders, "price_encoders.pkl")

# Save Random Forest Availability Model
joblib.dump(availability_model, "availability_model_rf.pkl")

print("✅ Models saved successfully!")

✅ Models saved successfully!


In [43]:
import joblib
import os

# Path to save models
project_path = r"C:\Users\Edmar\PycharmProjects\finals"

# Ensure the folder exists
os.makedirs(project_path, exist_ok=True)

# Save XGBoost Price Model
joblib.dump(price_model, os.path.join(project_path, "price_model_xgb.pkl"))

joblib.dump(encoders, os.path.join(project_path, "price_encoders.pkl"))

# Save Random Forest Availability Model
joblib.dump(availability_model, os.path.join(project_path, "availability_model_rf.pkl"))

print("✅ Models saved in your PyCharm project folder!")

✅ Models saved in your PyCharm project folder!
