In [11]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

In [12]:
aggregated_data = pd.read_csv('chicago_preprocessed.csv')

In [15]:
# Define categorical and numerical columns
categorical_cols = ['Property Type', 'Listing Type', 'Neighborhood']
numerical_cols = [
    'rating_ave_pastYear', 'numReviews_pastYear', 'numCancel_pastYear', 'num_5_star_Rev_pastYear',
    'prop_5_StarReviews_pastYear', 'numReservedDays_pastYear', 'numReserv_pastYear', 'available_days',
    'available_days_aveListedPrice', 'booked_days', 'booked_days_avePrice', 'Bedrooms', 'Bathrooms',
    'Max Guests', 'Cleaning Fee (USD)', 'Minimum Stay', 'Number of Photos', 'Nightly Rate',
    'Number of Reviews', 'Rating Overall', 'revenue', 'property_age_months'
]

# Exclude unnecessary columns
exclude_cols = ['Scraped Date', 'Airbnb Host ID', 'Airbnb Property ID', 'Created Date']

# Preprocess the dataset
def preprocess_data(df, categorical_cols, numerical_cols, reference_columns=None):
    # Drop excluded columns
    df = df.drop(columns=exclude_cols, errors='ignore')
    # Convert categorical columns to dummy variables
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
    # Standardize numerical columns
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    # Align columns with reference_columns (if provided)
    if reference_columns is not None:
        for col in reference_columns:
            if col not in df.columns:
                df[col] = 0
        df = df[reference_columns]
    return df

# Split the dataset
training_data = aggregated_data[aggregated_data['superhost_period_all'] <= 18]
testing_data = aggregated_data[aggregated_data['superhost_period_all'] == 19]
prediction_data = aggregated_data[aggregated_data['superhost_period_all'] == 20]

# Preprocess train, test, and prediction sets
X_train = preprocess_data(training_data.drop(['occupancy_rate'], axis=1), categorical_cols, numerical_cols)
y_train = training_data['occupancy_rate']

X_test = preprocess_data(testing_data.drop(['occupancy_rate'], axis=1), categorical_cols, numerical_cols, X_train.columns)
y_test = testing_data['occupancy_rate']

X_pred = preprocess_data(prediction_data.drop(['occupancy_rate'], axis=1), categorical_cols, numerical_cols, X_train.columns)

# Linear Regression Model
print("Training Linear Regression Model...")
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_test_pred_lr = lr_model.predict(X_test)
mape_lr = mean_absolute_percentage_error(y_test, y_test_pred_lr)
print(f"Linear Regression MAPE: {mape_lr:.4f}")

# Gradient Boosting Model
print("Training Gradient Boosting Model...")
gb_model = GradientBoostingRegressor(random_state=42)
param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1, 0.2]
}
gb_grid = GridSearchCV(gb_model, param_grid, scoring='neg_mean_absolute_percentage_error', cv=3, verbose=1, n_jobs=-1)
gb_grid.fit(X_train, y_train)
best_gb_model = gb_grid.best_estimator_
y_test_pred_gb = best_gb_model.predict(X_test)
mape_gb = mean_absolute_percentage_error(y_test, y_test_pred_gb)
print(f"Gradient Boosting MAPE: {mape_gb:.4f}")

# Compare Models
print(f"\nMAPE Comparison:\nLinear Regression: {mape_lr:.4f}\nGuradient Boosting: {mape_gb:.4f}")

# Choose Best Model
best_model = lr_model if mape_lr < mape_gb else best_gb_model
y_pred = best_model.predict(X_pred)

# Save Predictions for Period 20
prediction_data['Predicted Occupancy Rate'] = y_pred
print("Predictions for Period 20:")
print(prediction_data[['Airbnb Host ID', 'Airbnb Property ID', 'Predicted Occupancy Rate']])

Training Linear Regression Model...
Linear Regression MAPE: 1.6750
Training Gradient Boosting Model...
Fitting 3 folds for each of 18 candidates, totalling 54 fits
Gradient Boosting MAPE: 1.2121

MAPE Comparison:
Linear Regression: 1.6750
Gradient Boosting: 1.2121
Predictions for Period 20:
       Airbnb Host ID  Airbnb Property ID  Predicted Occupancy Rate
15             2613.0              2384.0                  0.264272
31             5775.0              4505.0                  0.205164
58            17928.0              7126.0                  0.260349
73            33004.0              9811.0                  0.281617
89            33004.0             10945.0                  0.293849
...               ...                 ...                       ...
72122     289257440.0          42714699.0                  0.482583
72123     273627718.0          42746404.0                  0.051665
72124     336396789.0          42776417.0                  0.183601
72125      44131859.0       

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_data['Predicted Occupancy Rate'] = y_pred
