In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pickle
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import time
import scipy.stats as st
from scipy.stats import randint
from sklearn.model_selection import cross_val_score
import optuna
from sklearn.metrics import mean_absolute_percentage_error

In [2]:
df = pd.read_csv("../data/clean/airbnbnyc_final.csv")
df.head()

Unnamed: 0,host_is_superhost,neighbourhood_group_cleansed,latitude,longitude,room_type,accommodates,bedrooms,price,maximum_nights,availability_365,number_of_reviews,calculated_host_listings_count,reviews_per_month,total_amenities
0,False,Brooklyn,40.64529,-73.97238,Private room,1.098612,0.693147,5.655992,730,347,2.302585,1.94591,0.076961,761
1,False,Manhattan,40.75356,-73.98559,Entire home/apt,0.693147,0.0,5.484797,1125,312,3.912023,1.386294,0.231112,553
2,False,Brooklyn,40.66265,-73.99454,Entire home/apt,1.609438,1.098612,5.375278,730,80,1.609438,0.693147,0.029559,816
3,True,Brooklyn,40.70935,-73.95342,Entire home/apt,1.386294,1.098612,4.584967,120,219,5.283204,0.693147,0.693147,384
4,False,Manhattan,40.78778,-73.94759,Private room,0.693147,1.098612,4.158883,120,254,5.537334,0.693147,0.845868,364


# Train test Split

In [4]:

features = df.drop(columns = ["price"])
target = df["price"]
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=0)

# Encoding Categorical Column

In [5]:
ohe = OneHotEncoder(drop="first",sparse_output=False) 
ohe.fit(X_train[['host_is_superhost','neighbourhood_group_cleansed','room_type']]) # The .fit() method determines the unique values of each column
X_train_trans_np = ohe.transform(X_train[['host_is_superhost','neighbourhood_group_cleansed','room_type']])
X_train_trans_df = pd.DataFrame(X_train_trans_np, columns=ohe.get_feature_names_out(), index=X_train.index)
X_test_trans_np = ohe.transform(X_test[['host_is_superhost','neighbourhood_group_cleansed','room_type']])
X_test_trans_df = pd.DataFrame(X_test_trans_np, columns=ohe.get_feature_names_out(), index=X_test.index)

# Saving Encoder

In [6]:
with open("../encoders/one_hot_encoder.pkl", "wb") as file:
    pickle.dump(ohe, file)

# Combining data after encoding

In [7]:
X_train_num=X_train.select_dtypes('number')

X_test_num=X_test.select_dtypes('number')

X_train_full=pd.concat([X_train_trans_df,X_train_num],axis=1)
X_test_full=pd.concat([X_test_trans_df,X_test_num],axis=1)

# Scaling

In [8]:
std_scaler = StandardScaler()
std_scaler.fit(X_train_full)


0,1,2
,copy,True
,with_mean,True
,with_std,True


# Saving scaler

In [9]:
with open("../scalers/standard_scaler.pkl", "wb") as file:
    pickle.dump(std_scaler, file)

In [10]:
X_train_full_np = std_scaler.transform(X_train_full)
X_test_full_np  = std_scaler.transform(X_test_full)

X_train_full_np_df = pd.DataFrame(X_train_full_np, columns=X_train_full.columns, index=X_train_full.index)
X_test_full_np_df  = pd.DataFrame(X_test_full_np, columns=X_test_full.columns, index=X_test_full.index)

# Hyperparameter Tuning

# Grid Search

In [11]:


# Hyperparameter grid
parameter_grid = {
    "n_neighbors": [3,5,10,20],
    "weights": ['uniform', 'distance'],
    "leaf_size": [20,30,40],
    "p":[1,2],
    "metric":['minkowski','manhattan','euclidean']
}

# Model
kr = KNeighborsRegressor()

# GridSearchCV setup
confidence_level = 0.95
folds = 5
gs = GridSearchCV(kr, param_grid=parameter_grid, cv=folds, verbose=1, scoring='r2', n_jobs=-1)

# Fit model
start_time = time.time()
gs.fit(X_train_full_np_df, y_train)
end_time = time.time()

# Results
print(f"\nTime taken: {end_time - start_time:.2f} seconds")
print("Best hyperparameters:", gs.best_params_)
print(f"Cross-validated R2 score: {gs.best_score_:.4f}")

# Confidence interval
results_df = pd.DataFrame(gs.cv_results_).sort_values(by="mean_test_score", ascending=False)
mean_score = results_df.iloc[0]["mean_test_score"]
std_score = results_df.iloc[0]["std_test_score"]
sem = std_score / np.sqrt(folds)
tc = st.t.ppf(1 - ((1 - confidence_level) / 2), df=folds - 1)
lower_bound = mean_score - tc * sem
upper_bound = mean_score + tc * sem

print(f"95% Confidence Interval for R2: ({lower_bound:.4f}, {mean_score:.4f}, {upper_bound:.4f})")

# Final model evaluation
best_model = gs.best_estimator_
y_pred = best_model.predict(X_test_full_np_df)

print("\n Test Evaluation:")
print(f"MAE : {mean_absolute_error(y_test, y_pred):.4f}")
print(f"MSE : {mean_squared_error(y_test, y_pred):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
print(f"R2  : {r2_score(y_test, y_pred):.4f}")
print(f"MAPE  : {mean_absolute_percentage_error(y_test, y_pred) * 100:.4f}")

Fitting 5 folds for each of 144 candidates, totalling 720 fits

Time taken: 58.73 seconds
Best hyperparameters: {'leaf_size': 20, 'metric': 'minkowski', 'n_neighbors': 10, 'p': 1, 'weights': 'distance'}
Cross-validated R2 score: 0.7603
95% Confidence Interval for R2: (0.7464, 0.7603, 0.7742)

 Test Evaluation:
MAE : 0.2878
MSE : 0.1783
RMSE: 0.4222
R2  : 0.7822
MAPE  : 5.6966


# Random Search

In [13]:
# Define parameters
parameter_distributions = {
    "n_neighbors":randint(1,50),
    "weights": ['uniform', 'distance'],
    "p":[1,2],
    "leaf_size":randint(10,100),
    "algorithm":['auto','ball_tree','kd_tree','brute']
    
}

# Model instance
kr1 =  KNeighborsRegressor()

# Setup RandomizedSearchCV
confidence_level = 0.95
folds = 5
n_iter = 30

rs = RandomizedSearchCV(
    kr,
    param_distributions=parameter_distributions,
    n_iter=n_iter,
    cv=folds,
    verbose=10,
    random_state=42,
    n_jobs=-1,
    scoring="r2"
)

# Fit the model
start_time = time.time()
rs.fit(X_train_full_np_df, y_train)
end_time = time.time()

# Display results
print("\n")
print(f"Time taken to find the best combination of hyperparameters: {end_time - start_time:.4f} seconds\n")
print("Best hyperparameters:", rs.best_params_)
print(f"Best cross-validated R2: {rs.best_score_:.4f}")

# Confidence interval calculation
results_rs_df = pd.DataFrame(rs.cv_results_).sort_values(by="mean_test_score", ascending=False)
rs_mean_score = results_rs_df.iloc[0]["mean_test_score"]
rs_std_score = results_rs_df.iloc[0]["std_test_score"]
rs_sem = rs_std_score / np.sqrt(folds)

rs_tc = st.t.ppf(1 - ((1 - confidence_level) / 2), df=folds - 1)
rs_lower_bound = rs_mean_score - rs_tc * rs_sem
rs_upper_bound = rs_mean_score + rs_tc * rs_sem

print(f"R2 confidence interval for the best combination: ({rs_lower_bound:.4f}, {rs_mean_score:.4f}, {rs_upper_bound:.4f})")

# Evaluate on test set
best_model = rs.best_estimator_
y_pred_test = best_model.predict(X_test_full_np_df)

print("\n Test Set Evaluation:")
print(f"Test MAE : {mean_absolute_error(y_test, y_pred_test):.4f}")
print(f"Test MSE : {mean_squared_error(y_test, y_pred_test):.4f}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_test)):.4f}")
print(f"Test R2  : {r2_score(y_test, y_pred_test):.4f}")
print(f"MAPE  : {mean_absolute_percentage_error(y_test, y_pred) * 100:.4f}")

Fitting 5 folds for each of 30 candidates, totalling 150 fits


Time taken to find the best combination of hyperparameters: 33.8322 seconds

Best hyperparameters: {'algorithm': 'kd_tree', 'leaf_size': 96, 'n_neighbors': 11, 'p': 1, 'weights': 'distance'}
Best cross-validated R2: 0.7599
R2 confidence interval for the best combination: (0.7464, 0.7599, 0.7734)

 Test Set Evaluation:
Test MAE : 0.2883
Test MSE : 0.1787
Test RMSE: 0.4227
Test R2  : 0.7817
MAPE  : 5.6966


# Bayesian Search

In [14]:

# Objective function for Bayesian search
def objective(trial, confidence_level, folds):
    # Suggest hyperparameters
    n_neighbors = trial.suggest_int("n_neighbors", 1, 50)
    weights = trial.suggest_categorical("weights",['uniform','distance'])
    p = trial.suggest_int("p", 1, 2)
    leaf_size = trial.suggest_int("leaf_size", 10, 100)
    algorithm = trial.suggest_categorical("agorithm", ['auto','ball_tree','kd_tree','brute'])

    # Initialize model
    kr2 = KNeighborsRegressor(
        n_neighbors=n_neighbors,
        weights=weights,
        p=p,
        leaf_size=leaf_size,
        algorithm=algorithm
    )

    # Cross-validation
    scores = cross_val_score(kr2, X_train_full_np_df, y_train, cv=folds, scoring='r2')
    mean_score = np.mean(scores)
    sem = np.std(scores, ddof=1) / np.sqrt(folds)

    # Confidence interval
    tc = st.t.ppf(1 - ((1 - confidence_level) / 2), df=folds - 1)
    lower_bound = mean_score - (tc * sem)
    upper_bound = mean_score + (tc * sem)

    # Store confidence interval in trial
    trial.set_user_attr("CV_score_summary", [
        round(lower_bound, 4),
        round(mean_score, 4),
        round(upper_bound, 4)
    ])

    return mean_score
confidence_level = 0.95
folds = 5

start_time = time.time()
study = optuna.create_study(direction="maximize")  # Maximize R²
study.optimize(lambda trial: objective(trial, confidence_level, folds), n_trials=45)
end_time = time.time()

# Print results
print("\n")
print(f"Time taken: {end_time - start_time: .4f} seconds")
print("\nBest hyperparameters found:", study.best_params)
print(f" Best R² found: {study.best_value: .4f}")

# Extract best trial confidence interval
results = sorted([
    (
        index,
        trial.user_attrs['CV_score_summary'][0],
        trial.user_attrs['CV_score_summary'][1],
        trial.user_attrs['CV_score_summary'][2]
    )
    for index, trial in enumerate(study.trials)
], key=lambda x: x[2], reverse=True)

print(f" R² confidence interval of best trial: {results[0][1:]}")

[I 2025-08-07 17:55:23,435] A new study created in memory with name: no-name-8a552834-64f8-46a5-8bbc-f89f675223db
[I 2025-08-07 17:55:24,408] Trial 0 finished with value: 0.7596222787422835 and parameters: {'n_neighbors': 12, 'weights': 'distance', 'p': 1, 'leaf_size': 25, 'agorithm': 'brute'}. Best is trial 0 with value: 0.7596222787422835.
[I 2025-08-07 17:55:29,143] Trial 1 finished with value: 0.7394211406105685 and parameters: {'n_neighbors': 41, 'weights': 'distance', 'p': 1, 'leaf_size': 79, 'agorithm': 'ball_tree'}. Best is trial 0 with value: 0.7596222787422835.
[I 2025-08-07 17:55:35,364] Trial 2 finished with value: 0.717093166073474 and parameters: {'n_neighbors': 42, 'weights': 'distance', 'p': 2, 'leaf_size': 24, 'agorithm': 'kd_tree'}. Best is trial 0 with value: 0.7596222787422835.
[I 2025-08-07 17:55:42,590] Trial 3 finished with value: 0.7115745092177649 and parameters: {'n_neighbors': 36, 'weights': 'uniform', 'p': 1, 'leaf_size': 23, 'agorithm': 'kd_tree'}. Best is 



Time taken:  91.5593 seconds

Best hyperparameters found: {'n_neighbors': 9, 'weights': 'distance', 'p': 1, 'leaf_size': 36, 'agorithm': 'brute'}
 Best R² found:  0.7610
 R² confidence interval of best trial: (np.float64(0.7454), np.float64(0.761), np.float64(0.7767))


# KNN Regressor

In [16]:
knn = KNeighborsRegressor(n_neighbors=11,algorithm='kd_tree',leaf_size=96,p=1,weights='distance',metric='minkowski')
knn.fit(X_train_full_np_df, y_train)

0,1,2
,n_neighbors,11
,weights,'distance'
,algorithm,'kd_tree'
,leaf_size,96
,p,1
,metric,'minkowski'
,metric_params,
,n_jobs,


In [17]:
with open("../models/knn.pkl", "wb") as file:
    pickle.dump(knn, file)

In [18]:

# Predict on test set
y_pred = knn.predict(X_test_full_np_df)

# Evaluate performance
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_test, y_pred) * 100
# Print metrics
print(f"The R² of the model on the TEST set is: {r2: .2f}")
print(f"Mean Absolute Error (MAE): {mae: .2f}")
print(f"Mean Squared Error (MSE): {mse: .2f}")
print(f"Root Mean Squared Error (RMSE): {rmse: .2f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape: .2f}")

The R² of the model on the TEST set is:  0.78
Mean Absolute Error (MAE):  0.29
Mean Squared Error (MSE):  0.18
Root Mean Squared Error (RMSE):  0.42
Mean Absolute Percentage Error (MAPE):  5.70
