In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from geopy.distance import geodesic
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from category_encoders import TargetEncoder



# Sanity marker: this line is only reached if every import above succeeded.
print("Done!")

In [None]:
# Load the listings CSV into `data`, the frame every later cell works on.
# NOTE(review): the path is absolute and machine-specific; point it at your
# local copy of AB_NYC_2019.csv before running.
dataset = "D:/CDEK/AB_NYC_2019.csv"
data = pd.read_csv(dataset)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() only adds a stray "None" line.
data.info()
print(data.describe())
print(data.isnull().sum())

# Distribution of the target column.
plt.figure(figsize=(10, 4))
sns.histplot(data['price'], bins=500, kde=True)
plt.title('Price distr')
plt.show()  # was `plt.show` (no call), which is a no-op expression

# Listings plotted by coordinates, coloured by price.
plt.figure(figsize=(12, 6))
sns.scatterplot(x='longitude', y='latitude', hue='price', data=data, palette='coolwarm', alpha=0.5)
plt.title("Geo price distr")
plt.show()

In [None]:
# --- Missing values -------------------------------------------------------
# Results are assigned back to the column. Calling fillna(inplace=True) on a
# column selected via data[...] is chained assignment: pandas warns about it
# and, with Copy-on-Write enabled, it no longer updates `data` at all.
data['name'] = data['name'].fillna('Unknown')
data['host_name'] = data['host_name'].fillna('Unknown')
data['last_review'] = data['last_review'].fillna('2000-01-01')  # for test preprocessing
data['reviews_per_month'] = data['reviews_per_month'].fillna(0)

# --- Derived features -----------------------------------------------------
data['last_review'] = pd.to_datetime(data['last_review'])
data['last_review_year'] = data['last_review'].dt.year
# +1 in the denominator avoids division by zero for listings without reviews.
data['price_per_review'] = data['price'] / (data['number_of_reviews'] + 1)

# Geodesic distance (km) from each listing to a fixed reference point.
# This must run before the scaling step below, which overwrites the raw
# latitude/longitude values with standardized ones.
TIMES_SQUARE = (40.7589, -73.9851)
data['distance_to_center'] = data.apply(lambda row: geodesic((row['latitude'], row['longitude']), TIMES_SQUARE).km, axis = 1)

# --- Categorical columns -> integer codes ----------------------------------
# The same encoder object is re-fitted for every column, so after this cell it
# only holds the classes of the last column fitted ('room_type').
label_encoder = LabelEncoder()
data['neighbourhood'] = label_encoder.fit_transform(data['neighbourhood'])
data['neighbourhood_group'] = label_encoder.fit_transform(data['neighbourhood_group'])
data['room_type'] = label_encoder.fit_transform(data['room_type'])

# --- Standardize numeric columns -------------------------------------------
# NOTE(review): the scaler is fitted on the full frame, i.e. before any
# train/test split, so its statistics include rows later used for testing.
scaler = StandardScaler()
numeric_columns = ['latitude', 'longitude', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365']
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

# info() prints its own report and returns None, so it is not wrapped in print().
data.info()
print(data.head())

In [None]:
# Pairwise Pearson correlations, restricted to the numeric columns of `data`.
numeric_data = data.select_dtypes(include="number")
correlation_matrix = numeric_data.corr(method="pearson")

# Annotated heatmap of the full correlation matrix.
plt.figure(figsize=(15, 15))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".4f",
    cmap="coolwarm",
    cbar=True,
)
plt.show()


In [None]:
# 'price_per_review' is computed from 'price' (the prediction target), so
# keeping it would leak the target into the features; 'id' is removed as well.
columns_to_drop = ['price_per_review', 'id']

# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: a second execution is a no-op rather than a
# KeyError on the already-removed columns.
data = data.drop(columns=columns_to_drop, errors='ignore')

# info() prints its own report and returns None, so it is not wrapped in print().
data.info()
print(data.head())

In [None]:
# Feature matrix / target. 'last_review' is a datetime column that the
# regressors cannot consume; its year is already available as the numeric
# 'last_review_year', so the raw column is left out together with the target.
X = data.drop(columns=['price', 'last_review'])
y = data['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Work on explicit copies so the column assignments below modify independent
# frames rather than objects derived from `X`.
X_train = X_train.copy()
X_test = X_test.copy()

# Both remaining free-text columns are target-encoded. Previously only 'name'
# was encoded, leaving 'host_name' as strings in the feature matrix.
# The encoder is fitted on the training split only and then applied to the
# test split, so test targets never influence the encoding.
text_columns = ['name', 'host_name']
encoder = TargetEncoder(cols=text_columns)
X_train[text_columns] = encoder.fit_transform(X_train[text_columns], y_train)
X_test[text_columns] = encoder.transform(X_test[text_columns])

print(X_train.dtypes)
print(X_test.dtypes)

In [None]:
# Baseline: a depth-2 decision tree, fitted on the training split and scored
# on the test split.
model_dt = DecisionTreeRegressor(random_state=42, max_depth=2)
model_dt.fit(X_train, y_train)
y_pred_dt = model_dt.predict(X_test)

# MAE first, then R², both against the test targets.
mae_dt, r2_dt = (
    score(y_test, y_pred_dt) for score in (mean_absolute_error, r2_score)
)

print("DecisionTreeRegressor:", f"MAE: {mae_dt:.4f}", f"R²: {r2_dt:.4f}", sep="\n")

In [None]:
# Random forest: 1000 trees, each limited to depth 20.
model_rf = RandomForestRegressor(
    n_estimators=1000,
    max_depth=20,
    random_state=42,
)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

# Test-split metrics.
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

report_lines = ["RandomForestRegressor:", f"MAE: {mae_rf:.2f}", f"R²: {r2_rf:.2f}"]
print("\n".join(report_lines))

In [None]:
# XGBoost: 5000 boosting rounds at a learning rate of 0.001, trees up to depth 10.
model_xgb = XGBRegressor(
    n_estimators=5000,
    learning_rate=0.001,
    max_depth=10,
    random_state=42,
)
model_xgb.fit(X_train, y_train)

# Predict on the test split and score the predictions.
y_pred_xgb = model_xgb.predict(X_test)
mae_xgb, r2_xgb = (
    score(y_test, y_pred_xgb) for score in (mean_absolute_error, r2_score)
)

print("XGBRegressor:", f"MAE: {mae_xgb:.2f}", f"R²: {r2_xgb:.2f}", sep="\n")

In [None]:
# LightGBM: 100 boosting rounds of depth-2 trees at a learning rate of 0.1.
model_lgbm = LGBMRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=2,
    random_state=42,
)
model_lgbm.fit(X_train, y_train)
y_pred_lgbm = model_lgbm.predict(X_test)

# Test-split metrics.
mae_lgbm = mean_absolute_error(y_test, y_pred_lgbm)
r2_lgbm = r2_score(y_test, y_pred_lgbm)

report_lines = ["LGBMRegressor:", f"MAE: {mae_lgbm:.2f}", f"R²: {r2_lgbm:.2f}"]
print("\n".join(report_lines))

In [None]:
# CatBoost: 1000 iterations of depth-5 trees at a learning rate of 0.1.
model_cb = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=5,
    random_state=42,
    verbose=100,
)
model_cb.fit(X_train, y_train)

# Predict on the test split and score the predictions.
y_pred_cb = model_cb.predict(X_test)
mae_cb, r2_cb = (
    score(y_test, y_pred_cb) for score in (mean_absolute_error, r2_score)
)

print("CatBoostRegressor:", f"MAE: {mae_cb:.2f}", f"R²: {r2_cb:.2f}", sep="\n")