In [20]:
import os
import numpy as np
import pandas as pd

# Location of the raw listings file, relative to the notebook.
data_dir, filename = '../data', 'AB_NYC_2019.csv'
data_path = os.path.join(data_dir, filename)

# Read the whole CSV into the working frame.
df = pd.read_csv(data_path)

In [21]:
# Parse the review dates so the `.dt` accessor can be used on them later.
parsed_last_review = pd.to_datetime(df['last_review'])
df['last_review'] = parsed_last_review

In [22]:
# Rows whose host name is missing are removed from the working frame.
no_hostname = df.loc[df['host_name'].isna()]
df.drop(index=no_hostname.index, inplace=True)

In [23]:
# Rows whose listing name is missing are removed from the working frame.
no_name = df.loc[df['name'].isna()]
df.drop(index=no_name.index, inplace=True)

In [24]:
# About 10k observations (almost a quarter of the full dataset!)
# carry no useful review information.
#
# Every operand is parenthesised on purpose: `&` binds tighter than `==`,
# so the unparenthesised form is parsed as
# `number_of_reviews == (0 & ... & ...)` and the two null checks never act
# as filters of their own.
no_info_cond = (
    (df['number_of_reviews'] == 0)
    & df['last_review'].isnull()
    & df['reviews_per_month'].isnull()
)
no_info_sample = df[no_info_cond]

df.drop(index=no_info_sample.index, inplace=True)

# Renumber the rows 0..n-1; later cells join and assign positionally built
# frames and therefore rely on this contiguous index.
df.index = range(len(df))

In [25]:
# Group the listings by their room type.
by_room = df.groupby('room_type')

def avg_per_sample(group):
    """Return the sum of ``group`` divided by its ``count()``.

    For pandas objects ``count()`` is the number of non-null entries, so
    missing values contribute to neither the numerator nor the denominator.
    """
    total = group.sum()
    n_present = group.count()
    return total / n_present

# Broadcast each room type's per-column average back onto its rows.
room_avg_cols = ['number_of_reviews', 'minimum_nights', 'reviews_per_month']
transf = by_room[room_avg_cols].transform(avg_per_sample)

In [26]:
# Numeric columns of the frame as it stands at this point.
num_df = df.select_dtypes(include=np.number)
# Absolute distance of every value from the mean of its own column.
diff_mean_df = (num_df - num_df.mean()).abs()

In [27]:
def extract_date(data):
    """Split a datetime-like Series into its calendar parts.

    Returns a ``(year, month, day)`` tuple read from ``data.dt``.
    """
    parts = data.dt
    return (parts.year, parts.month, parts.day)

year, month, day = extract_date(df['last_review'])

# Store each calendar part of the last review date as its own column.
df['review_year'] = year
df['review_month'] = month
df['review_day'] = day

# The raw timestamp is no longer needed once it has been decomposed.
del df['last_review']

In [28]:
by_nhgroup = df.groupby(by='neighbourhood_group')

# FE (1 part)
# Per-room-type averages computed earlier in `transf`.
df[['avg_reviews', 'avg_min_nights',
    'avg_reviews_per_month']] = transf


def abs_diff_from_group_mean(group):
    """Return the absolute deviation of each value from its group's mean."""
    return (group - group.mean()).abs()


# `transform` (rather than `apply`) returns a result indexed like `df`, so the
# column assignment below aligns row by row however the installed pandas
# treats group keys in `apply`.  The helper has its own name, so `diff_avg`
# is only ever the resulting frame and the cell can be re-run safely.
diff_avg = by_nhgroup[
    ['number_of_reviews', 'availability_365']
].transform(abs_diff_from_group_mean)
# NOTE: the columns keep their `diff_med_*` names although the reference
# value is the group mean, not the median.
df[['diff_med_reviews', 'diff_med_availability']] = diff_avg

# Flag listings that are available all 365 days.
df['year_available'] = df['availability_365'] == 365

# Absolute distance from the column-wide mean, taken from `diff_mean_df`.
df['host_count_mean_diff'] = diff_mean_df['calculated_host_listings_count']
df['minimum_nights_mean_diff'] = diff_mean_df['minimum_nights']

# Total reviews divided by the monthly review rate.
df['review_period'] = df['number_of_reviews'] / df['reviews_per_month']

In [29]:
# FE (2 part)
# The triple-quoted text below is a bare string expression holding disabled
# code; it is not executed.
"""
by_hostname = df.groupby(by='host_name')

def diff_median(group):
    return abs(group - group.median())

diff_med_reviews = by_hostname[
    ['number_of_reviews', 'reviews_per_month']
].apply(diff_median)
df[['reviews_by_hostname',
    'reviews_per_month_by_hostname']] = diff_med_reviews
"""

# Flag listings whose room type is 'Shared room'.
df['is_rare_type'] = df['room_type'].eq('Shared room')

# Review count relative to the number of listings the host has.
df['reviews_per_host'] = df['number_of_reviews'].div(
    df['calculated_host_listings_count']
)

# Product of the minimum stay and the yearly availability.
df['min_available'] = df['minimum_nights'].mul(df['availability_365'])

In [30]:
# Take the price column out of the feature frame; it becomes the target.
target = df.pop('price')

# Zero prices are replaced by the mean price so that the logarithm below
# is defined for every row.
target = target.replace(to_replace=0, value=target.mean())
target = np.log(target)

In [31]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import (LabelEncoder, OneHotEncoder,
                                   StandardScaler, KBinsDiscretizer)
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import (mean_squared_error, r2_score, precision_score,
                             accuracy_score, confusion_matrix, recall_score)

# Shuffled 5-fold splitter with a fixed seed.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

# Linear models.
lin_reg = LinearRegression(n_jobs=2)
log_reg = LogisticRegression(random_state=0, n_jobs=2)

# Neighbour-based and tree-based regressors; seeds are fixed where the
# estimator accepts one.
knn_reg = KNeighborsRegressor(n_neighbors=50, leaf_size=40, n_jobs=2)
tree_reg = DecisionTreeRegressor(random_state=0)
rf_reg = RandomForestRegressor(random_state=0, n_jobs=2)
gbm_reg = GradientBoostingRegressor(random_state=0)

In [32]:
%time
# For linear models
scaler = StandardScaler()
scaled = scaler.fit_transform(df[num_df.columns])
df[num_df.columns] = pd.DataFrame(scaled)

Wall time: 0 ns


In [33]:
%%time
to_encode = ['room_type', 'neighbourhood_group']

# One indicator column per category, named 'is_<category>'.
for col in to_encode:
    le = LabelEncoder()
    ohe = OneHotEncoder(categories='auto')
    # Integer codes reshaped into a single-column 2-D array for the encoder.
    labeled = le.fit_transform(df[col]).reshape(-1, 1)
    encoded = ohe.fit_transform(labeled).toarray()
    indicator_names = ['is_' + cat for cat in le.classes_]
    encoded_df = pd.DataFrame(encoded, columns=indicator_names)
    # `join` aligns on the index of `df`.
    df = df.join(encoded_df)

# The original categorical columns are replaced by their indicators.
df.drop(columns=to_encode, inplace=True)

Wall time: 160 ms


In [34]:
%%time
# `id` is a unique identifier with no use as a feature, and `name` is free
# text that would need its own feature engineering; both are removed.
df.drop(columns=['id', 'name'], inplace=True)

to_label = ['host_name', 'neighbourhood']

# Replace each remaining text column with an integer-coded '<col>_label'.
for col in to_label:
    le = LabelEncoder()
    df[col + '_label'] = le.fit_transform(df[col])

df.drop(columns=to_label, inplace=True)

Wall time: 122 ms


#### Logistic regression model

In [35]:
# Bin the continuous target into three ordinal classes.
disc = KBinsDiscretizer(n_bins=3, encode='ordinal')
# The target is passed as a single-column 2-D array and the binned result
# is flattened back to 1-D.
target_column = target.to_numpy().reshape(-1, 1)
disc_target = disc.fit_transform(target_column).ravel()

In [36]:
# Hold out 20% of the rows as the test set.
x, x_test, y, y_test = train_test_split(
    df, disc_target,
    test_size=0.2, shuffle=True, random_state=0,
)
# Split the remaining rows 80/20 into training and validation sets.
x_train, x_val, y_train, y_val = train_test_split(
    x, y,
    train_size=0.8, shuffle=True, random_state=0,
)
(x_train.shape, x_val.shape, x_test.shape)

((24844, 34), (6212, 34), (7765, 34))

In [37]:
# Fit the classifier on the training split only.
log_reg = log_reg.fit(x_train, y_train)

# Training metrics (precision and recall are macro-averaged over classes).
y_pred_train = log_reg.predict(x_train)
acc_train = accuracy_score(y_train, y_pred_train)
# Computed for inspection; it is not printed by this cell.
confusion_mat = confusion_matrix(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train, average='macro')
recall_train = recall_score(y_train, y_pred_train, average='macro')

# Validation metrics.
y_pred_val = log_reg.predict(x_val)
acc_val = accuracy_score(y_val, y_pred_val)
prec_val = precision_score(y_val, y_pred_val, average='macro')
recall_val = recall_score(y_val, y_pred_val, average='macro')

# Test metrics.
y_pred = log_reg.predict(x_test)
acc_test = accuracy_score(y_test, y_pred)
prec_test = precision_score(y_test, y_pred, average='macro')
recall_test = recall_score(y_test, y_pred, average='macro')

# All three report lines share one format; the validation line previously
# lacked the comma after the accuracy value.
print(f'Train: accuracy = {acc_train:.3f}, '
      f'precision = {prec_train:.3f}, recall = {recall_train:.3f}')
print(f'Validation: accuracy = {acc_val:.3f}, '
      f'precision = {prec_val:.3f}, recall = {recall_val:.3f}')
print(f'Test: accuracy = {acc_test:.3f}, '
      f'precision = {prec_test:.3f}, recall = {recall_test:.3f}')

Train: accuracy = 0.408, precision = 0.406, recall = 0.409
Validation: accuracy = 0.425 precision = 0.422, recall = 0.424
Test: accuracy = 0.402, precision = 0.400, recall = 0.404


#### Linear regression model

In [39]:
# Hold out 20% of the rows as the test set; the label is the continuous target.
x, x_test, y, y_test = train_test_split(
    df, target,
    test_size=0.2, shuffle=True, random_state=0,
)
# Split the remaining rows 80/20 into training and validation sets.
x_train, x_val, y_train, y_val = train_test_split(
    x, y,
    train_size=0.8, shuffle=True, random_state=0,
)

In [40]:
# Fit on the training split only.
lin_reg = lin_reg.fit(x_train, y_train)

# Predictions for each split.
y_pred_train = lin_reg.predict(x_train)
y_pred_val = lin_reg.predict(x_val)
y_pred = lin_reg.predict(x_test)

# Mean squared error and R2 for each split.
mse_train = mean_squared_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)
mse_val = mean_squared_error(y_val, y_pred_val)
r2_val = r2_score(y_val, y_pred_val)
mse = mean_squared_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)

print(f'Train: MSE = {mse_train:.5f}, R2 = {r2_train:.3f}')
print(f'Validation: MSE = {mse_val:.5f}, R2 = {r2_val:.3f}')
print(f'Test: MSE = {mse:.5f}, R2 = {r2_test:.3f}')

Train: MSE = 0.12670, R2 = 0.713
Validation: MSE = 0.14096, R2 = 0.688
Test: MSE = 0.12146, R2 = 0.716


#### K Nearest Neighbours model

In [41]:
# Fit on the training split only.
knn_reg = knn_reg.fit(x_train, y_train)

# Predictions for each split.
y_pred_train = knn_reg.predict(x_train)
y_pred_val = knn_reg.predict(x_val)
y_pred = knn_reg.predict(x_test)

# Mean squared error and R2 for each split.
mse_train = mean_squared_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)
mse_val = mean_squared_error(y_val, y_pred_val)
r2_val = r2_score(y_val, y_pred_val)
mse = mean_squared_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)

print(f'Train: MSE = {mse_train:.5f}, R2 = {r2_train:.3f}')
print(f'Validation: MSE = {mse_val:.5f}, R2 = {r2_val:.3f}')
print(f'Test: MSE = {mse:.5f}, R2 = {r2_test:.3f}')

Train: MSE = 0.40553, R2 = 0.081
Validation: MSE = 0.43193, R2 = 0.045
Test: MSE = 0.40989, R2 = 0.042


#### Decision tree model

In [42]:
# Fit on the training split only.
tree_reg = tree_reg.fit(x_train, y_train)

# Predictions for each split.
y_pred_train = tree_reg.predict(x_train)
y_pred_val = tree_reg.predict(x_val)
y_pred = tree_reg.predict(x_test)

# Mean squared error and R2 for each split.
mse_train = mean_squared_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)
mse_val = mean_squared_error(y_val, y_pred_val)
r2_val = r2_score(y_val, y_pred_val)
mse = mean_squared_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)

print(f'Train: MSE = {mse_train:.5f}, R2 = {r2_train:.3f}')
print(f'Validation: MSE = {mse_val:.5f}, R2 = {r2_val:.3f}')
print(f'Test: MSE = {mse:.5f}, R2 = {r2_test:.3f}')

Train: MSE = 0.00000, R2 = 1.000
Validation: MSE = 0.00001, R2 = 1.000
Test: MSE = 0.00001, R2 = 1.000


#### Random forest model

In [43]:
# Fit on the training split only.
rf_reg = rf_reg.fit(x_train, y_train)

# Predictions for each split.
y_pred_train = rf_reg.predict(x_train)
y_pred_val = rf_reg.predict(x_val)
y_pred = rf_reg.predict(x_test)

# Mean squared error and R2 for each split.
mse_train = mean_squared_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)
mse_val = mean_squared_error(y_val, y_pred_val)
r2_val = r2_score(y_val, y_pred_val)
mse = mean_squared_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)

print(f'Train: MSE = {mse_train:.5f}, R2 = {r2_train:.3f}')
print(f'Validation: MSE = {mse_val:.5f}, R2 = {r2_val:.3f}')
print(f'Test: MSE = {mse:.5f}, R2 = {r2_test:.3f}')

Train: MSE = 0.00000, R2 = 1.000
Validation: MSE = 0.00001, R2 = 1.000
Test: MSE = 0.00001, R2 = 1.000


#### Gradient boosting model

In [44]:
# Fit on the training split only.
gbm_reg = gbm_reg.fit(x_train, y_train)

# Predictions for each split.
y_pred_train = gbm_reg.predict(x_train)
y_pred_val = gbm_reg.predict(x_val)
y_pred = gbm_reg.predict(x_test)

# Mean squared error and R2 for each split.
mse_train = mean_squared_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)
mse_val = mean_squared_error(y_val, y_pred_val)
r2_val = r2_score(y_val, y_pred_val)
mse = mean_squared_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)

print(f'Train: MSE = {mse_train:.5f}, R2 = {r2_train:.3f}')
print(f'Validation: MSE = {mse_val:.5f}, R2 = {r2_val:.3f}')
print(f'Test: MSE = {mse:.5f}, R2 = {r2_test:.3f}')

Train: MSE = 0.00002, R2 = 1.000
Validation: MSE = 0.00002, R2 = 1.000
Test: MSE = 0.00002, R2 = 1.000
