In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (StandardScaler,
    LabelEncoder, OneHotEncoder)

data_dir = '../data'
filename = 'AB_NYC_2019.csv'
data_path = os.path.join(data_dir, filename)

df = pd.read_csv(data_path)

#### Data preparation

In [2]:
df['last_review'] = pd.to_datetime(df['last_review'])

In [3]:
no_hostname = df[df['host_name'].isnull()]
df.drop(index=no_hostname.index, inplace=True)

In [4]:
no_name = df[df['name'].isnull()]
df.drop(index=no_name.index, inplace=True)

In [5]:
no_info_cond = df['number_of_reviews'] == 0 & \
               df['last_review'].isnull() & \
               df['reviews_per_month'].isnull()
no_info_sample = df[no_info_cond]

df.drop(index=no_info_sample.index, inplace=True)

df.index = range(len(df))

In [6]:
def extract_date(data):
    return data.dt.year, data.dt.month, data.dt.day

year, month, day = extract_date(df['last_review'])

df[['review_year', 'review_month',
    'review_day']] = pd.DataFrame({'year': year,
                        'month': month,
                        'day': day})

df.drop(columns='last_review', inplace=True)

In [7]:
# FE
df['year_available'] = df['availability_365'] == 365
df['review_period'] = df['number_of_reviews'] / df['reviews_per_month']
df['is_rare_type'] = df['room_type'] == 'Shared room'
df['reviews_per_host'] = df['number_of_reviews'] / df['calculated_host_listings_count']
df['min_available'] = df['minimum_nights'] * df['availability_365']

In [8]:
target = df['price']
df.drop(columns=['price'], inplace=True)

target.replace(to_replace=0, value=target.mean(), inplace=True)
target = np.log(target)

In [9]:
%%time
to_encode = ['room_type', 'neighbourhood_group']

for col in to_encode:
    le = LabelEncoder()
    ohe = OneHotEncoder(categories='auto')
    labeled = le.fit_transform(df[col])
    labeled = labeled.reshape(len(labeled), 1)
    encoded = ohe.fit_transform(labeled).toarray()
    encoded_df = pd.DataFrame(
        encoded, columns=['is_'+cat for cat in le.classes_]
    )
    df = df.join(encoded_df)

df.drop(columns=to_encode, inplace=True)

Wall time: 205 ms


In [10]:
%%time
# Useless unique identifier
df.drop(columns='id', inplace=True)
# Have to make specific FE for sentences
df.drop(columns='name', inplace=True)

to_label = ['host_name', 'neighbourhood',
            'year_available', 'is_rare_type']

for col in to_label:
    le = LabelEncoder()
    labeled = le.fit_transform(df[col])
    df[col+'_label'] = labeled

df.drop(columns=to_label, inplace=True)

Wall time: 170 ms


In [11]:
num_df = df.select_dtypes(include=np.number)

scaler = StandardScaler()
scaled = scaler.fit_transform(df[num_df.columns])
df[num_df.columns] = pd.DataFrame(scaled)

In [12]:
# Train-test split
x, x_test, y, y_test = train_test_split(
    df, target, test_size=0.2, random_state=0, shuffle=True
)
# Train-val split
x_train, x_val, y_train, y_val = train_test_split(
    x, y, train_size=0.8, random_state=0, shuffle=True
)

In [18]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import relu, sigmoid
from tensorflow.keras.metrics import mean_absolute_error
from tensorflow.keras.losses import mean_squared_error
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.models import Sequential

optimizer = SGD(learning_rate=0.01)
optimizer = Adam(0.1)

model = Sequential([
    Dense(40, activation=relu, input_dim=df.shape[1]),
    Dense(50, activation=relu),
    Dense(50, activation=relu),
    Dense(20, activation=relu),
    Dense(1, activation=sigmoid)
])

In [19]:
model.compile(optimizer=optimizer, loss=mean_squared_error,
              metrics=[mean_absolute_error])

In [20]:
batch_size = 64
epochs = 10
history = model.fit(x_train, y_train,
                    batch_size=batch_size, epochs=epochs)

Train on 24844 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
train_mse, train_mae = model.evaluate(x_train, y_train,
                                      batch_size=batch_size)

y_pred_val = model.predict(x_val, batch_size=batch_size)
y_pred = model.predict(x_test, batch_size=batch_size)

val_mse = mean_squared_error(y_val, y_pred_val)
test_mse = mean_squared_error(y_test, y_pred)

print('Train MSE: {:.4f}\nValidation MSE: {:.4f}\nTest MSE: {:.4f}'.format(
    train_mse, val_mse.numpy().mean(), test_mse.numpy().mean()
))

Train MSE: 14.1318
Validation MSE: 14.0803
Test MSE: 14.0697
