In [1]:
#!pip install scikit-misc

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error
from skmisc.loess import loess, loess_prediction

import warnings
warnings.filterwarnings('ignore')

from IPython.display import display
pd.options.display.max_columns = None

In [3]:
HOME = Path().resolve().parent.parent
data_dir = str(HOME) + '/data/processed/'

In [4]:
listings = pd.read_csv(data_dir + 'listings_transformed.csv.gz')
listings.shape

(458795, 25)

In [5]:
listings.head()

Unnamed: 0,availability_60,CasosCovMesAnt,neighbourhood_group_cleansed,longitude,beds,id,bedrooms,accommodates,room_type,number_of_reviews,CasosCovMes,file_date,price,availability_90,amenities,latitude,availability_30,availability_365,review_scores_rating,bathrooms,reviews_per_month,property_type,month,year,is_high_season
0,0,0.0,Sant Martí,2.185545,4,18666,2,6,Entire home/apt,1,0.0,1901,130.0,0,18,41.408886,0,6,80.0,1,0.03,Apartment,1,2019,0
1,27,0.0,Eixample,2.173058,6,18674,3,8,Entire home/apt,5,0.0,1901,60.0,55,22,41.404197,12,326,85.0,2,0.07,Apartment,1,2019,0
2,23,0.0,Horta-Guinardó,2.170701,2,31377,1,2,Private room,4,0.0,1901,42.0,30,26,41.410969,14,184,95.0,1,0.09,Apartment,1,2019,0
3,38,0.0,Horta-Guinardó,2.170819,2,31380,1,3,Private room,39,0.0,1901,53.0,47,26,41.4109,24,204,87.0,1,0.9,Apartment,1,2019,0
4,52,0.0,Gràcia,2.159376,1,31958,1,4,Entire home/apt,151,0.0,1901,60.0,80,10,41.409498,25,342,91.0,1,1.57,Apartment,1,2019,0


### 1. Preparing the dataset to make predictions

In [30]:
useful_features = ['year', 'month', 'CasosCovMes', 'CasosCovMesAnt', 
                  'is_high_season', 'availability_30', 'availability_60']

In [31]:
df = listings[useful_features].groupby(['year', 'month'], as_index=False).mean()

timeline = df['year']*12 + df['month']
df.insert(2, 'monthly_timeline', timeline)

### 3. Preprocessing data

In [32]:
# This step takes some minutes to finish.

df.loc['av60_lag1'] = df['availability_60'].shift(1)
df.loc['av60_lag2'] = df['availability_60'].shift(2)

In [33]:
df2 = df.dropna().copy()

In [34]:
df2.shape

(28, 8)

### 2. Split train and test set

In [37]:
train = df2[(df2['monthly_timeline'] >= 24243) & (df2['monthly_timeline'] <= 24255)].copy()
test = df2[df2['monthly_timeline'] > 24255].copy()

In [46]:
min_timeline = train['monthly_timeline'].min()
X_train, y_train = train['monthly_timeline'].map(lambda x: x - min_timeline), train['availability_60'].map(lambda x: np.log10(x) if x > 0 else 0)
X_test, y_test = test['monthly_timeline'].map(lambda x: x - min_timeline), test['availability_60'].map(lambda x: np.log10(x) if x > 0 else 0)

### 3. Standardization and Normalization

In [16]:
transformer = RobustScaler()
#X_train_norm = transformer.fit_transform(X_train)
#X_test_norm = transformer.transform(X_test)

### 4. Train Loess model

In [49]:
lo = loess(X_train, y_train, span=0.1, p=1, surface='direct', degree=2)

In [50]:
lo.fit()

ValueError: b'svddc failed in l2fit.'

### 5. Predict and validate results

In [None]:
# Predict

train_pred = xgb_model.predict(X_train_norm)
test_pred = xgb_model.predict(X_test_norm)

In [None]:
# Compute RMSE

train_rmse = mean_squared_error(y_train, train_pred)
test_rmse = mean_squared_error(y_test, test_pred)

In [None]:
print(f'The Train RMSE is {train_rmse:.2f}\nThe Test RMSE is {test_rmse:.2f}')

Clearly, this approach has overfitting! We must try to reduce it...

### 6. Variable importance for the model

In [None]:
# Producing a dataframe of feature importances
features_weights = pd.DataFrame(xgb_model.feature_importances_, columns=['weight'], index=X_train.columns)
features_weights.sort_values('weight', inplace=True)

# Plotting feature importances
plt.figure(figsize=(8,5))
plt.barh(features_weights.index, features_weights.weight, align='center') 
plt.title("Feature importances in the XGBoost model", fontsize=14)
plt.xlabel("Feature importance")
plt.margins(y=0.01)
plt.show()