In [None]:
import dask.dataframe as dd
import pandas as pd
import matplotlib.pyplot as plt
# import hvplot.dask
import numpy as np
import dask.array as da
import pickle
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [None]:
# Notebook display settings: render every column and up to 200 rows of a frame.
pd.options.display.max_columns = None
pd.options.display.max_rows = 200

In [None]:
# Load the training set; the path is relative to the notebook's working directory.
TRAINING_CSV = "data/training_set_VU_DM.csv"
df = pd.read_csv(TRAINING_CSV)

In [None]:
# 1. Per-hotel conversion ratios, merged back onto every row of that hotel:
#    prob_clickbook   = bookings / clicks
#    prob_searchbook  = bookings / number of rows in df for the hotel
#    prob_searchclick = clicks   / number of rows in df for the hotel

# Only the two outcome columns are needed, so aggregate just those instead of
# summing every numeric column of the frame.
grouped_hotels = df.groupby("prop_id")[["booking_bool", "click_bool"]].sum()
nr_hotel_found = df["prop_id"].value_counts()
temp = pd.DataFrame()
temp["prob_clickbook"] = grouped_hotels["booking_bool"] / grouped_hotels["click_bool"]
temp["prob_searchbook"] = grouped_hotels["booking_bool"] / nr_hotel_found
temp["prob_searchclick"] = grouped_hotels["click_bool"] / nr_hotel_found

# A hotel with zero clicks and zero bookings yields 0/0 = NaN; store 0 instead.
temp = temp.fillna(0).reset_index()
df = df.merge(temp, on="prop_id")
del temp

In [None]:
# 2. Percentile rank of each hotel within its country.

# Per (hotel, country) averages of every numeric column.
grouped_hotels = (
    df.groupby(["prop_id", "prop_country_id"])
    .mean(numeric_only=True)
    .reset_index()
)
# Score each hotel by the sum of its mean star rating and mean review score.
temp = grouped_hotels[["prop_id", "prop_country_id"]].assign(
    total_score=grouped_hotels["prop_starrating"] + grouped_hotels["prop_review_score"]
)
# Dense rank inside each country, expressed as a fraction (pct=True).
temp["country_pct_rank"] = (
    temp.groupby("prop_country_id")["total_score"]
    .rank(method="dense", ascending=True, pct=True)
)
df = df.merge(temp[["prop_id", "country_pct_rank"]], on="prop_id")
del temp

In [None]:
# 3. Competitors: mean competitor avail/ rate + if there is a better competitor columns.


def competitor_cols(temp:pd.DataFrame, col_names:list,rate_or_avail:str) -> pd.DataFrame:
    """
    Summarise a set of competitor columns into two per-row features.

    Parameters
    ----------
    temp : pd.DataFrame
        Frame that contains every column listed in ``col_names``. It is not
        modified.
    col_names : list
        Names of the competitor columns to summarise.
    rate_or_avail : str
        Suffix used in the output column names (e.g. ``"rate"`` or ``"inv"``).

    Returns
    -------
    pd.DataFrame
        Indexed like ``temp``, with exactly two columns:

        * ``comp_mean_<suffix>``   -- row-wise mean of ``col_names`` (NaNs are
          skipped; NaN when every value in the row is missing).
        * ``comp_better_<suffix>`` -- 1 if at least one of ``col_names`` holds a
          non-missing value ``>= 0`` in that row, else 0.

    NOTE(review): whether ``>= 0`` really means "a competitor is better"
    depends on how the dataset encodes these columns -- TODO confirm.
    """
    comp_values = temp[col_names]
    # NaN >= 0 evaluates to False, so missing competitor data never sets the flag.
    # The flag is derived from the competitor columns only; the mean column must
    # not take part in any(), otherwise every non-zero mean would set it to 1.
    has_hit = comp_values.ge(0).any(axis=1)
    return pd.DataFrame(
        {
            f"comp_mean_{rate_or_avail}": comp_values.mean(axis=1),
            f"comp_better_{rate_or_avail}": has_hit.astype(int),
        },
        index=temp.index,
    )

# Column names for the eight competitors (comp1 ... comp8).
inv_cols = [f"comp{i}_inv" for i in range(1, 9)]
rate_cols = [f"comp{i}_rate" for i in range(1, 9)]
rate_perc_cols = [f"comp{i}_rate_percent_diff" for i in range(1, 9)]

# Per-hotel mean of the competitor rate/inv columns (mean() skips NaNs).
temp = df.groupby("prop_id")[rate_cols + inv_cols].mean()
rate_res = competitor_cols(temp, rate_cols, "rate")
inv_res = competitor_cols(temp, inv_cols, "inv")
# axis=1 puts the rate and inv features side by side, one row per hotel.
# The default axis=0 would stack the two frames, producing two half-empty rows
# per hotel and doubling every row of df in the merge below.
temp = pd.concat([rate_res, inv_res], axis=1).reset_index()  # only new columns + key
df = df.merge(temp, on="prop_id")

In [None]:
# Remove the raw competitor columns (rate, inv and rate_percent_diff) from df.
competitor_columns = [*rate_cols, *inv_cols, *rate_perc_cols]
df.drop(columns=competitor_columns, inplace=True)

In [None]:
# 5. Relative price change: (current price - historical price) / historical price.
#    The historical price is recovered by exponentiating prop_log_historical_price.
#    The original author notes these columns contain no NaN (not verified here).
temp = pd.DataFrame()  # not used in this cell; only rebinds the scratch name
old_price = np.exp(df["prop_log_historical_price"])
df["price_change"] = df["price_usd"].sub(old_price).div(old_price)


In [None]:
# 6. Desirability score from the query affinity score and the two location scores.

# The score is the affinity score multiplied by the sum of both location scores.
location_score = df["prop_location_score1"] + df["prop_location_score2"]
desirability_score = df["srch_query_affinity_score"] * location_score
# Rows where the product is NaN receive the mean of the available scores.
desirability_score = desirability_score.fillna(desirability_score.mean())
df["desirability_score"] = desirability_score


In [None]:
# 7. Convert the timestamp into the elapsed fraction of its day (0 = midnight).
timestamps = pd.to_datetime(df["date_time"]).dt
seconds_since_midnight = timestamps.hour * 3600 + timestamps.minute * 60 + timestamps.second
df["percentile_of_day"] = seconds_since_midnight / 86400  # 86400 seconds per day
# The raw timestamp column is no longer kept once the fraction is stored.
df.drop(columns=["date_time"], inplace=True)


In [None]:
# Encode the categorical id columns as integer labels.

cat_cols = ["prop_id","srch_destination_id", "prop_country_id", "visitor_location_country_id", "site_id"]

# Fit a separate encoder per column and keep each one. Refitting a single shared
# encoder replaces its fitted classes on every iteration, so only the last
# column's mapping would remain available for reuse or inverse_transform.
encoders = {}
for col in cat_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    encoders[col] = encoder