# ***TASK 2: DATA UNDERSTANDING***

**Load Data**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the complete training set; every later cell reads from (and many mutate) `df`.
df = pd.read_csv ('training_set_VU_DM.csv')
# Bare last expression -> rich (truncated) display of the frame.
df

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,,,,,0.0,0.0,,0,,0
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,,,,,0.0,0.0,,0,,0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,,,,,0.0,0.0,,0,,0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,,,,,-1.0,0.0,5.0,0,,0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,,,,,0.0,0.0,,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958342,332785,2013-06-30 19:55:18,5,219,,,219,77700,3,4.0,...,,,,,,,,0,,0
4958343,332785,2013-06-30 19:55:18,5,219,,,219,88083,3,4.0,...,,,,,,,,0,,0
4958344,332785,2013-06-30 19:55:18,5,219,,,219,94508,3,3.5,...,,,,,,,,0,,0
4958345,332785,2013-06-30 19:55:18,5,219,,,219,128360,3,5.0,...,,,,,,,,1,157.84,1


In [None]:
# Summary statistics of the columns of the training frame.
df.describe()

**Count and remove NaN values**

In [None]:
# Names of all columns in which more than 25 % of the rows are missing.
missing_share = df.isna().mean()
nan_index = missing_share[missing_share > 0.25].index.tolist()
print(nan_index)
# Keep those sparse columns aside (as float32) before they are dropped from `df`;
# later cells refer to them through `nan_columns`.
nan_columns = df[nan_index].astype('float32')

In [None]:
# Tolerated share of missing values per column, in percent.
perc = 1.0
# `thresh` is the minimum number of non-NaN entries a column needs in order to be kept.
min_count = int(((100 - perc) / 100) * len(df) + 1)
df = df.dropna(axis='columns', thresh=min_count)

In [None]:
# Show the columns that still contain any missing values, with their NaN share.
remaining_nan_share = df.isna().mean()
print(remaining_nan_share[remaining_nan_share > 1e-17])

In [None]:
# Share of missing values per column, computed in one vectorised pass.
# `d` (column -> NaN share) is kept because the plotting cell below reads it.
nan_share = df.isna().mean()
d = nan_share.to_dict()

# Columns that are more than half empty are removed; all of them are dropped in a
# single call instead of re-creating the whole frame once per dropped column.
too_sparse = nan_share[nan_share > 0.5].index
count = len(too_sparse)            # number of dropped columns
non_nan = len(nan_share) - count   # number of columns kept
df = df.drop(columns=too_sparse)
print(count, non_nan)

In [None]:
# Order the columns by their NaN share (ascending) and draw one bar per column.
d = dict(sorted(d.items(), key=lambda item: item[1]))
f, ax = plt.subplots(figsize=(18, 5))
positions = range(len(d))
ax.bar(positions, list(d.values()), align='center')
ax.set_xticks(positions)
ax.set_xticklabels(list(d.keys()), rotation=90)

plt.show()

**Correlations**

In [None]:
# Pairwise correlation of the numeric columns only: non-numeric columns such as
# the `date_time` strings cannot be correlated and make `DataFrame.corr()` raise
# on recent pandas versions.
correlation = df.select_dtypes(include='number').corr()
plt.figure(figsize=(18, 18))
sns.heatmap(correlation, vmax=1, square=True, annot=True, cmap='viridis')
plt.title('Correlation between different features')

**Property values**

In [None]:
# Identifier, flag and target columns that are not plotted in the box plot below.
remove_list = ['srch_id', 'date_time', 'site_id', 'visitor_location_country_id', 'prop_country_id', 'prop_id', 'prop_brand_bool', 'position', 'price_usd', 'promotion_flag', 'srch_destination_id', 'srch_saturday_night_bool', 'random_bool', 'click_bool', 'booking_bool']
# Search-level columns, removed as well.
prop_list = ['srch_length_of_stay', 'srch_booking_window', 'srch_adults_count', 'srch_children_count', 'srch_room_count', 'orig_destination_distance']
# `drop` returns a new frame, so `df` itself is left untouched.
df_prop = df.drop(columns=remove_list).drop(columns=prop_list)

In [None]:
# Box plot of the remaining columns of `df_prop`, one colour per box.
colors = ['orchid', 'blue', 'yellow', 'red', 'orange']
fig, ax = plt.subplots(figsize=(12, 9))
ax, props = df_prop.plot.box(patch_artist=True, return_type='both', ax=ax)
# zip() stops at the shorter sequence, so at most len(colors) boxes are recoloured.
for box, fill in zip(props['boxes'], colors):
    box.set_facecolor(fill)
plt.show()

**Length of stay**

In [None]:
# Horizontal box plot of the searched length of stay, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(18, 5))
boxplot = df.boxplot(column='srch_length_of_stay', patch_artist=True, vert=False, ax=ax)

**Booking window**

In [None]:
# Horizontal box plot of the booking window, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(18, 5))
boxplot = df.boxplot(column='srch_booking_window', patch_artist=True, vert=False, ax=ax)

**Site, visitor location, property location**

In [None]:
# One normalised 100-bin histogram per identifier column.
id_list = ['site_id', 'visitor_location_country_id', 'prop_country_id']
# The loop variable is deliberately not called `id`: at notebook top level that
# would shadow the builtin `id()` for every later cell.
for id_column in id_list:
    plt.hist(df[id_column], 100, density=True, facecolor='blue', alpha=0.75)
    plt.xlabel(id_column)
    plt.title('Histogram of ' + id_column)
    plt.show();

**Booking and non-booking price**

In [None]:
# Price distribution of booked vs. non-booked impressions; the y-axis is clipped
# to 0-200 so that the boxes stay readable.
sns.set(style="ticks", palette="deep")
ax = sns.boxplot(data=df, x="booking_bool", y="price_usd", hue="booking_bool")
ax.set_ylim(0, 200);

**Click and non-click price**

In [None]:
# Price distribution of clicked vs. non-clicked impressions; the y-axis is clipped
# to 0-200 so that the boxes stay readable.
sns.set(style="ticks", palette="pastel")
ax = sns.boxplot(data=df, x="click_bool", y="price_usd", hue="click_bool")
ax.set_ylim(0, 200);

**Destination distance**

In [None]:
# Bucket the origin-destination distance into 100-wide ranges plus an open-ended
# last bucket. `include_lowest=True` keeps a distance of exactly 0 in the first
# bucket, and `np.inf` as the last edge avoids relying on a hard-coded maximum
# (values above it would otherwise silently become NaN).
df['orig_destination_distance_range'] = pd.cut(
    df['orig_destination_distance'],
    bins=[0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, np.inf],
    labels=['0-100', '101-200', '201-300',
            '301-400', '401-500', '501-600',
            '601-700', '701-800', '801-900',
            '901-1000', '1001-1100', '1101-1200',
            '1201-1300', '1301-1400', '>1400'],
    include_lowest=True,
)

In [None]:
# Fraction of all rows per distance bucket (values are in 0-1 because of the
# division by len(df)). Series.value_counts replaces the deprecated pd.value_counts.
count_classes = df['orig_destination_distance_range'].value_counts(sort=True).sort_index() / len(df)
f, ax = plt.subplots(figsize=(16, 4))
count_classes.plot(kind='bar', color='aqua', ax=ax)
plt.title("Destination Distance Range")
plt.ylabel("Percentage")

**Price range**

In [None]:
# Bucket the displayed price into 50-wide ranges plus an open-ended last bucket.
# `include_lowest=True` keeps a price of exactly 0 in the first bucket, and
# `np.inf` as the last edge avoids relying on a hard-coded maximum price.
# The third label is '101-150' so that it is consistent with its neighbours.
df['price_usd_range'] = pd.cut(
    df['price_usd'],
    bins=[0, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, np.inf],
    labels=['0-50', '51-100', '101-150',
            '151-200', '201-250', '251-300',
            '301-350', '351-400', '401-450',
            '451-500', '501-550', '551-600',
            '601-650', '651-700', '701-750',
            '751-800', '801-850', '>850'],
    include_lowest=True,
)

In [None]:
# Fraction of all rows per price bucket (0-1 scale).
# Series.value_counts replaces the deprecated top-level pd.value_counts.
count_classes = df['price_usd_range'].value_counts(sort=True).sort_index() / len(df)
f, ax = plt.subplots(figsize=(16, 4))
count_classes.plot(kind='bar', color='deeppink', ax=ax)
plt.title("Price Range")
plt.ylabel("Percentage")

**Number of adults and children**

In [None]:
# Fraction of all rows per number of adults in the search (0-1 scale).
# Series.value_counts replaces the deprecated top-level pd.value_counts.
count_classes = df['srch_adults_count'].value_counts(sort=True).sort_index() / len(df)
f, ax = plt.subplots(figsize=(9, 4))
count_classes.plot(kind='bar', color='red', ax=ax)
plt.title("Adults Count")
plt.ylabel("Percentage")

In [None]:
# Fraction of all rows per number of children in the search (0-1 scale).
# Series.value_counts replaces the deprecated top-level pd.value_counts.
count_classes = df['srch_children_count'].value_counts(sort=True).sort_index() / len(df)
f, ax = plt.subplots(figsize=(9, 4))
count_classes.plot(kind='bar', color='green', ax=ax)
plt.title("Children Count")
plt.ylabel("Percentage")

In [None]:
# Combine both counts into a single "<adults>-<children>" label per row.
df['family'] = df['srch_adults_count'].astype(str) + "-" + df['srch_children_count'].astype(str)
f, ax = plt.subplots(figsize=(16, 4))
# Fraction of all rows per adult/children combination (0-1 scale).
# Series.value_counts replaces the deprecated top-level pd.value_counts.
count_classes = df['family'].value_counts(sort=True).sort_index() / len(df)
count_classes.plot(kind='bar', color='blue', ax=ax)
plt.title("Adult-Children")
plt.ylabel("Percentage")

**Position in query**

In [None]:
# Fraction of all rows per result-list position (0-1 scale).
# Series.value_counts replaces the deprecated top-level pd.value_counts.
count_classes = df['position'].value_counts(sort=True).sort_index() / len(df)
f, ax = plt.subplots(figsize=(16, 4))
count_classes.plot(kind='bar', color='teal', ax=ax)
plt.title("Position")
plt.ylabel("Percentage")

In [None]:
# Rows with random_bool == 0, split into booked and clicked impressions.
is_sorted_listing = df['random_bool'] == 0
df_sorted_booked = df[is_sorted_listing & (df['booking_bool'] == 1)]
df_sorted_clicked = df[is_sorted_listing & (df['click_bool'] == 1)]

# Overlay both position histograms; the clicked one is drawn semi-transparent on top.
for frame, color, label, alpha in (
    (df_sorted_booked, "red", "booking", 1),
    (df_sorted_clicked, "blue", "clicked", 0.3),
):
    sns.histplot(data=frame, x="position", color=color, label=label, discrete=True, alpha=alpha)

plt.legend()

In [None]:
# Rows with random_bool == 1, split into booked and clicked impressions.
is_random_listing = df['random_bool'] == 1
df_random_booked = df[is_random_listing & (df['booking_bool'] == 1)]
df_random_clicked = df[is_random_listing & (df['click_bool'] == 1)]

# Overlay both position histograms; the clicked one is drawn semi-transparent on top.
for frame, color, label, alpha in (
    (df_random_booked, "red", "booking", 1),
    (df_random_clicked, "blue", "clicked", 0.3),
):
    sns.histplot(data=frame, x="position", color=color, label=label, discrete=True, alpha=alpha)

plt.legend()

**Promotion**

In [None]:
# Fraction of all rows with / without a promotion flag (0-1 scale).
# Series.value_counts replaces the deprecated top-level pd.value_counts.
count_classes = df['promotion_flag'].value_counts(sort=True).sort_index() / len(df)
count_classes.plot(kind='bar', color='darkred')
plt.title("Promotion Flag or Not")
plt.ylabel("Percentage")

**Independent property or belonging to chain**

In [None]:
# Fraction of all rows per value of prop_brand_bool (0-1 scale).
# Series.value_counts replaces the deprecated top-level pd.value_counts.
count_classes = df['prop_brand_bool'].value_counts(sort=True).sort_index() / len(df)
count_classes.plot(kind='bar', color='navy')
plt.title("Independent hotel or Not")
plt.ylabel("Percentage")

**Saturday night**

In [None]:
# Fraction of all rows per value of srch_saturday_night_bool (0-1 scale).
# Series.value_counts replaces the deprecated top-level pd.value_counts;
# the title typo ("Suturday") is corrected as well.
count_classes = df['srch_saturday_night_bool'].value_counts(sort=True).sort_index() / len(df)
count_classes.plot(kind='bar', color='olive')
plt.title("Saturday night or Not")
plt.ylabel("Percentage")

**Random or sorted**

In [None]:
# Fraction of all rows per value of random_bool (0-1 scale).
# Series.value_counts replaces the deprecated top-level pd.value_counts.
count_classes = df['random_bool'].value_counts(sort=True).sort_index() / len(df)
count_classes.plot(kind='bar', color='lavender')
plt.title("Sorted or Not")
plt.ylabel("Percentage")

**Searches over time periods**

In [None]:
# Parse the timestamp once and derive every calendar feature from it, instead of
# splitting the string column twice and parsing date and time separately.
# Unparseable timestamps become NaT (errors='coerce') and yield NaN features.
timestamps = pd.to_datetime(df['date_time'], errors='coerce')

df['date'] = timestamps.dt.normalize()   # calendar day at midnight
df['hour'] = timestamps.dt.hour
df['year'] = timestamps.dt.year
df['month'] = timestamps.dt.month
df['weekday'] = timestamps.dt.weekday
df['day'] = timestamps.dt.day

# "<month>-<year>" label used for the per-month search plot.
df['month-year'] = df['month'].astype(str) + "-" + df['year'].astype(str)

In [None]:
# Fraction of all rows per weekday (0-1 scale).
# Series.value_counts replaces the deprecated top-level pd.value_counts.
count_classes = df['weekday'].value_counts(sort=True).sort_index() / len(df)
count_classes.plot(kind='bar', color='blue')
plt.title("Searches per weekday")
plt.ylabel("Percentage")

In [None]:
# Fraction of all rows per hour of day (0-1 scale).
# Series.value_counts replaces the deprecated top-level pd.value_counts.
count_classes = df['hour'].value_counts(sort=True).sort_index() / len(df)
count_classes.plot(kind='bar', color='orange')
plt.title("Searches per hour")
plt.ylabel("Percentage")

In [None]:
# Fraction of all rows per calendar month (0-1 scale).
# Series.value_counts replaces the deprecated top-level pd.value_counts.
count_classes = df['month'].value_counts(sort=True).sort_index() / len(df)
count_classes.plot(kind='bar', color='green')
plt.title("Searches per month")
plt.ylabel("Percentage")

In [None]:
# Fraction of all rows per "<month>-<year>" label (0-1 scale). The labels are
# strings, so sort_index() orders them lexicographically.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
count_classes = df['month-year'].value_counts(sort=True).sort_index() / len(df)
count_classes.plot(kind='bar', color='lightcoral')
plt.title("Searches per month")
plt.ylabel("Percentage")

# ***TASK 3: DATA PREPARATION***

## Imputation:

**Random sample**

In [None]:
import random

def RandomImputation(df2, column_name):
    """Return a copy of ``df2`` in which the NaNs of one column are filled randomly.

    Every missing entry of ``column_name`` is replaced by a value drawn with
    replacement (``random.choices``) from the observed values of that column.
    The input frame is left untouched.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Frame to impute.
    column_name : hashable
        Label of the column whose missing values are filled.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df2`` with ``column_name`` imputed.

    Raises
    ------
    IndexError
        If the column has missing values but no observed value to draw from.
    """
    df = df2.copy()
    mask = df[column_name].isnull()
    n_missing = int(mask.sum())
    if n_missing == 0:
        return df
    observed = df.loc[~mask, column_name].tolist()
    # Assign through .loc on the frame itself: writing into a Series pulled out of
    # the frame is a chained assignment and is not guaranteed to update `df`.
    df.loc[mask, column_name] = random.choices(observed, k=n_missing)
    return df

**Mean**

In [None]:
def MeanImputation(df2, column_name):
  """Return a copy of ``df2`` with the NaNs of ``column_name`` set to the column mean.

  Parameters
  ----------
  df2 : pandas.DataFrame
      Frame to impute; it is not modified.
  column_name : hashable
      Label of the column whose missing values are filled.

  Returns
  -------
  pandas.DataFrame
      Copy of ``df2`` with ``column_name`` imputed.
  """
  df = df2.copy()
  # Assign the filled column back instead of calling fillna(inplace=True) on the
  # selected Series: that chained in-place call may act on a temporary object.
  df[column_name] = df[column_name].fillna(df[column_name].mean())
  return df

**Median**

In [None]:
def MedianImputation(df2, column_name):
  """Return a copy of ``df2`` with the NaNs of ``column_name`` set to the column median.

  Parameters
  ----------
  df2 : pandas.DataFrame
      Frame to impute; it is not modified.
  column_name : hashable
      Label of the column whose missing values are filled.

  Returns
  -------
  pandas.DataFrame
      Copy of ``df2`` with ``column_name`` imputed.
  """
  df = df2.copy()
  # Assign the filled column back instead of calling fillna(inplace=True) on the
  # selected Series: that chained in-place call may act on a temporary object.
  df[column_name] = df[column_name].fillna(df[column_name].median())
  return df

**KNN**

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale all columns of `df_test` and rebuild a DataFrame with the same
# column names.
# NOTE(review): in top-to-bottom order `df_test` is first assigned in the
# "Scaling" cell further down, so this cell presumably only works after that
# cell has been run - confirm the intended order.
scaler = MinMaxScaler()
df_test = pd.DataFrame(scaler.fit_transform(df_test), columns = df_test.columns)
df_test.head()

In [None]:
from sklearn.impute import KNNImputer

def KNNImputation(df2, column_name):
  """Impute all missing values of ``df2`` with a 5-nearest-neighbour imputer.

  The whole frame is passed to ``KNNImputer``; ``column_name`` is accepted for
  signature parity with the other imputation helpers but is not used. Non-numeric
  columns (e.g. ``date_time``) are not removed here.

  Returns whatever ``KNNImputer.fit_transform`` returns for a copy of ``df2``
  (not re-wrapped into a DataFrame); ``df2`` itself is not modified.
  """
  working_copy = df2.copy()
  return KNNImputer(n_neighbors=5).fit_transform(working_copy)

**MICE**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import MinMaxScaler

# MICE-style imputation: an IterativeImputer with a LinearRegression estimator
# (at most 5 rounds, median initialisation) plus a min-max scaler.
imputer = IterativeImputer(estimator=LinearRegression(), random_state=42, n_nearest_features=None, max_iter=5, verbose=2, 
                           initial_strategy="median", tol=1e-8)
scaler = MinMaxScaler()

test_df = pd.read_csv('test_set_VU_DM.csv')

# Keep only test columns that have at least (100 - perc) % non-missing entries.
perc = 25.0
min_count =  int(((100-perc)/100)*test_df.shape[0] + 1)
test_samples = test_df.dropna(axis=1, thresh=min_count)

# Remove identifier/target columns from the training frame; the five id-like
# columns are set aside in `train_cat` and re-attached unscaled at the end.
train_samples = df.drop(["srch_id", "date_time", "position", "click_bool", "booking_bool"],
                        axis=1)
train_cat = train_samples[["site_id", "visitor_location_country_id", "prop_country_id", "prop_id", "srch_destination_id"]]
train_samples = train_samples.drop(["site_id", "visitor_location_country_id", "prop_country_id", "prop_id", "srch_destination_id"],
                        axis=1)
test_samples = test_samples.drop(["srch_id", "date_time", "site_id", "visitor_location_country_id", "prop_country_id", "prop_id", "srch_destination_id"], axis=1)

# Sanity check: the two counts should match for the concat/transform below.
print(len(train_samples.columns))
print(len(test_samples.columns))

# NOTE(review): the float32 cast fails if `df` still carries non-numeric helper
# columns added by earlier cells - confirm the state of `df` before running.
train_samples = train_samples.astype('float32')
test_samples = test_samples.astype('float32')

# Scaler and imputer are both fitted on train and test rows stacked together.
combined_samples = pd.concat([train_samples, test_samples], sort=False, axis=0)
combined_samples = pd.DataFrame(scaler.fit_transform(combined_samples), columns = combined_samples.columns)

imputer = imputer.fit(combined_samples)
# Apply the fitted scaler and imputer to the training rows only.
# NOTE(review): presumably requires train_samples to have exactly the columns
# of combined_samples, in the same order - verify.
train_samples = pd.DataFrame(scaler.transform(train_samples), columns = train_samples.columns)
train_samples = pd.DataFrame(imputer.transform(train_samples), columns = train_samples.columns)
# Re-attach the id columns and the sparse `nan_columns` saved in the NaN-counting
# cell; the concat along axis=1 aligns on the row index.
train_samples = pd.concat([train_cat, train_samples, nan_columns], sort=False, axis=1)

In [None]:
# Display the imputed training frame built in the previous cell.
train_samples

**Test**

In [None]:
missing_value_columns = ['prop_location_score2', 'orig_destination_distance']

# Impute the columns one after another on the same result frame. Restarting from
# `df` in every iteration would keep only the last column's imputation.
# MeanImputation works on a copy, so `df` itself stays unchanged.
df_new = df
for column in missing_value_columns:
    df_new = MeanImputation(df_new, column)

## Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every feature column of `df_new`; identifiers, timestamp and
# targets are left unscaled and re-attached afterwards.
scaler = MinMaxScaler()
unscaled_columns = ["srch_id", "date_time", "position", "click_bool", "booking_bool"]
df_test = df_new.drop(unscaled_columns, axis=1)
# Keep the original row index on the scaled frame: pd.concat(axis=1) aligns on
# the index, so a fresh 0..n-1 index could pair up the wrong rows whenever
# df_new's index is not a plain RangeIndex.
df_test = pd.DataFrame(scaler.fit_transform(df_test), columns=df_test.columns, index=df_test.index)
df_new = pd.concat([df_new[unscaled_columns], df_test], axis=1)

## Balancing

In [None]:
# Undersample the non-clicked impressions until both classes have the same size.
# A seeded generator makes the resampled frame reproducible across runs.
rng = np.random.default_rng(42)
not_click_indices = df.index[df.click_bool == 0]
n_clicked = int((df.click_bool == 1).sum())
n_to_drop = max(len(not_click_indices) - n_clicked, 0)
not_click_indices = rng.choice(not_click_indices, n_to_drop, replace=False)
# The sampled values are index *labels*, so drop them by label; using them as
# positions (df.index[...]) is only correct for a default RangeIndex.
df_new = df.drop(index=not_click_indices)

print("Percentage of not click impressions: ", len(df_new[df_new.click_bool == 0])/len(df_new))
print("Percentage of click impression: ", len(df_new[df_new.click_bool == 1])/len(df_new))
print("Total number of records in resampled data: ", len(df_new))

## Feature engineering

In [None]:
# Month, weekday, day of month and hour of the search timestamp.
# The timestamp is parsed once instead of splitting the string column twice and
# parsing date and time separately; unparseable values become NaT
# (errors='coerce') and yield NaN features.
df_timestamp = pd.to_datetime(df['date_time'], errors='coerce')

df['month'] = df_timestamp.dt.month
df['weekday'] = df_timestamp.dt.weekday
df['day'] = df_timestamp.dt.day
df['hour'] = df_timestamp.dt.hour

In [None]:
# Count previous bookings per property id, and count previous clicks per property id
count_booking = {}
count_clicks = {}
count_prop = {}
for index, row in df.iterrows():
    if row['prop_id'] not in count_prop:
        count_prop[row["prop_id"]] = 1
    elif row['prop_id'] in count_prop:
        count_prop[row["prop_id"]] += 1
    if row["prop_id"] not in count_booking:
        count_booking[row["prop_id"]] = row['booking_bool']
    elif row["prop_id"] in count_booking:
        count_booking[row["prop_id"]] += row['booking_bool']
    if row["prop_id"] not in count_clicks:
        count_clicks[row["prop_id"]] = row['click_bool']
    elif row["prop_id"] in count_clicks:
        count_clicks[row["prop_id"]] += row['click_bool']
        
df['count_booked'] = None
df['count_clicked'] = None
df['count_booked_per_prop'] = None
df['count_clicked_per_prop'] = None 
for index, row in df.iterrows():
    df.at[index, 'count_booked'] = count_booking[row['prop_id']]
    df.at[index, 'count_clicked'] = count_clicks[row['prop_id']]
    df.at[index, 'count_booked_per_prop'] = count_booking[row['prop_id']] / count_prop[row['prop_id']]
    df.at[index, 'count_clicked_per_prop'] = count_clicks[row['prop_id']] / count_prop[row['prop_id']]

In [None]:
# Difference between historical star rating and current search 
# Absolute gap between the visitor's historical star rating and this property's rating.
df['star_diff'] = df['visitor_hist_starrating'].sub(df['prop_starrating']).abs()

# Absolute gap between the visitor's historical price per night and the listed price.
df['price_diff'] = df['visitor_hist_adr_usd'].sub(df['price_usd']).abs()

In [None]:
# Total fee is price usd * room count
# Listed price multiplied by the number of rooms searched for.
df['total_fee'] = df['price_usd'].mul(df['srch_room_count'])

In [None]:
# Price per adult is (price usd * room count)/ adult count
# Price for all rooms divided by the number of adults (children are not counted).
rooms_total = df['price_usd'] * df['srch_room_count']
df['price_per_person'] = rooms_total / df['srch_adults_count']

**Normalized features**

In [3]:
from sklearn.preprocessing import MinMaxScaler

def scale(values):
    """Fit a fresh ``MinMaxScaler`` on ``values`` and return the scaled result."""
    return MinMaxScaler().fit_transform(values)

def add_normalized_features(df):
    """Add group-wise min-max normalised copies of the numerical columns.

    For every indicator column ``g`` and numerical column ``v`` a new column
    ``"<g>_<v>"`` is added holding ``(v - min) / (max - min)``, where min and max
    are taken over the rows sharing the same value of ``g``. Groups in which all
    values are equal (or missing) yield NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date_time``, all indicator columns and all numerical
        columns listed below; every column except ``date_time`` must be
        castable to float32.

    Returns
    -------
    pandas.DataFrame
        New float32 frame without ``date_time`` and with the additional
        normalised columns appended; the input frame is not modified.

    Raises
    ------
    KeyError
        If ``date_time``, an indicator or a numerical column is missing.
    """
    indicators = ["srch_id", "site_id", "visitor_location_country_id", "prop_country_id", "prop_starrating", "srch_destination_id", "month", "weekday"]
    numerical_variables = ["visitor_hist_starrating", "visitor_hist_adr_usd", "prop_review_score", "prop_location_score1", "prop_location_score2",
                           "prop_log_historical_price", "price_usd", "srch_length_of_stay", "srch_booking_window", "srch_query_affinity_score",
                           "orig_destination_distance"]

    df = df.drop(["date_time"], axis=1)
    df = df.astype('float32')
    values = df[numerical_variables]
    for indicator in indicators:
        print(indicator)
        # Built-in group min/max for all numerical columns at once; this avoids
        # calling a Python lambda once per group and per column.
        grouped = df.groupby(indicator)[numerical_variables]
        group_min = grouped.transform('min')
        group_max = grouped.transform('max')
        normalized = (values - group_min) / (group_max - group_min)
        for variable in numerical_variables:
            df[indicator + "_" + variable] = normalized[variable]

    return df

srch_id
site_id
visitor_location_country_id
prop_country_id
prop_starrating
srch_destination_id


In [None]:
# Replace `df` by its float32 version with the group-normalised columns added.
# The function drops `date_time`, so this cell cannot be re-run on its own result.
df = add_normalized_features(df)

# ***TASK 4: MODELING***

**Define model**

In [27]:
import lightgbm

# LambdaRank configuration for the LightGBM ranker, evaluated with NDCG.
# `label_gain` has one entry per relevance label 0..5 and gives every non-zero
# label the same gain of 1.
# Other settings that were tried and are currently switched off:
# colsample_bytree=0.9, reg_alpha=2, reg_lambda=5, bagging_fraction=0.5, bagging_freq=10.
lgbm_params = dict(
    objective="lambdarank",
    metric="ndcg",
    num_leaves=50,
    boosting="gbdt",
    num_iterations=830,
    learning_rate=0.1,
    max_bin=255,
    min_data_in_leaf=0,
    min_sum_hessian_in_leaf=300,
    max_depth=10,
    label_gain=[0,1,1,1,1,1],
    lambdarank_norm=True,
    lambdarank_truncation_level=5,
    feature_fraction=0.2,
)
model = lightgbm.LGBMRanker(**lgbm_params)

In [None]:
import xgboost as xgb
# Class ratio of non-clicked to clicked impressions.
neg_amount = len(df[df.click_bool == 0])
pos_amount = len(df) - neg_amount
# Fall back to a neutral weight if there are no clicked rows at all.
scale_pos_weight = neg_amount / pos_amount if pos_amount else 1.0

# XGBoost ranker optimising NDCG. The class ratio computed above is passed on
# instead of a hard-coded copy of it.
# Note: this rebinds `model`, replacing a previously defined ranker.
params = {'objective': 'rank:ndcg', 'n_estimators': 1000, 'max_depth': 50, 'eval_metric': 'ndcg@5',
          'enable_categorical': True, 'max_leaves': 100, 'tree_method': 'hist', 'learning_rate': 0.1,
          'booster': 'gbtree', 'random_state': 42, 'max_bin': 200, 'min_child_weight': 100, 'subsample': 1.0,
          'grow_policy': 'lossguide', 'lambda': 0.0, 'scale_pos_weight': scale_pos_weight}
model = xgb.sklearn.XGBRanker(**params)

## Validation:

**Get train/val split**

In [5]:
from sklearn.model_selection import GroupShuffleSplit

# Mark the identifier-like columns as categoricals.
categorical_ids = ['site_id', 'visitor_location_country_id', 'prop_country_id', 'prop_id', 'srch_destination_id']
df = df.astype({name: 'category' for name in categorical_ids})

# Hold out 10 % of the searches for validation; grouping on srch_id keeps all
# rows of one search on the same side of the split.
splitter = GroupShuffleSplit(test_size=.10, n_splits=1, random_state=42)
train_inds, val_inds = next(splitter.split(df, groups=df['srch_id']))

train_df = df.iloc[train_inds].reset_index(drop=True)
validation_df = df.iloc[val_inds].reset_index(drop=True)

# Group sizes (rows per search) for the ranker, and relevance labels:
# booking_bool * 4 + click_bool.
qids_train = train_df.groupby("srch_id")["srch_id"].count().to_numpy()
y_train = train_df["booking_bool"] * 4 + train_df["click_bool"]

qids_validation = validation_df.groupby("srch_id")["srch_id"].count().to_numpy()
y_validation = validation_df["booking_bool"] * 4 + validation_df["click_bool"]

# Columns that are identifiers, targets or only known after the search.
non_feature_columns = ["srch_id", "position", "click_bool", "booking_bool", "gross_bookings_usd"]
X_train = train_df.drop(non_feature_columns, axis=1)
X_validation = validation_df.drop(non_feature_columns, axis=1)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import GroupShuffleSplit
from sklearn.ensemble import ExtraTreesRegressor

# Iterative (MICE-style) imputer. The estimator must be an *instance*:
# passing the LinearRegression class itself cannot be cloned and fitted.
imputer = IterativeImputer(estimator=LinearRegression(), random_state=42, n_nearest_features=None, max_iter=5,
                           verbose=2, initial_strategy="median", tol=1e-5)

# Hold out 10 % of the searches; grouping on srch_id keeps a search together.
splitter = GroupShuffleSplit(test_size=.10, n_splits=1, random_state=42)
split = splitter.split(df, groups=df['srch_id'])
train_inds, val_inds = next(split)

train_df = df.iloc[train_inds]
validation_df = df.iloc[val_inds]

# Matching slices of the sparse columns saved earlier in `nan_columns`.
train_nan = nan_columns.iloc[train_inds]
validation_nan = nan_columns.iloc[val_inds]

train_df = train_df.reset_index(drop=True)
validation_df = validation_df.reset_index(drop=True)

train_nan = train_nan.reset_index(drop=True)
validation_nan = validation_nan.reset_index(drop=True)

# Group sizes (rows per search) and relevance labels: booking_bool * 4 + click_bool.
qids_train = train_df.groupby("srch_id")["srch_id"].count().to_numpy()
y_train = train_df["booking_bool"] * 4 + train_df["click_bool"]

qids_validation = validation_df.groupby("srch_id")["srch_id"].count().to_numpy()
y_validation = validation_df["booking_bool"] * 4 + validation_df["click_bool"]

# The imputer is fitted on the training rows only and then applied to validation.
train_df = train_df.drop(["srch_id", "date_time", "position", "click_bool", "booking_bool"],
                        axis=1)
train_df = train_df.astype('float32')
X_train = pd.DataFrame(imputer.fit_transform(train_df), columns = train_df.columns)
# Optional: re-attach the sparse columns.
#X_train = pd.concat([X_train, train_nan], sort=False, axis=1)

validation_df = validation_df.drop(["srch_id", "date_time", "position", "click_bool", "booking_bool"],
                        axis=1)
validation_df = validation_df.astype('float32')
X_validation = pd.DataFrame(imputer.transform(validation_df), columns = validation_df.columns)
# Optional: re-attach the sparse columns.
#X_validation = pd.concat([X_validation, validation_nan], sort=False, axis=1)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import GroupShuffleSplit

# Variant of the split in which the imputer is fitted on train and test rows
# stacked together and applied to the training samples *before* splitting.
test_df = pd.read_csv('test_set_VU_DM.csv')

# Keep only test columns that have at least (100 - perc) % non-missing entries.
perc = 50.0
min_count =  int(((100-perc)/100)*test_df.shape[0] + 1)
test_df = test_df.dropna(axis=1, thresh=min_count)

imputer = IterativeImputer(estimator=LinearRegression(), random_state=42, n_nearest_features=None, max_iter=5, verbose=2, 
                           initial_strategy="median", tol=1e-5)
samples = df.drop(["srch_id", "date_time", "position", "click_bool", "booking_bool"],
                        axis=1)
test_samples = test_df.drop(["srch_id", "date_time"], axis=1)
# NOTE(review): concat along axis=0 takes the union of the columns; if `samples`
# and `test_samples` do not share exactly the same columns, the transform of
# `samples` below presumably fails on a feature mismatch - verify.
combined_samples = pd.concat([samples, test_samples], sort=False, axis=0)
combined_samples = combined_samples.astype('float32')
imputer = imputer.fit(combined_samples)
samples = pd.DataFrame(imputer.transform(samples), columns = samples.columns)

# Hold out 10 % of the searches; grouping on srch_id keeps a search together.
splitter = GroupShuffleSplit(test_size=.10, n_splits=1, random_state=42)
split = splitter.split(df, groups=df['srch_id'])
train_inds, val_inds = next(split)

train_df = df.iloc[train_inds]
validation_df = df.iloc[val_inds]

# The positional indices from the split of `df` are reused on the imputed
# `samples`, which has one row per row of `df`.
X_train = samples.iloc[train_inds]
X_validation = samples.iloc[val_inds]

train_df = train_df.reset_index(drop=True)
validation_df = validation_df.reset_index(drop=True)

X_train = X_train.reset_index(drop=True)
X_validation = X_validation.reset_index(drop=True)

# Group sizes (rows per search) and relevance labels: booking_bool * 4 + click_bool.
qids_train = train_df.groupby("srch_id")["srch_id"].count().to_numpy()
y_train = train_df["booking_bool"] * 4 + train_df["click_bool"]

qids_validation = validation_df.groupby("srch_id")["srch_id"].count().to_numpy()
y_validation = validation_df["booking_bool"] * 4 + validation_df["click_bool"]

**Fit model**

In [28]:
# Fit the ranker on the training split and report NDCG@5 on the validation
# split every 10 iterations.
categorical_columns = ["site_id", "visitor_location_country_id", "prop_country_id", "prop_id", "srch_destination_id"]
model.fit(
    X=X_train,
    y=y_train,
    group=qids_train,
    eval_set=[(X_validation, y_validation)],
    eval_group=[qids_validation],
    eval_at=5,
    categorical_feature=categorical_columns,
    verbose=10,
)

[10]	valid_0's ndcg@5: 0.36022
[20]	valid_0's ndcg@5: 0.375879
[30]	valid_0's ndcg@5: 0.385514
[40]	valid_0's ndcg@5: 0.389175
[50]	valid_0's ndcg@5: 0.392625
[60]	valid_0's ndcg@5: 0.395546
[70]	valid_0's ndcg@5: 0.397566
[80]	valid_0's ndcg@5: 0.39999
[90]	valid_0's ndcg@5: 0.401058
[100]	valid_0's ndcg@5: 0.402836
[110]	valid_0's ndcg@5: 0.404034
[120]	valid_0's ndcg@5: 0.404378
[130]	valid_0's ndcg@5: 0.404785
[140]	valid_0's ndcg@5: 0.40516
[150]	valid_0's ndcg@5: 0.405247
[160]	valid_0's ndcg@5: 0.406551
[170]	valid_0's ndcg@5: 0.405858
[180]	valid_0's ndcg@5: 0.406357
[190]	valid_0's ndcg@5: 0.406795
[200]	valid_0's ndcg@5: 0.407326
[210]	valid_0's ndcg@5: 0.407846
[220]	valid_0's ndcg@5: 0.408053
[230]	valid_0's ndcg@5: 0.408561
[240]	valid_0's ndcg@5: 0.409066
[250]	valid_0's ndcg@5: 0.40887
[260]	valid_0's ndcg@5: 0.408641
[270]	valid_0's ndcg@5: 0.408789
[280]	valid_0's ndcg@5: 0.408757
[290]	valid_0's ndcg@5: 0.40916
[300]	valid_0's ndcg@5: 0.409469
[310]	valid_0's ndcg@5: 

LGBMRanker(boosting='gbdt', feature_fraction=0.2, label_gain=[0, 1, 1, 1, 1, 1],
           lambdarank_norm=True, lambdarank_truncation_level=5, max_bin=255,
           max_depth=10, metric='ndcg', min_data_in_leaf=0,
           min_sum_hessian_in_leaf=300, num_iterations=930, num_leaves=50,
           objective='lambdarank')

### **xgboost**

In [None]:
# Fit the XGBoost ranker on the training split, evaluating on the validation split.
model.fit(
    X_train,
    y_train,
    group=qids_train,
    eval_set=[(X_validation, y_validation)],
    eval_group=[qids_validation],
    verbose=True,
)

**SVR**

In [None]:
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer

# SVR cannot handle missing values, so impute with the column mean first.
# The imputer is fitted on the training features only and then applied to the
# validation features as well; otherwise the later predict() call on
# X_validation still receives NaNs.
imp = SimpleImputer(strategy='mean')
X_train = imp.fit_transform(X_train)
X_validation = imp.transform(X_validation)
model = SVR(kernel='rbf', C=10, gamma=0.1, verbose=True)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import ndcg_score

predictions = model.predict(X=X_validation)

# Mean NDCG@5 over all validation searches. `qids_validation` holds the number
# of rows per search, so the start offset of each search is the cumulative sum
# of the preceding group sizes. It is computed once instead of re-summing all
# previous sizes in every iteration.
query_starts = np.concatenate(([0], np.cumsum(qids_validation)[:-1])).astype(int)
labels = np.asarray(y_validation)
ndcg = np.zeros(len(qids_validation))
for i, (start, q_id) in enumerate(zip(query_starts, qids_validation)):
    stop = start + q_id
    ndcg[i] = ndcg_score([labels[start:stop]], [predictions[start:stop]], k=5)

print(np.mean(ndcg))

## Testing:

**Get all train & test data**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Build the final training matrix from all of `df` and the test matrix from the
# test CSV.
test_df = pd.read_csv('test_set_VU_DM.csv')
# NOTE(review): add_normalized_features groups by "month" and "weekday" and
# casts everything to float32. A freshly loaded test frame presumably lacks
# those two columns (KeyError), and srch_id/prop_id become floats - confirm.
test_df = add_normalized_features(test_df)

# Group sizes (rows per search) and relevance labels: booking_bool * 4 + click_bool.
qids_train = df.groupby("srch_id")["srch_id"].count().to_numpy()
X_train = df.drop(["srch_id", "position", "click_bool", "booking_bool", "gross_bookings_usd"],
                        axis=1)
y_train = df["booking_bool"] * 4 + df["click_bool"]

qids_test = test_df.groupby("srch_id")["srch_id"].count().to_numpy()
# NOTE(review): X_test must end up with the same feature columns (and dtypes)
# as X_train for the fitted model to accept it - verify.
X_test = test_df.drop(["srch_id"], axis=1)

**Fit model**

In [None]:
# Refit the ranker on the complete training data (no validation set).
categorical_columns = ["site_id", "visitor_location_country_id", "prop_country_id", "prop_id", "srch_destination_id"]
model.fit(
    X=X_train,
    y=y_train,
    group=qids_train,
    categorical_feature=categorical_columns,
)

**Make predictions and write to csv**

In [None]:
import csv
from operator import itemgetter

predictions = model.predict(X=X_test, num_iteration=0)
# Negate the scores so that an ascending sort puts the best property first.
predictions = list(-1.0 * predictions)

# Order by search id, then by descending predicted relevance within a search.
# The ids are cast back to int: after the float32 cast in
# add_normalized_features they would otherwise be written as e.g. "1.0".
recommendations = zip(test_df["srch_id"], test_df["prop_id"], predictions)
rows = [(int(srch_id), int(prop_id))
        for srch_id, prop_id, rank_float
        in sorted(recommendations, key=itemgetter(0, 2))]

# The context manager guarantees the file is flushed and closed; newline=""
# leaves line endings entirely to the csv writer.
with open("submission.csv", "w", newline="") as submission_file:
    writer = csv.writer(submission_file, lineterminator="\n")
    writer.writerow(("srch_id", "prop_id"))
    writer.writerows(rows)