# Expedia Hotel Recommendations

Which hotel type will an Expedia customer book? Currently, Expedia uses search parameters to adjust its hotel recommendations, but there isn't any customer-specific data to personalize them for each user. In the analysis below, we contextualize millions of rows of customer data and predict how likely a user is to stay at each of 100 different hotel groups.

In [1]:
# Expedia Hotel Recommendation
# Submission 1
# 5/24/2016

In [37]:
# pandas supplies the DataFrame type and the CSV reader used in this notebook.
# Load destinations.csv (looked up relative to the working directory) into a DataFrame.
import pandas as pd

destinations = pd.read_csv("destinations.csv")

In [38]:
destinations.head(5)

Unnamed: 0,srch_destination_id,d1,d2,d3,d4,d5,d6,d7,d8,d9,...,d140,d141,d142,d143,d144,d145,d146,d147,d148,d149
0,0,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-1.897627,-2.198657,-2.198657,-1.897627,...,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657
1,1,-2.18169,-2.18169,-2.18169,-2.082564,-2.18169,-2.165028,-2.18169,-2.18169,-2.031597,...,-2.165028,-2.18169,-2.165028,-2.18169,-2.18169,-2.165028,-2.18169,-2.18169,-2.18169,-2.18169
2,2,-2.18349,-2.224164,-2.224164,-2.189562,-2.105819,-2.075407,-2.224164,-2.118483,-2.140393,...,-2.224164,-2.224164,-2.196379,-2.224164,-2.192009,-2.224164,-2.224164,-2.224164,-2.224164,-2.057548
3,3,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.115485,-2.177409,-2.177409,-2.177409,...,-2.161081,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409
4,4,-2.189562,-2.187783,-2.194008,-2.171153,-2.152303,-2.056618,-2.194008,-2.194008,-2.145911,...,-2.187356,-2.194008,-2.191779,-2.194008,-2.194008,-2.185161,-2.194008,-2.194008,-2.194008,-2.188037


#### Loading the entire training dataset into memory

In [1]:
import pandas as pd
import numpy as np

# Columns to load from train.csv, each with an explicit dtype.
train_dtypes = {
    'is_booking': bool,
    'srch_destination_id': np.int32,
    'hotel_cluster': np.int32,
    'user_location_country': np.int32,
    'user_location_region': np.int32,
    'user_location_city': np.int32,
    'orig_destination_distance': np.float64,
    'hotel_market': np.int32,
}

# With chunksize set, read_csv returns an iterator that yields DataFrame
# chunks of up to 1,000,000 rows rather than a single DataFrame.
train = pd.read_csv(
    'train.csv',
    dtype=train_dtypes,
    usecols=list(train_dtypes),
    chunksize=1000000,
)

In [2]:
# NOTE(review): placeholder only -- the following cell rebinds df to the
# concatenated training chunks, so this empty frame is never used.
df=pd.DataFrame()

In [3]:
# The chunked reader is itself iterable, so concat can consume it directly
# and stitch every chunk into one DataFrame.
df = pd.concat(train)

In [4]:
df.shape

(37670293, 8)

#### Viewing the training dataset

In [5]:
df.head(5)

Unnamed: 0,user_location_country,user_location_region,user_location_city,orig_destination_distance,srch_destination_id,is_booking,hotel_market,hotel_cluster
0,66,348,48862,2234.2641,8250,False,628,1
1,66,348,48862,2234.2641,8250,True,628,1
2,66,348,48862,2234.2641,8250,False,628,1
3,66,442,35390,913.1932,14984,False,1457,80
4,66,442,35390,913.6259,14984,False,1457,21


In [6]:
# Rebind `train` from the chunk reader to the fully loaded DataFrame; the
# later cells call groupby / value_counts on this name.
train = df
# Displayed to check the dtypes of the loaded columns.
train.dtypes

user_location_country          int32
user_location_region           int32
user_location_city             int32
orig_destination_distance    float64
srch_destination_id            int32
is_booking                      bool
hotel_market                   int32
hotel_cluster                  int32
dtype: object

#### Loading the testing dataset into memory

In [7]:
# Load the whole test file in one call: no column subset, explicit dtypes or
# chunking are requested here, unlike the training load.
test = pd.read_csv("test.csv")

In [8]:
test.shape

(2528243, 22)

#### Viewing the testing dataset

In [9]:
test.head(5)

Unnamed: 0,id,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,...,srch_ci,srch_co,srch_adults_cnt,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,hotel_continent,hotel_country,hotel_market
0,0,2015-09-03 17:09:54,2,3,66,174,37449,5539.0567,1,1,...,2016-05-19,2016-05-23,2,0,1,12243,6,6,204,27
1,1,2015-09-24 17:38:35,2,3,66,174,37449,5873.2923,1,1,...,2016-05-12,2016-05-15,2,0,1,14474,7,6,204,1540
2,2,2015-06-07 15:53:02,2,3,66,142,17440,3975.9776,20,0,...,2015-07-26,2015-07-27,4,0,1,11353,1,2,50,699
3,3,2015-09-14 14:49:10,2,3,66,258,34156,1508.5975,28,0,...,2015-09-14,2015-09-16,2,0,1,8250,1,2,50,628
4,4,2015-07-17 09:32:04,2,3,66,467,36345,66.7913,50,0,...,2015-07-22,2015-07-23,2,0,1,11812,1,2,50,538


#### Algorithm for finding suitable clusters

In [26]:
def make_key(items):
    """Build a lookup key by joining the string form of each item with "_".

    Args:
        items: Iterable of values; each one is converted with ``str``.

    Returns:
        A single string, e.g. ``make_key([8250, 1])`` gives ``"8250_1"``.
    """
    return "_".join(map(str, items))

# Columns that identify a search context; clusters are scored per context.
match_cols = ["srch_destination_id"]
cluster_cols = match_cols + ['hotel_cluster']
groups = train.groupby(cluster_cols)

# top_clusters[context_key][hotel_cluster] -> weighted popularity score
top_clusters = {}
context_width = len(match_cols)
for group_key, rows in groups:
    # is_booking is boolean, so its sum is the number of booking rows and
    # every remaining row in the group is a non-booking click.
    bookings = int(rows.is_booking.sum())
    clicks = len(rows) - bookings

    # A booking counts as 1; each click contributes 0.086.
    score = bookings + .086 * clicks

    context_key = make_key(group_key[:context_width])
    top_clusters.setdefault(context_key, {})[group_key[-1]] = score
   

#### Finding top 5 clusters for each search destination

In [28]:
import operator

# cluster_dict[context_key] -> up to five hotel clusters, highest score first
cluster_dict = {}
by_score = operator.itemgetter(1)
for context_key, scores in top_clusters.items():
    ranked = sorted(scores.items(), key=by_score, reverse=True)
    cluster_dict[context_key] = [cluster for cluster, _ in ranked[:5]]

#### Making predictions based on top 5 clusters for each destination

In [14]:
# For every test row, look up the ranked clusters of its search context;
# rows whose context never appeared in training get an empty prediction.
preds = []
for _, record in test.iterrows():
    lookup = make_key([record[col] for col in match_cols])
    preds.append(cluster_dict.get(lookup, []))

#### Error Calculation

In [24]:
#Calculate error
#Implementation from ml_metrics package by Ben Hamner
#Error calculation is useful only while building the model and not for the entire test dataset

import numpy as np

def apk(actual, predicted, k=10):
    """Compute the average precision at k for one query.

    Args:
        actual: Collection of relevant items.
        predicted: Ordered list of predicted items, best guess first.
        k: Maximum number of predictions taken into account.

    Returns:
        The sum of precision values at each rank where a new relevant item
        appears, divided by ``min(len(actual), k)``; ``0.0`` when ``actual``
        is empty.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    precision_sum = 0.0
    hits = 0.0
    for rank, guess in enumerate(top_k, start=1):
        # Skip misses, and repeats of a guess made at an earlier rank.
        if guess not in actual or guess in top_k[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)


def mapk(actual, predicted, k=10):
    """Compute the mean average precision at k over many queries.

    Args:
        actual: Sequence whose elements are the relevant items of each query.
        predicted: Sequence of ordered prediction lists, paired with
            ``actual`` element by element.
        k: Maximum number of predictions considered per query.

    Returns:
        The mean of ``apk`` over all (actual, predicted) pairs.
    """
    per_query = []
    for truth, guesses in zip(actual, predicted):
        per_query.append(apk(truth, guesses, k))
    return np.mean(per_query)

# NOTE(review): the test.csv columns displayed earlier do not include
# hotel_cluster (part of that preview is elided, so confirm). If the column is
# absent this lookup raises KeyError; scoring needs labelled rows, e.g. a
# hold-out split of the training data.
mapk([[l] for l in test["hotel_cluster"]], preds, k=5)

#### Making an initial submission file

In [15]:
#Making a submission file

# Each prediction becomes a space-separated string of cluster ids.
write_p = [" ".join(str(cluster) for cluster in p) for p in preds]

# Pair ids with predictions by position. This avoids one label-based
# test["id"][i] lookup per row, which is slow on millions of rows and only
# lines up when the index is the default 0..n-1.
write_frame = ["id,hotel_cluster"]
write_frame.extend(
    "{0},{1}".format(row_id, clusters)
    for row_id, clusters in zip(test["id"], write_p)
)

# Write-only access is all that is needed; "w" still creates/truncates the file.
with open("predictions.csv", "w") as f:
    f.write("\n".join(write_frame))

In [16]:
predictions = pd.read_csv('predictions.csv')

In [17]:
predictions.shape

(2528243, 2)

#### Prediction file sample

In [19]:
predictions.head(5)

Unnamed: 0,id,hotel_cluster
0,0,5 37 55 11 22
1,1,5
2,2,0 31 96 91 77
3,3,1 45 79 24 54
4,4,91 42 2 48 59


#### Improving Predictions

In [30]:
# Improving predictions
# TODO (original author's note): "should be a new training dataset containing
# all columns" -- presumably the training data should be reloaded with every
# column; confirm the intent.
# Columns found in both the training and the testing data; a test row is
# matched against training rows that agree on all of them.
match_cols = [
    'user_location_country',
    'user_location_region',
    'user_location_city',
    'hotel_market',
    'orig_destination_distance',
]

groups = train.groupby(match_cols)
    
def generate_exact_matches(row, match_cols, grouped=None):
    """Return the hotel clusters of the training rows that match ``row`` exactly.

    Args:
        row: Record indexable by column name (e.g. one row of the test frame).
        match_cols: Column names whose values in ``row`` form the group key,
            in the same order as the columns the groupby was built on.
        grouped: Optional pandas GroupBy object to search. When omitted, the
            module-level ``groups`` object is used, as in the original code.

    Returns:
        A list of the distinct ``hotel_cluster`` values of the matching group
        (in set order, not ranked), or ``[]`` when no group has that key.
    """
    if grouped is None:
        grouped = groups
    index = tuple(row[t] for t in match_cols)
    try:
        group = grouped.get_group(index)
    except KeyError:
        # No training rows share this exact combination of values. Only the
        # missing-key case is swallowed so that other failures stay visible.
        return []
    return list(set(group.hotel_cluster))

# Exact-match clusters for every test row, in test-row order.
exact_matches = [
    generate_exact_matches(test.iloc[pos], match_cols)
    for pos in range(test.shape[0])
]

In [39]:
#Most common clusters based on value count

# value_counts orders clusters from most to least frequent; keep the first five.
cluster_counts = train.hotel_cluster.value_counts()
most_common_clusters = list(cluster_counts.head(5).index)

In [41]:
#Full predictions as a combination of all the above predictions

def f5(seq, idfun=None):
    """Return the items of ``seq`` without duplicates, keeping first-seen order.

    Args:
        seq: Iterable of items.
        idfun: Optional function mapping an item to the hashable marker that
            decides whether it was already seen; defaults to the item itself.

    Returns:
        A list holding the first item seen for each distinct marker.
    """
    key_of = idfun if idfun is not None else (lambda item: item)
    seen = set()
    unique = []
    for item in seq:
        marker = key_of(item)
        if marker not in seen:
            seen.add(marker)
            unique.append(item)
    return unique
    
# Priority order per test row: exact matches, then the destination's top
# clusters, then the overall most common clusters; dedupe and keep five.
full_preds = [
    f5(exact + by_destination + most_common_clusters)[:5]
    for exact, by_destination in zip(exact_matches, preds)
]

#### Making final submission file

In [43]:
#Making a submission file

# Each final prediction becomes a space-separated string of cluster ids.
write_p = [" ".join(str(cluster) for cluster in p) for p in full_preds]

# Pair ids with predictions by position. This avoids one label-based
# test["id"][i] lookup per row, which is slow on millions of rows and only
# lines up when the index is the default 0..n-1.
write_frame = ["id,hotel_cluster"]
write_frame.extend(
    "{0},{1}".format(row_id, clusters)
    for row_id, clusters in zip(test["id"], write_p)
)

# Write-only access is all that is needed; "w" still creates/truncates the file.
with open("predictions.csv", "w") as f:
    f.write("\n".join(write_frame))

In [44]:
predictions = pd.read_csv('predictions.csv')

In [45]:
predictions.shape

(2528243, 2)

#### Final predictions for the first 5 test ids

In [46]:
predictions.head(5)

Unnamed: 0,id,hotel_cluster
0,0,5 37 55 11 22
1,1,5 91 41 48 64
2,2,91 0 31 96 77
3,3,1 45 79 24 54
4,4,50 51 91 42 2


#### Conclusions

Hotel groups have been predicted above. For the full list of predicted hotel groups, check out the `predictions.csv` file generated by the above algorithm.