In [141]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt

In [78]:
train_data = pd.read_csv("./dataset/train.csv", sep = ",", nrows = 100000)

In [79]:
train_data.head()

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
0,2014-08-11 07:46:59,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,0,3,2,50,628,1
1,2014-08-11 08:22:12,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,1,1,2,50,628,1
2,2014-08-11 08:24:33,2,3,66,348,48862,2234.2641,12,0,0,...,0,1,8250,1,0,1,2,50,628,1
3,2014-08-09 18:05:16,2,3,66,442,35390,913.1932,93,0,0,...,0,1,14984,1,0,1,2,50,1457,80
4,2014-08-09 18:08:18,2,3,66,442,35390,913.6259,93,0,0,...,0,1,14984,1,0,1,2,50,1457,21


In [80]:
train_data.shape

(100000, 24)

In [81]:
new_df = train_data[['user_id', 'hotel_cluster', 'is_booking']]

In [82]:
new_df.head()

Unnamed: 0,user_id,hotel_cluster,is_booking
0,12,1,0
1,12,1,1
2,12,1,0
3,93,80,0
4,93,21,0


In [83]:
new_df.columns = ['user_id', 'hotel_id', 'booking']
new_df = new_df.sort_values(['user_id', 'hotel_id'])
new_df = new_df.reset_index().drop('index', axis = 1)
new_df.head()

Unnamed: 0,user_id,hotel_id,booking
0,12,1,0
1,12,1,1
2,12,1,0
3,93,21,0
4,93,80,0


In [84]:
booked_hotels = new_df.groupby(['user_id', 'hotel_id']).booking.transform(max)
new_df = new_df.loc[new_df.booking == booked_hotels]
new_df.drop_duplicates(keep = 'first', inplace = True)
new_df= new_df.reset_index().drop('index', axis = 1)
new_df.head()

Unnamed: 0,user_id,hotel_id,booking
0,12,1,1
1,93,21,0
2,93,80,0
3,93,92,0
4,501,10,0


## Calculated each of the hotel cluster's mean

In [85]:
bookings = pd.DataFrame(new_df.groupby(['hotel_id'])['booking'].mean())
bookings

Unnamed: 0_level_0,booking
hotel_id,Unnamed: 1_level_1
0,0.117794
1,0.252396
2,0.209877
3,0.113043
4,0.184035
...,...
95,0.229310
96,0.076037
97,0.149160
98,0.201117


In [118]:
bookings['number_of_bookings'] = pd.DataFrame(new_df.groupby('hotel_id')['booking'].count())
bookings.head()

Unnamed: 0_level_0,booking,number_of_bookings
hotel_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.117794,399
1,0.252396,313
2,0.209877,486
3,0.113043,345
4,0.184035,451


In [86]:
bookings_matrix = new_df.pivot_table(index = 'user_id', columns = 'hotel_id', values = 'booking')
bookings_matrix.head()

hotel_id,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12,,1.0,,,,,,,,,...,,,,,,,,,,
93,,,,,,,,,,,...,,,0.0,,,,,,,
501,,,,,,,,,,,...,,,,,,,,,0.0,
756,,,1.0,,,,,,,,...,,,,,0.0,,,,,
776,,,,,,,,,,,...,,,,,,,,,,


In [87]:
bookings_matrix = bookings_matrix.fillna(0)
bookings_matrix

hotel_id,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
501,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
756,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390183,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
390286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
390390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
390810,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
def find_similar_hotel_clusters(hotel_cluster): 
    item_user_bookings = bookings_matrix[hotel_cluster]
    hotel_correlation = bookings_matrix.corrwith(item_user_bookings)
    corr_clusterid = pd.DataFrame(hotel_correlation, columns=['correlation_value'])
    corr_clusterid.loc[~(corr_clusterid==0).all(axis=1)]
    corr_clusterid = corr_clusterid.round({'correlation_value': 2})
    return corr_clusterid.sort_values('correlation_value',ascending=False).head(n = 5)

In [89]:
find_similar_hotel_clusters(11)

Unnamed: 0_level_0,correlation_value
hotel_id,Unnamed: 1_level_1
11,1.0
82,0.13
20,0.12
33,0.12
46,0.11


## Collaborative Filtering using Cosine Similarity

In [90]:
users_count, hotels_count = new_df.user_id.unique().shape[0], new_df.hotel_id.unique().shape[0]

In [91]:
print("The no of unique users = ",users_count)
print("The no of hotel clusters = ",hotels_count)

The no of unique users =  3478
The no of hotel clusters =  100


In [92]:
def map_ids_to_idx(df):
    user_idx = np.arange(0, users_count)
    hotel_idx = np.arange(0, hotels_count)
    
    idx_user_df = pd.DataFrame(df.user_id.unique(), columns=["user"])
    idx_user_df["idx"] = user_idx
    
    idx_hotel_df = pd.DataFrame(df.hotel_id.unique(), columns=["hotel"])
    idx_hotel_df["idx"] = hotel_idx
    
    df["hotel_index"] = df["hotel_id"].map(idx_hotel_df.set_index('hotel')["idx"]).fillna(0)
    df["user_index"] = df["user_id"].map(idx_user_df.set_index('user')["idx"]).fillna(0)

    return df

In [93]:
new_df = map_ids_to_idx(new_df)
new_df

Unnamed: 0,user_id,hotel_id,booking,hotel_index,user_index
0,12,1,1,0,0
1,93,21,0,1,1
2,93,80,0,2,1
3,93,92,0,3,1
4,501,10,0,4,2
...,...,...,...,...,...
40806,391007,81,0,29,3477
40807,391007,85,0,31,3477
40808,391007,90,0,62,3477
40809,391007,93,0,86,3477


#### Splitting the data into training and testing

In [150]:
train_data, test_data = train_test_split(new_df, test_size = 0.3)

In [99]:
utility_matrix = np.zeros((users_count, hotels_count))
for row in new_df.itertuples():
    utility_matrix[row[5], row[4]] = row[3]
utility_matrix

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Creating utility matrix for training data and testing data

In [102]:
train_utility_matrix = np.zeros((users_count, hotels_count))
for row in train_data.itertuples():
    train_utility_matrix[row[5], row[4]] = row[3]
train_utility_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [152]:
test_utility_matrix = np.zeros((users_count, hotels_count))
for row in test_data.itertuples():
    test_utility_matrix[row[5], row[4]] = row[3]
test_utility_matrix

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Calculating the cosine similarity

In [107]:
similarity_btw_users = pairwise_distances(train_utility_matrix, metric = 'cosine')
similarity_btw_users

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

In [108]:
transpose_utility_matrix = train_utility_matrix.T
similarity_btw_items = pairwise_distances(transpose_utility_matrix, metric = 'cosine')
similarity_btw_items

array([[0.        , 0.93517963, 0.96934303, ..., 1.        , 0.93868607,
        1.        ],
       [0.93517963, 0.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [0.96934303, 1.        , 0.        , ..., 1.        , 0.94736842,
        1.        ],
       ...,
       [1.        , 1.        , 1.        , ..., 0.        , 1.        ,
        0.42264973],
       [0.93868607, 1.        , 0.94736842, ..., 1.        , 0.        ,
        1.        ],
       [1.        , 1.        , 1.        , ..., 0.42264973, 1.        ,
        0.        ]])

### Performing predictions based on cosine similarity

In [135]:
def user_prediction(matrix, similarity_measure):
    avg_user_booking = matrix.mean(axis = 1)
    diff_of_bookings = (matrix - avg_user_booking[:, np.newaxis])
    pred = avg_user_booking[:, np.newaxis] + similarity_measure.dot(diff_of_bookings) / np.array([np.abs(similarity_measure).sum(axis=1)]).T
    return pred

In [136]:
user_pred = user_prediction(train_utility_matrix, similarity_btw_users)

In [137]:
user_pred

array([[ 0.00292206,  0.00637331, -0.0077193 , ..., -0.01289617,
        -0.0077193 , -0.01232097],
       [ 0.00292206,  0.00637331, -0.0077193 , ..., -0.01289617,
        -0.0077193 , -0.01232097],
       [ 0.00292206,  0.00637331, -0.0077193 , ..., -0.01289617,
        -0.0077193 , -0.01232097],
       ...,
       [ 0.02304209,  0.02650356,  0.0125463 , ...,  0.00732606,
         0.01249335,  0.00790609],
       [ 0.00292206,  0.00637331, -0.0077193 , ..., -0.01289617,
        -0.0077193 , -0.01232097],
       [ 0.01289838,  0.01622255,  0.00251457, ..., -0.00257821,
         0.00268334, -0.00199359]])

In [138]:
def item_prediction(matrix, similarity_measure):
    pred = matrix.dot(similarity_measure) / np.array([np.abs(similarity_measure).sum(axis=1)])
    return pred

In [139]:
item_pred = item_prediction(train_utility_matrix, similarity_btw_items)

In [148]:
item_pred.shape

(3478, 100)

In [160]:
def rmse_value(predicted, test):
    predicted = predicted[test.nonzero()].flatten()
    test = test[test.nonzero()].flatten()
    return sqrt(mean_squared_error(predicted, test))

def mae_value(predicted, test):
    predicted = predicted[test.nonzero()].flatten()
    test = test[test.nonzero()].flatten()
    return sqrt(mean_absolute_error(predicted, test))

#### The RMSE and MAE of user based CF 

In [167]:
print("RMSE = ", rmse_value(user_pred, test_utility_matrix))
print("MAE = ", mae_value(user_pred, test_utility_matrix))

RMSE =  0.9578120490541507
MAE =  0.9781176746622685


#### The RMSE and MAE of item based CF

In [166]:
print("RMSE = ", rmse_value(item_pred, test_utility_matrix))
print("MAE = ", mae_value(item_pred, test_utility_matrix))

RMSE =  0.9654461535613889
MAE =  0.9820789266734439


In [None]:
https://github.com/ShrutiSavardekar/ExpediaHotelRecommendation.git