In [256]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt
from surprise import SVD, accuracy
from surprise import Dataset, Reader, KNNBaseline, SVD, NMF, CoClustering
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

In [210]:
data = pd.read_csv("./dataset/train.csv", sep = ",", nrows = 100000)

In [211]:
data.head()

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
0,2014-08-11 07:46:59,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,0,3,2,50,628,1
1,2014-08-11 08:22:12,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,1,1,2,50,628,1
2,2014-08-11 08:24:33,2,3,66,348,48862,2234.2641,12,0,0,...,0,1,8250,1,0,1,2,50,628,1
3,2014-08-09 18:05:16,2,3,66,442,35390,913.1932,93,0,0,...,0,1,14984,1,0,1,2,50,1457,80
4,2014-08-09 18:08:18,2,3,66,442,35390,913.6259,93,0,0,...,0,1,14984,1,0,1,2,50,1457,21


In [212]:
data.shape

(100000, 24)

In [213]:
new_df = data[['user_id', 'hotel_cluster', 'is_booking']]

In [214]:
new_df.head()

Unnamed: 0,user_id,hotel_cluster,is_booking
0,12,1,0
1,12,1,1
2,12,1,0
3,93,80,0
4,93,21,0


In [215]:
new_df.columns = ['user_id', 'hotel_id', 'booking']
new_df = new_df.sort_values(['user_id', 'hotel_id'])
new_df = new_df.reset_index().drop('index', axis = 1)
new_df.head()

Unnamed: 0,user_id,hotel_id,booking
0,12,1,0
1,12,1,1
2,12,1,0
3,93,21,0
4,93,80,0


In [216]:
booked_hotels = new_df.groupby(['user_id', 'hotel_id']).booking.transform(max)
new_df = new_df.loc[new_df.booking == booked_hotels]
new_df.drop_duplicates(keep = 'first', inplace = True)
new_df= new_df.reset_index().drop('index', axis = 1)
new_df.head()

Unnamed: 0,user_id,hotel_id,booking
0,12,1,1
1,93,21,0
2,93,80,0
3,93,92,0
4,501,10,0


## Calculated each of the hotel cluster's mean

In [217]:
bookings = pd.DataFrame(new_df.groupby(['hotel_id'])['booking'].mean())
bookings

Unnamed: 0_level_0,booking
hotel_id,Unnamed: 1_level_1
0,0.117794
1,0.252396
2,0.209877
3,0.113043
4,0.184035
...,...
95,0.229310
96,0.076037
97,0.149160
98,0.201117


In [218]:
bookings_matrix = new_df.pivot_table(index = 'user_id', columns = 'hotel_id', values = 'booking')
bookings_matrix.head()

hotel_id,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12,,1.0,,,,,,,,,...,,,,,,,,,,
93,,,,,,,,,,,...,,,0.0,,,,,,,
501,,,,,,,,,,,...,,,,,,,,,0.0,
756,,,1.0,,,,,,,,...,,,,,0.0,,,,,
776,,,,,,,,,,,...,,,,,,,,,,


In [219]:
bookings_matrix = bookings_matrix.fillna(0)
bookings_matrix

hotel_id,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
501,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
756,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390183,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
390286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
390390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
390810,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [220]:
def find_similar_hotel_clusters(hotel_cluster): 
    item_user_bookings = bookings_matrix[hotel_cluster]
    hotel_correlation = bookings_matrix.corrwith(item_user_bookings)
    corr_clusterid = pd.DataFrame(hotel_correlation, columns=['correlation_value'])
    corr_clusterid.loc[~(corr_clusterid==0).all(axis=1)]
    corr_clusterid = corr_clusterid.round({'correlation_value': 2})
    return corr_clusterid.sort_values('correlation_value',ascending=False).head(n = 5)

In [221]:
find_similar_hotel_clusters(11)

Unnamed: 0_level_0,correlation_value
hotel_id,Unnamed: 1_level_1
11,1.0
82,0.13
20,0.12
33,0.12
46,0.11


## Collaborative Filtering using Cosine Similarity

In [222]:
users_count, hotels_count = new_df.user_id.unique().shape[0], new_df.hotel_id.unique().shape[0]

In [223]:
print("The no of unique users = ",users_count)
print("The no of hotel clusters = ",hotels_count)

The no of unique users =  3478
The no of hotel clusters =  100


In [224]:
def map_ids_to_idx(df):
    user_idx = np.arange(0, users_count)
    hotel_idx = np.arange(0, hotels_count)
    
    idx_user_df = pd.DataFrame(df.user_id.unique(), columns=["user"])
    idx_user_df["idx"] = user_idx
    
    idx_hotel_df = pd.DataFrame(df.hotel_id.unique(), columns=["hotel"])
    idx_hotel_df["idx"] = hotel_idx
    
    df["hotel_index"] = df["hotel_id"].map(idx_hotel_df.set_index('hotel')["idx"]).fillna(0)
    df["user_index"] = df["user_id"].map(idx_user_df.set_index('user')["idx"]).fillna(0)

    return df

In [225]:
new_df = map_ids_to_idx(new_df)
new_df

Unnamed: 0,user_id,hotel_id,booking,hotel_index,user_index
0,12,1,1,0,0
1,93,21,0,1,1
2,93,80,0,2,1
3,93,92,0,3,1
4,501,10,0,4,2
...,...,...,...,...,...
40806,391007,81,0,29,3477
40807,391007,85,0,31,3477
40808,391007,90,0,62,3477
40809,391007,93,0,86,3477


#### Splitting the data into training and testing

In [226]:
train_data, test_data = train_test_split(new_df, test_size = 0.3)

In [227]:
utility_matrix = np.zeros((users_count, hotels_count))
for row in new_df.itertuples():
    utility_matrix[row[5], row[4]] = row[3]
utility_matrix

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Creating utility matrix for training data and testing data

In [228]:
train_utility_matrix = np.zeros((users_count, hotels_count))
for row in train_data.itertuples():
    train_utility_matrix[row[5], row[4]] = row[3]
train_utility_matrix

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [229]:
test_utility_matrix = np.zeros((users_count, hotels_count))
for row in test_data.itertuples():
    test_utility_matrix[row[5], row[4]] = row[3]
test_utility_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Calculating the cosine similarity

In [230]:
similarity_btw_users = pairwise_distances(train_utility_matrix, metric = 'cosine')
similarity_btw_users

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

In [231]:
transpose_utility_matrix = train_utility_matrix.T
similarity_btw_items = pairwise_distances(transpose_utility_matrix, metric = 'cosine')
similarity_btw_items

array([[0.        , 0.98371344, 1.        , ..., 1.        , 0.97200537,
        0.95037083],
       [0.98371344, 0.        , 1.        , ..., 0.9122942 , 1.        ,
        1.        ],
       [1.        , 1.        , 0.        , ..., 1.        , 0.94086876,
        0.89517152],
       ...,
       [1.        , 0.9122942 , 1.        , ..., 0.        , 1.        ,
        0.73273876],
       [0.97200537, 1.        , 0.94086876, ..., 1.        , 0.        ,
        1.        ],
       [0.95037083, 1.        , 0.89517152, ..., 0.73273876, 1.        ,
        0.        ]])

### Performing predictions based on cosine similarity

In [232]:
def user_prediction(matrix, similarity_measure):
    avg_user_booking = matrix.mean(axis = 1)
    diff_of_bookings = (matrix - avg_user_booking[:, np.newaxis])
    pred = avg_user_booking[:, np.newaxis] + similarity_measure.dot(diff_of_bookings) / np.array([np.abs(similarity_measure).sum(axis=1)]).T
    return pred

In [233]:
user_pred = user_prediction(train_utility_matrix, similarity_btw_users)

In [234]:
user_pred

array([[ 0.0025904 ,  0.01582858,  0.00081123, ..., -0.00238726,
         0.00335047, -0.00113901],
       [ 0.00355191,  0.00556514, -0.00939028, ..., -0.01255393,
        -0.00680184, -0.0111159 ],
       [ 0.00355191,  0.00556514, -0.00939028, ..., -0.01255393,
        -0.00680184, -0.0111159 ],
       ...,
       [ 0.0237431 ,  0.02587444,  0.01080289, ...,  0.00761468,
         0.01341143,  0.00906387],
       [ 0.00355191,  0.00556514, -0.00939028, ..., -0.01255393,
        -0.00680184, -0.0111159 ],
       [ 0.00355191,  0.00556514, -0.00939028, ..., -0.01255393,
        -0.00680184, -0.0111159 ]])

In [235]:
def item_prediction(matrix, similarity_measure):
    pred = matrix.dot(similarity_measure) / np.array([np.abs(similarity_measure).sum(axis=1)])
    return pred

In [236]:
item_pred = item_prediction(train_utility_matrix, similarity_btw_items)

In [237]:
item_pred.shape

(3478, 100)

In [238]:
def rmse_value(predicted, test):
    predicted = predicted[test.nonzero()].flatten()
    test = test[test.nonzero()].flatten()
    return sqrt(mean_squared_error(predicted, test))

def mae_value(predicted, test):
    predicted = predicted[test.nonzero()].flatten()
    test = test[test.nonzero()].flatten()
    return sqrt(mean_absolute_error(predicted, test))

#### The RMSE and MAE of user based CF 

In [239]:
print("RMSE = ", rmse_value(user_pred, test_utility_matrix))
print("MAE = ", mae_value(user_pred, test_utility_matrix))

RMSE =  0.9603272942070055
MAE =  0.9794790510859709


#### The RMSE and MAE of item based CF

In [240]:
print("RMSE = ", rmse_value(item_pred, test_utility_matrix))
print("MAE = ", mae_value(item_pred, test_utility_matrix))

RMSE =  0.9650291353355701
MAE =  0.981891477437871


In [241]:
reader = Reader()
bookings_data = Dataset.load_from_df(new_df[['user_id', 'hotel_id', 'booking']], reader)

In [242]:
algorithm_1 = KNNBaseline()
res_1 = cross_validate(algorithm_1, bookings_data, measures=['RMSE', 'MAE'], cv = 5, verbose=True)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9165  0.9204  0.9133  0.9158  0.9190  0.9170  0.0025  
MAE (testset)     0.8399  0.8471  0.8341  0.8388  0.8446  0.8409  0.0046  
Fit time          1.36    1.39    1.45    1.31    1.79    1.46    0.17    
Test time         4.04    4.39    4.82    4.41    4.36    4.40    0.25    


In [243]:
algorithm_2 = SVD()
res_2 = cross_validate(algorithm_2, bookings_data, measures=['RMSE', 'MAE'], cv = 5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9159  0.9186  0.9178  0.9170  0.9158  0.9170  0.0011  
MAE (testset)     0.8388  0.8438  0.8423  0.8408  0.8388  0.8409  0.0020  
Fit time          2.68    2.46    2.74    2.72    2.93    2.71    0.15    
Test time         0.08    0.07    0.07    0.07    0.06    0.07    0.01    


In [244]:
algorithm_3 = CoClustering()
res_3 = cross_validate(algorithm_3, bookings_data, measures=['RMSE', 'MAE'], cv = 5, verbose=True)

Evaluating RMSE, MAE of algorithm CoClustering on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9175  0.9150  0.9174  0.9171  0.9190  0.9172  0.0013  
MAE (testset)     0.8418  0.8370  0.8416  0.8412  0.8445  0.8412  0.0024  
Fit time          1.56    1.66    1.92    1.41    1.74    1.66    0.17    
Test time         0.04    0.04    0.18    0.06    0.10    0.08    0.05    


In [245]:
def avg_rmse(result):
    return sum(result['test_rmse']) / 5

In [246]:
avg_rmse(res_1)

0.9170030370209437

In [247]:
avg_rmse(res_2)

0.9170057997852871

In [248]:
avg_rmse(res_3)

0.917205604803805

In [251]:
param_grid = {'n_epochs': [5, 10, 30], 'lr_all': [0.002, 0.006],
              'reg_all': [0.05, 0.4, 0.6], 'n_factors' : [10, 20, 30]}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)

gs.fit(bookings_data)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.917004376325073
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 10}


In [253]:
train_bookings_data, test_bookings_data = train_test_split(bookings_data, test_size = 0.3)

In [254]:
svd_algo = SVD(n_factors = 10, n_epochs = 5, lr_all = 0.002, reg_all = 0.05)
svd_algo.fit(train_bookings_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x20807c2cb88>

In [261]:
predictions = svd_algo.test(test_bookings_data)
print("Accuracy = ",accuracy.rmse(predictions))

RMSE: 0.9158
Accuracy =  0.9158037477594877


In [262]:
predictions[0:5]

[Prediction(uid=186793, iid=72, r_ui=0.0, est=1, details={'was_impossible': False}),
 Prediction(uid=185616, iid=28, r_ui=0.0, est=1, details={'was_impossible': False}),
 Prediction(uid=337490, iid=56, r_ui=1.0, est=1, details={'was_impossible': False}),
 Prediction(uid=6330, iid=36, r_ui=0.0, est=1, details={'was_impossible': False}),
 Prediction(uid=204270, iid=65, r_ui=1.0, est=1, details={'was_impossible': False})]

In [265]:
test_bookings = []
for pred in predictions:
    test_bookings.append(pred[3])

In [286]:
final_test_bookings = pd.DataFrame(predictions)

In [287]:
final_test_bookings =final_test_bookings.drop(columns= ['r_ui', 'details'])

In [288]:
final_test_bookings.columns = ['user_id', 'hotel_id', 'is_booking']

In [289]:
final_test_bookings.head()

Unnamed: 0,user_id,hotel_id,is_booking
0,186793,72,1
1,185616,28,1
2,337490,56,1
3,6330,36,1
4,204270,65,1
