BUSINESS PROBLEM:

- To recommend users hotel based on their previous choice of ratings of the hotels

ML problem:

- The machine learning approach to this would in the form of recommendation system based on collaborative filtering

- Here we used the user based similarity to find the similar hotels for the customers

Recommendation for business:

- Check for the similar hotels, their functionality and works.
- Take into accounts the influx of customes and prepare accordingly.
- In hospitality industry "Customer is god" and should abide along the lines of it.
- Consider what the customers had to say about them 

# IMPORT REQUIRED LIBRARIES

In [1]:
from surprise import Dataset
from surprise import Reader, KNNWithMeans
from surprise.model_selection import cross_validate
import pandas as pd

READING RATINGS DATASET

In [2]:
rating = pd.read_csv('user_hotel_rating-1555730075105.csv')

In [3]:
rating.head(4)

Unnamed: 0,userid,Hotelid,OverallRating
0,user_78131,hotel_558,3
1,user_78131,hotel_603,2
2,user_78131,hotel_610,3
3,user_78131,hotel_574,3


In [4]:
rating.shape

(271935, 3)

READING NEW HOTELS DATA

In [5]:
hotel = pd.read_csv('NewHotels_CutstomerVisitsdata-1554810098964.csv')

In [6]:
print('The number of hotels present in the dataset are: {}'.format(rating.Hotelid.nunique()))

The number of hotels present in the dataset are: 130


In [7]:
print('The number of unique users present in the dataset are: {}'.format(rating.userid.nunique()))

The number of unique users present in the dataset are: 5010


In [8]:
print('The number of unique ratings present in the dataset are: {}'.format(rating.OverallRating.nunique()))

The number of unique ratings present in the dataset are: 5


In [9]:
no_of_users = 5010

In [10]:
## Preparing data to make it compatible for "Surprise" Package
reader = Reader(line_format = 'user item rating', rating_scale=(0, 5))

In [11]:
## Reader is a function which returns a object compaitable with surprise package
data = Dataset.load_from_df(rating, reader)

In [12]:
print (type(reader))
print (type(data))

<class 'surprise.reader.Reader'>
<class 'surprise.dataset.DatasetAutoFolds'>


In [13]:
data.df.head(4)

Unnamed: 0,userid,Hotelid,OverallRating
0,user_78131,hotel_558,3
1,user_78131,hotel_603,2
2,user_78131,hotel_610,3
3,user_78131,hotel_574,3


In [14]:
len(data.df)

271935

#### Simulation Parameters
-  Algorithm Type
-  User-Based vs Item-Based
-  Similarity Metric

In [15]:
sim_parameters = {'name': 'cosine',
               'user_based': True 
               }
algo = KNNWithMeans(sim_options=sim_parameters)

CROSS VALIDATION ACCURACIES

In [16]:
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=2, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 2 split(s).

                  Fold 1  Fold 2  Mean    Std     
RMSE (testset)    0.8562  0.8522  0.8542  0.0020  
MAE (testset)     0.6478  0.6460  0.6469  0.0009  
Fit time          29.39   29.44   29.42   0.02    
Test time         87.45   88.45   87.95   0.50    


{'test_rmse': array([0.85616079, 0.85220916]),
 'test_mae': array([0.64779968, 0.64601669]),
 'fit_time': (29.39468264579773, 29.436026096343994),
 'test_time': (87.44979977607727, 88.45021867752075)}

In [17]:
# Using full data for training
trainset = data.build_full_trainset()
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x222a1c66e80>

In [18]:
# Getting data points where predictions can be made
testset = trainset.build_anti_testset()

In [19]:
print (type(testset))

<class 'list'>


In [20]:
testset

[('user_78131', 'hotel_517', 4.257112913012301),
 ('user_78131', 'hotel_519', 4.257112913012301),
 ('user_78131', 'hotel_565', 4.257112913012301),
 ('user_78131', 'hotel_630', 4.257112913012301),
 ('user_78131', 'hotel_601', 4.257112913012301),
 ('user_78131', 'hotel_604', 4.257112913012301),
 ('user_78131', 'hotel_608', 4.257112913012301),
 ('user_78131', 'hotel_580', 4.257112913012301),
 ('user_78131', 'hotel_622', 4.257112913012301),
 ('user_78131', 'hotel_502', 4.257112913012301),
 ('user_78131', 'hotel_624', 4.257112913012301),
 ('user_78131', 'hotel_583', 4.257112913012301),
 ('user_78131', 'hotel_576', 4.257112913012301),
 ('user_78131', 'hotel_572', 4.257112913012301),
 ('user_78131', 'hotel_563', 4.257112913012301),
 ('user_78131', 'hotel_594', 4.257112913012301),
 ('user_78131', 'hotel_602', 4.257112913012301),
 ('user_78131', 'hotel_590', 4.257112913012301),
 ('user_78131', 'hotel_627', 4.257112913012301),
 ('user_78131', 'hotel_615', 4.257112913012301),
 ('user_78131', 'hot

In [21]:
print (len(testset))

379365


In [22]:
# Making predictions
predictions = algo.test(testset)

In [23]:
# Fetching top 5 predictions for each user
from collections import defaultdict
def get_top_n(predictions, n=5):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 5.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

from itertools import islice

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))


In [24]:
top_n = get_top_n(predictions, n=5)

In [25]:
top_n.items()

dict_items([('user_78131', [('hotel_630', 3.5946772383168364), ('hotel_573', 3.565223498684065), ('hotel_601', 3.552722776469489), ('hotel_519', 3.5332729232214977), ('hotel_507', 3.5087394085748658)]), ('user_21002', [('hotel_528', 3.6497634454363066), ('hotel_509', 3.567676582576768), ('hotel_567', 3.543997320682302), ('hotel_515', 3.518673821057046), ('hotel_607', 3.485752762561133)]), ('user_24128', [('hotel_527', 2.9814843494719305), ('hotel_507', 2.845373798910402), ('hotel_528', 2.8277404497544736), ('hotel_573', 2.8248025447586955), ('hotel_612', 2.7939738934827116)]), ('user_27174', [('hotel_515', 2.795939060557687), ('hotel_528', 2.676763609074791), ('hotel_527', 2.585288344180025), ('hotel_523', 2.5734505435000274), ('hotel_565', 2.5623881896390635)]), ('user_36448', [('hotel_573', 3.32702770773854), ('hotel_530', 3.272566506253966), ('hotel_527', 3.271566842845967), ('hotel_601', 3.228309856553227), ('hotel_528', 3.2051095852999945)]), ('user_44791', [('hotel_630', 2.642464

In [26]:
take(5, top_n.items())

[('user_78131',
  [('hotel_630', 3.5946772383168364),
   ('hotel_573', 3.565223498684065),
   ('hotel_601', 3.552722776469489),
   ('hotel_519', 3.5332729232214977),
   ('hotel_507', 3.5087394085748658)]),
 ('user_21002',
  [('hotel_528', 3.6497634454363066),
   ('hotel_509', 3.567676582576768),
   ('hotel_567', 3.543997320682302),
   ('hotel_515', 3.518673821057046),
   ('hotel_607', 3.485752762561133)]),
 ('user_24128',
  [('hotel_527', 2.9814843494719305),
   ('hotel_507', 2.845373798910402),
   ('hotel_528', 2.8277404497544736),
   ('hotel_573', 2.8248025447586955),
   ('hotel_612', 2.7939738934827116)]),
 ('user_27174',
  [('hotel_515', 2.795939060557687),
   ('hotel_528', 2.676763609074791),
   ('hotel_527', 2.585288344180025),
   ('hotel_523', 2.5734505435000274),
   ('hotel_565', 2.5623881896390635)]),
 ('user_36448',
  [('hotel_573', 3.32702770773854),
   ('hotel_530', 3.272566506253966),
   ('hotel_527', 3.271566842845967),
   ('hotel_601', 3.228309856553227),
   ('hotel_528'

In [27]:
# Printing top predictions
for uid, user_ratings in take(5,top_n.items()):
    print(uid, [iid for (iid, _) in user_ratings])

user_78131 ['hotel_630', 'hotel_573', 'hotel_601', 'hotel_519', 'hotel_507']
user_21002 ['hotel_528', 'hotel_509', 'hotel_567', 'hotel_515', 'hotel_607']
user_24128 ['hotel_527', 'hotel_507', 'hotel_528', 'hotel_573', 'hotel_612']
user_27174 ['hotel_515', 'hotel_528', 'hotel_527', 'hotel_523', 'hotel_565']
user_36448 ['hotel_573', 'hotel_530', 'hotel_527', 'hotel_601', 'hotel_528']


In [28]:
hotel.head(3)

Unnamed: 0,Hotelid,userid,reviewtext,Date,NoOfReaders,HelpfulToNoOfreaders,Value_rating,Rooms_rating,Location_rating,Cleanliness_rating,Checkin_rating,Service_rating,Businessservice_rating,AverageOverallRatingOfHotel,AveragePricing
0,hotel_104,hotel_4342,Awesome Everything about this resort was great...,"Jan 5, 2009",0,0,6,6,6,6,6,6,6,4.0,$227
1,hotel_104,hotel_2024,Convenient When I say Above Average I'm compa...,"Nov 10, 2008",0,0,4,4,6,5,4,5,1,4.0,$227
2,hotel_104,hotel_8720,Nice with its quirks Beach looking west,"Nov 3, 2008",0,0,3,4,4,5,6,4,4,4.0,$227


TESTING ON NEW EXISTING USER DATASET

In [29]:
hotel = hotel[['Hotelid','userid','AverageOverallRatingOfHotel']] 

In [30]:
## Preparing data to make it compatible for "Surprise" Package
reader = Reader(line_format = 'item user rating', rating_scale=(0, 5))

In [31]:
#Reader is a function which returns a object compaitable with surprise package
data_h = Dataset.load_from_df(hotel, reader)

In [32]:
data_h = data_h.build_full_trainset()

In [33]:
data_h = data_h.build_anti_testset()

In [34]:
print(type(data_h))

<class 'list'>


In [35]:
predictions1 = algo.test(data_h)

In [36]:
top_h = get_top_n(predictions1, n=5)

In [37]:
top_h.items()

dict_items([('hotel_104', [('hotel_4288', 3.2571129130123007), ('hotel_8047', 3.2571129130123007), ('hotel_9017', 3.2571129130123007), ('hotel_9399', 3.2571129130123007), ('hotel_5637', 3.2571129130123007)]), ('hotel_105', [('hotel_4342', 3.2571129130123007), ('hotel_2024', 3.2571129130123007), ('hotel_8720', 3.2571129130123007), ('hotel_3267', 3.2571129130123007), ('hotel_5419', 3.2571129130123007)]), ('hotel_114', [('hotel_4342', 3.2571129130123007), ('hotel_2024', 3.2571129130123007), ('hotel_8720', 3.2571129130123007), ('hotel_3267', 3.2571129130123007), ('hotel_5419', 3.2571129130123007)]), ('hotel_115', [('hotel_4342', 3.2571129130123007), ('hotel_2024', 3.2571129130123007), ('hotel_8720', 3.2571129130123007), ('hotel_3267', 3.2571129130123007), ('hotel_5419', 3.2571129130123007)]), ('hotel_119', [('hotel_4342', 3.2571129130123007), ('hotel_2024', 3.2571129130123007), ('hotel_8720', 3.2571129130123007), ('hotel_3267', 3.2571129130123007), ('hotel_5419', 3.2571129130123007)]), ('h

In [38]:
take(5, top_h.items())

[('hotel_104',
  [('hotel_4288', 3.2571129130123007),
   ('hotel_8047', 3.2571129130123007),
   ('hotel_9017', 3.2571129130123007),
   ('hotel_9399', 3.2571129130123007),
   ('hotel_5637', 3.2571129130123007)]),
 ('hotel_105',
  [('hotel_4342', 3.2571129130123007),
   ('hotel_2024', 3.2571129130123007),
   ('hotel_8720', 3.2571129130123007),
   ('hotel_3267', 3.2571129130123007),
   ('hotel_5419', 3.2571129130123007)]),
 ('hotel_114',
  [('hotel_4342', 3.2571129130123007),
   ('hotel_2024', 3.2571129130123007),
   ('hotel_8720', 3.2571129130123007),
   ('hotel_3267', 3.2571129130123007),
   ('hotel_5419', 3.2571129130123007)]),
 ('hotel_115',
  [('hotel_4342', 3.2571129130123007),
   ('hotel_2024', 3.2571129130123007),
   ('hotel_8720', 3.2571129130123007),
   ('hotel_3267', 3.2571129130123007),
   ('hotel_5419', 3.2571129130123007)]),
 ('hotel_119',
  [('hotel_4342', 3.2571129130123007),
   ('hotel_2024', 3.2571129130123007),
   ('hotel_8720', 3.2571129130123007),
   ('hotel_3267', 3.