<h1>Importing the raw dataset from a CSV file</h1>

In [1]:
import pandas as pd

# Load the raw review dataset from a CSV file given by a relative path.
df=pd.read_csv("Hotel_Reviews.csv")

In [2]:
# Print the column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515738 entries, 0 to 515737
Data columns (total 17 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   Hotel_Address                               515738 non-null  object 
 1   Additional_Number_of_Scoring                515738 non-null  int64  
 2   Review_Date                                 515738 non-null  object 
 3   Average_Score                               515738 non-null  float64
 4   Hotel_Name                                  515738 non-null  object 
 5   Reviewer_Nationality                        515738 non-null  object 
 6   Negative_Review                             515738 non-null  object 
 7   Review_Total_Negative_Word_Counts           515738 non-null  int64  
 8   Total_Number_of_Reviews                     515738 non-null  int64  
 9   Positive_Review                             515738 non-null  object 
 

<h1>Dropping unnecessary Columns</h1>

In [3]:
# Columns treated as unnecessary for the steps that follow.
unusedColumns = [
    "Additional_Number_of_Scoring",
    "Reviewer_Nationality",
    "Negative_Review",
    "Review_Total_Negative_Word_Counts",
    "Total_Number_of_Reviews",
    "Review_Total_Positive_Word_Counts",
    "Total_Number_of_Reviews_Reviewer_Has_Given",
    "days_since_review",
    "lat",
    "lng",
]

# Reassign instead of mutating in place, and skip columns that are already gone,
# so that re-running this cell does not raise KeyError.
df = df.drop(columns=unusedColumns, errors="ignore")

<h1>Making a new Column to store Country</h1>

In [4]:
# Swap each full country name in the address for a short code, so that the
# country ends up as the final space-separated token of the address.
countryCodes = {
    "Netherlands": "NL",
    "United Kingdom": "UK",
    "France": "FR",
    "Spain": "ES",
    "Italy": "IT",
    "Austria": "AT",
}
for countryName, countryCode in countryCodes.items():
    df.Hotel_Address = df.Hotel_Address.str.replace(countryName, countryCode)

# The last token of the rewritten address is stored as the hotel's country.
df["countries"] = df.Hotel_Address.apply(lambda address: address.split(" ")[-1])

In [5]:
# Lower-case every positive review so later text matching is case-insensitive.
df["Positive_Review"] = df["Positive_Review"].map(str.lower)

<h1>Cleaning the Tags column by removing punctuation ( {} , [] ; . )</h1>

In [6]:
from ast import literal_eval

def taglist(column):
    """Turn the raw tag cell of one row into a single plain string.

    Parameters
    ----------
    column : a one-element row (pandas Series when used with
        ``DataFrame.apply(..., axis=1)``, or any sequence) whose first
        element is the tag cell.

    Returns
    -------
    The tags joined into one string when the cell is the text form of a list
    (parsed with ``literal_eval``); the cell itself, unchanged, when it
    already is a list.
    """
    # Take the first element by position. Plain `column[0]` on a Series with
    # string labels is a deprecated positional fallback, so use .iloc when
    # it is available.
    cell = column.iloc[0] if hasattr(column, "iloc") else column[0]
    if isinstance(cell, list):
        return cell
    # literal_eval rebuilds the list from its text form; joining the items
    # drops the brackets, quotes and commas.
    return "".join(literal_eval(cell))

# Flatten every tag cell to plain text with taglist, then lower-case the result.
df["Tags"] = df[["Tags"]].apply(taglist, axis=1).map(str.lower)

  column=column[0]


<h1>Randomly assigning star ratings to the hotels</h1>

In [7]:
import random
hotelNames=df['Hotel_Name']

# dict.fromkeys removes duplicates in a single pass while keeping the order of
# first appearance; a list-membership test per review row would rescan the
# whole list every time.
allHotels=list(dict.fromkeys(hotelNames))               #list of all distinct hotel names

# NOTE: no random seed is set, so the star ratings differ on every run.
hotelStars={name: random.randint(3,5) for name in allHotels}    #dictionary(hashmap) of hotel name -> stars (3 to 5)

df['Stars']=df['Hotel_Name'].map(hotelStars)        #mapping the stars from the hotel name into a new column

In [8]:
# Preview the first rows to check the new countries and Stars columns.
df.head()

Unnamed: 0,Hotel_Address,Review_Date,Average_Score,Hotel_Name,Positive_Review,Reviewer_Score,Tags,countries,Stars
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam NL,8/3/2017,7.7,Hotel Arena,only the park outside of the hotel was beauti...,2.9,leisure trip couple duplex double room sta...,NL,4
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam NL,8/3/2017,7.7,Hotel Arena,no real complaints the hotel was great great ...,7.5,leisure trip couple duplex double room sta...,NL,4
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam NL,7/31/2017,7.7,Hotel Arena,location was good and staff were ok it is cut...,7.1,leisure trip family with young children dup...,NL,4
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam NL,7/31/2017,7.7,Hotel Arena,great location in nice surroundings the bar a...,3.8,leisure trip solo traveler duplex double ro...,NL,4
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam NL,7/24/2017,7.7,Hotel Arena,amazing location and building romantic setting,6.7,leisure trip couple suite stayed 2 nights ...,NL,4


In [9]:
# Plain assignment: hotelData is a second name for the same DataFrame object as
# df (no copy is made), so columns added through hotelData also appear on df.
hotelData=df

<h1>Adding Hotel Prices per night</h1>

In [10]:
import random

# The first review row of each hotel supplies that hotel's star rating. Using
# drop_duplicates avoids a Python-level lookup per review row and does not
# depend on the frame having a 0..n-1 index.
firstRows=hotelData.drop_duplicates("Hotel_Name")
hotel_dict=dict(zip(firstRows["Hotel_Name"],firstRows["Stars"]))    #dictionary of hotel name -> stars

# Price band per star rating as (low, high), both inclusive; the random draw
# from the band is multiplied by 100 to give the price.
priceBands={5:(100,150),4:(60,100),3:(20,60)}

hotelPrices={}                          #dict of hotel name -> price based on the stars rating
for hotelName,star in hotel_dict.items():
    band=priceBands.get(star)
    if band is not None:                #hotels with any other rating get no price (NaN after the map below)
        hotelPrices[hotelName]=random.randint(*band)*100

hotelData["Price"]=hotelData["Hotel_Name"].map(hotelPrices)         #mapping the prices according to hotel name into a new column

In [11]:
# Preview the first rows to check the new Price column.
hotelData.head()

Unnamed: 0,Hotel_Address,Review_Date,Average_Score,Hotel_Name,Positive_Review,Reviewer_Score,Tags,countries,Stars,Price
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam NL,8/3/2017,7.7,Hotel Arena,only the park outside of the hotel was beauti...,2.9,leisure trip couple duplex double room sta...,NL,4,8300
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam NL,8/3/2017,7.7,Hotel Arena,no real complaints the hotel was great great ...,7.5,leisure trip couple duplex double room sta...,NL,4,8300
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam NL,7/31/2017,7.7,Hotel Arena,location was good and staff were ok it is cut...,7.1,leisure trip family with young children dup...,NL,4,8300
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam NL,7/31/2017,7.7,Hotel Arena,great location in nice surroundings the bar a...,3.8,leisure trip solo traveler duplex double ro...,NL,4,8300
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam NL,7/24/2017,7.7,Hotel Arena,amazing location and building romantic setting,6.7,leisure trip couple suite stayed 2 nights ...,NL,4,8300


<h1>Adding a new column for Amenities like AC, Gym, etc.</h1>

In [12]:
import random

# Distinct hotel names in order of first appearance. dict.fromkeys does this in
# one pass instead of rescanning a growing list for every review row.
allHotels=list(dict.fromkeys(hotelData['Hotel_Name']))

# Sample pool of basic amenities that hotels are assigned from.
amenities=[
    "Wifi",
    "TV",
    "Air Conditioning",
    "Room Service",
    "Gym",
    "Swimming Pool",
    "Parking",
    "Fine Dining",
    "BathTub",
    "Spa",
]

def randomAmen(pool=None, minCount=2, maxCount=5):
    """Pick a random selection of distinct amenities.

    Parameters
    ----------
    pool : sequence to pick from; defaults to the module-level ``amenities`` list.
    minCount, maxCount : inclusive bounds for how many items are picked
        (2 to 5 by default). Both are capped at the pool size so a small
        pool does not make ``random.sample`` raise.

    Returns
    -------
    A new list of distinct items from ``pool``, in random order.
    """
    if pool is None:
        pool=amenities
    upper=min(maxCount,len(pool))
    lower=min(minCount,upper)
    count=random.randint(lower,upper)
    return random.sample(pool,count)

# Give every hotel its own random amenity list (2 to 5 items each) ...
hotelAmen={hotel: randomAmen() for hotel in allHotels}

# ... and copy that list onto each of the hotel's review rows via the hotel name.
hotelData['Amenities']=hotelData['Hotel_Name'].map(hotelAmen)

In [13]:
# Preview the first rows to check the new Amenities column.
hotelData.head()

Unnamed: 0,Hotel_Address,Review_Date,Average_Score,Hotel_Name,Positive_Review,Reviewer_Score,Tags,countries,Stars,Price,Amenities
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam NL,8/3/2017,7.7,Hotel Arena,only the park outside of the hotel was beauti...,2.9,leisure trip couple duplex double room sta...,NL,4,8300,"[Air Conditioning, Parking]"
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam NL,8/3/2017,7.7,Hotel Arena,no real complaints the hotel was great great ...,7.5,leisure trip couple duplex double room sta...,NL,4,8300,"[Air Conditioning, Parking]"
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam NL,7/31/2017,7.7,Hotel Arena,location was good and staff were ok it is cut...,7.1,leisure trip family with young children dup...,NL,4,8300,"[Air Conditioning, Parking]"
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam NL,7/31/2017,7.7,Hotel Arena,great location in nice surroundings the bar a...,3.8,leisure trip solo traveler duplex double ro...,NL,4,8300,"[Air Conditioning, Parking]"
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam NL,7/24/2017,7.7,Hotel Arena,amazing location and building romantic setting,6.7,leisure trip couple suite stayed 2 nights ...,NL,4,8300,"[Air Conditioning, Parking]"


<h1>Creating a new Column for NLP process by concatenating Amenities, Tags, Positive Review Comments</h1>

In [14]:
# Space-joined, lower-cased amenity text per row, held as a standalone Series.
amenText=hotelData['Amenities'].map(lambda items: " ".join(items).lower())

# Tags, positive review text and amenity text are concatenated into the single
# column that the text-matching step works on.
hotelData['Words']=hotelData['Tags']+" "+hotelData['Positive_Review']+" "+amenText

In [15]:
reviewDates=pd.to_datetime(hotelData['Review_Date'])        #parse the date strings into datetime values
hotelData['Review_Date']=reviewDates
hotelData['isRecent']=reviewDates.dt.year.eq(2017)          #mask column: True when the review was written in 2017

<h1>Cleaning is Finished for the Dataset</h1>

In [16]:
# Preview the first rows of the fully prepared frame (Words and isRecent included).
hotelData.head()

Unnamed: 0,Hotel_Address,Review_Date,Average_Score,Hotel_Name,Positive_Review,Reviewer_Score,Tags,countries,Stars,Price,Amenities,Words,isRecent
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam NL,2017-08-03,7.7,Hotel Arena,only the park outside of the hotel was beauti...,2.9,leisure trip couple duplex double room sta...,NL,4,8300,"[Air Conditioning, Parking]",leisure trip couple duplex double room sta...,True
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam NL,2017-08-03,7.7,Hotel Arena,no real complaints the hotel was great great ...,7.5,leisure trip couple duplex double room sta...,NL,4,8300,"[Air Conditioning, Parking]",leisure trip couple duplex double room sta...,True
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam NL,2017-07-31,7.7,Hotel Arena,location was good and staff were ok it is cut...,7.1,leisure trip family with young children dup...,NL,4,8300,"[Air Conditioning, Parking]",leisure trip family with young children dup...,True
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam NL,2017-07-31,7.7,Hotel Arena,great location in nice surroundings the bar a...,3.8,leisure trip solo traveler duplex double ro...,NL,4,8300,"[Air Conditioning, Parking]",leisure trip solo traveler duplex double ro...,True
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam NL,2017-07-24,7.7,Hotel Arena,amazing location and building romantic setting,6.7,leisure trip couple suite stayed 2 nights ...,NL,4,8300,"[Air Conditioning, Parking]",leisure trip couple suite stayed 2 nights ...,True


<h1>Making the function that returns recommended hotels based on parameters</h1>

In [17]:
import pandas as pd
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# NOTE(review): this rebinds hotelData to a previously pickled object, replacing
# the frame built in the cells above; the file name also differs from the
# "hotelModelPickle.pkl" written by the export cell - confirm this is intended.
# pickle can execute arbitrary code while loading, so only open trusted files.
with open("hotelModel2.pkl","rb") as modelFile:     #the with-block closes the file handle after loading
    hotelData=pickle.load(modelFile)

def findMyHotel4(df, country, sortBy, stars, range, query):
    """Return up to five hotels that pass the filters, ranked against a text query.

    Parameters
    ----------
    df : DataFrame with the columns 'countries', 'Stars', 'Price',
        'Hotel_Name', 'Words', 'Average_Score' and 'Reviewer_Score'.
    country : text matched case-insensitively against the 'countries' column
        (treated as a pattern by ``str.contains``).
    sortBy : how the result is ordered - 0 for Average_Score (high to low),
        1 for Reviewer_Score (high to low), 2 for Price (low to high).
    stars : 3, 4 or 5 keeps only hotels with that rating; any other value
        (for example 0) leaves the rating unfiltered.
    range : price range as (min, max), both inclusive. The name shadows the
        builtin inside this function; it is kept so existing keyword calls work.
    query : free text describing the kind of room/stay the user wants.

    Returns
    -------
    DataFrame with at most five rows, one per hotel. An empty frame when no
    hotel passes the filters. None (after printing "Wrong filter") when
    sortBy is not 0, 1 or 2.
    """
    #sort order - checked first so an invalid choice fails before any text matching is done
    if sortBy==0:
        sortColumn,sortAscending='Average_Score',False          #sorting based on hotel Rating
    elif sortBy==1:
        sortColumn,sortAscending='Reviewer_Score',False         #sorting based on Reviewer Score
    elif sortBy==2:
        sortColumn,sortAscending='Price',True                   #sorting based on price
    else:
        print("Wrong filter")
        return None

    #country
    countryMask=df["countries"].str.contains(country,case=False,na=False)   #rows with a missing country never match

    #stars
    if stars in (3,4,5):
        starMask=df['Stars']==stars
    else:
        starMask=pd.Series(True,index=df.index)         #all-True mask on df's own index so it aligns with the other masks

    #price
    lowPrice,highPrice=range[0],range[1]
    priceMask=df['Price'].between(lowPrice,highPrice)   #inclusive on both ends

    #filter
    filterDf=df[countryMask & starMask & priceMask].drop_duplicates(['Hotel_Name'])    #one row per hotel
    if filterDf.empty:
        return filterDf                                 #nothing to rank; the vectorizer cannot be fitted on zero documents

    #process text
    def processText(text):
        """Lower-case, tokenize, drop English stop words and lemmatize the text."""
        stopWords=set(stopwords.words("english"))
        words=word_tokenize(text.lower())               #lower-case first: the stop-word list is lower-case
        lemmat=WordNetLemmatizer()
        return " ".join(lemmat.lemmatize(word) for word in words if word not in stopWords)

    queryText=processText(query)

    #creating the vectors and finding cos product
    tfidf=TfidfVectorizer(stop_words="english")
    tfMatrix=tfidf.fit_transform(filterDf['Words'])             #one TF-IDF vector per remaining hotel
    queryVector=tfidf.transform([queryText])                    #the query in the same vector space
    cosProd=cosine_similarity(queryVector,tfMatrix).flatten()   #similarity of the query to every hotel
    hotelsFound=cosProd.argsort()[-100:][::-1]                  #positions of the (up to) 100 most similar hotels, best first
    resultDf=filterDf.iloc[hotelsFound]

    #order the best matches as requested and keep the top 5
    return resultDf.sort_values(sortColumn,ascending=sortAscending).head(5)

#sample call: 5-star hotels in the UK priced from 1200 to 1500, sorted by price, for a business-trip query
findMyHotel4(
    df=hotelData,
    country="UK",
    sortBy=2,
    stars=5,
    range=[1200,1500],
    query="i am going on a business trip",
)

Unnamed: 0,Hotel_Address,Average_Score,Reviewer_Score,Hotel_Name,Positive_Review,Tags,Words,countries,Stars,Price,Amenities,ReviewDate,isRecent
54026,15 Cromwell Place Kensington and Chelsea Londo...,8.6,10.0,The Pelham Starhotels Collezione,this was in a great location museums on the c...,leisure trip group deluxe double or twin ro...,leisure trip group deluxe double or twin ro...,UK,5,1210,"[Fine Dining, Spa]",2017-07-27,True
132593,291 Greenwich High Road Greenwich Greenwich Lo...,8.2,7.9,Innkeeper s Lodge London Greenwich,fantastic location opposite greenwich market ...,leisure trip couple standard double room s...,leisure trip couple standard double room s...,UK,5,1210,"[Swimming Pool, Spa, Wifi, Gym]",2017-08-02,False
18445,10 Carlisle Street Westminster Borough London ...,9.0,9.6,The Nadler Soho,no positive,leisure trip family with young children nad...,leisure trip family with young children nad...,UK,5,1220,"[TV, Wifi, Fine Dining, Gym, Air Conditioning]",2017-08-02,True
70751,17 Nottingham Place Westminster Borough London...,7.6,4.6,Hotel La Place,good access to tube could leave luggage there...,leisure trip solo traveler single room sta...,leisure trip solo traveler single room sta...,UK,5,1220,"[Spa, Parking, Gym, Fine Dining]",2017-08-01,False
240501,7 Western Gateway Royal Victoria Dock Newham L...,8.5,9.2,Novotel London Excel,no positive,leisure trip group standard queen room with...,leisure trip group standard queen room with...,UK,5,1220,"[Spa, Wifi]",2017-08-03,False


<h1>Function to check precision of the model</h1>

In [18]:
import pandas as pd
import nltk            #natural language
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def findHotel(query):
    """Return the names of the hotels that best match a text query.

    Works on the module-level ``hotelData`` frame with no country, star or
    price filtering.

    Parameters
    ----------
    query : free text describing what the user is looking for.

    Returns
    -------
    Series of hotel names (at most 100) with a fresh 0..n-1 index, ordered
    from most to least similar to the query.
    """
    #one row per hotel: after sorting, the first row of each hotel is its highest-scored review
    filterDf=hotelData.sort_values(['Reviewer_Score','Hotel_Name'],ascending=[False,True]).drop_duplicates('Hotel_Name',keep='first')

    #process text
    def processText(text):
        """Lower-case, tokenize, drop English stop words and lemmatize the text."""
        stopWords=set(stopwords.words("english"))
        words=word_tokenize(text.lower())               #lower-case first: the stop-word list is lower-case
        lemmat=WordNetLemmatizer()
        return " ".join(lemmat.lemmatize(word) for word in words if word not in stopWords)

    queryText=processText(query)

    #creating the vectors and finding cos product
    tfidf=TfidfVectorizer(stop_words="english")
    tfMatrix=tfidf.fit_transform(filterDf['Words'])             #one TF-IDF vector per hotel
    queryVector=tfidf.transform([queryText])                    #the query in the same vector space
    cosProd=cosine_similarity(queryVector,tfMatrix).flatten()   #similarity of the query to every hotel
    hotelsFound=cosProd.argsort()[-100:][::-1]                  #positions of the (up to) 100 most similar hotels, best first

    return filterDf.iloc[hotelsFound].reset_index(drop=True)['Hotel_Name']

#hotel names ranked for a sample query
hotelNames=findHotel(query="spacious couple room")
print(hotelNames)

0                      Catalonia Catedral
1                        The Lanesborough
2                  IH Hotels Milano Gioia
3                          Catalonia Born
4                               Ozo Hotel
                     ...                 
95                     Hotel Montalembert
96         Inntel Hotels Amsterdam Centre
97    Golden Tulip Bercy Gare de Lyon 209
98                   Rainers Hotel Vienna
99                             Park Hotel
Name: Hotel_Name, Length: 100, dtype: object


<h1>Checking the Precision, Recall and F1 Score</h1>

In [19]:
from sklearn.metrics import precision_score, recall_score, f1_score
import pickle

# The with-blocks close each pickle file after loading.
# pickle can execute arbitrary code while loading, so only open trusted files.
with open("truthHotels.pkl","rb") as truthFile:
    truthHotels=pickle.load(truthFile)                  #importing ground truth hotels
with open("allQueries.pkl","rb") as queryFile:
    allQueries=pickle.load(queryFile)                   #importing sample queries

yTrue=[]
yPred=[]

# Each entry of allQueries is indexed as (key into truthHotels, query text).
# NOTE(review): the scores below compare yTrue and yPred position by position,
# so they are only meaningful if every truthHotels entry lists its hotels in
# the same order and count as findHotel returns them - confirm against the data.
for i in allQueries:
    hotelsFound=findHotel(i[1])
    yTrue.extend(truthHotels[i[0]])                     #actual hotels for the query
    yPred.extend(hotelsFound)                           #predicted hotels for the query

# zero_division=0 scores a label with no predicted/true samples as 0 explicitly;
# that is the value the default setting already uses, but without the warning.
scoreOptions={"average":"weighted","zero_division":0}

precision=precision_score(yTrue,yPred,**scoreOptions)*100       #values are scaled to percentages
recall=recall_score(yTrue,yPred,**scoreOptions)*100
f1=f1_score(yTrue,yPred,**scoreOptions)*100

print("Precision : ",f"{precision:.2f}","\nRecall : ",f"{recall:.2f}","\nF1 Score : ",f"{f1:.2f}")

Precision :  88.95 
Recall :  80.45 
F1 Score :  83.10


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


<h1>Exporting the model into a pickle file in binary format</h1>

In [20]:
import pickle

# Write hotelData to disk in pickle (binary) format. The with-block closes the
# file when done, which guarantees the data is flushed to disk.
with open("hotelModelPickle.pkl","wb") as modelFile:
    pickle.dump(hotelData,modelFile)