# Recommender System on Hotel's Features Dataset


<table align="center">
  <td>
    <a href="https://colab.research.google.com/github/ageron/handson-ml2/blob/master/01_the_machine_learning_landscape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  </td>
</table>

## 1. Importing libraries, installing EPFL's Sent2Vec model (built on Facebook's fastText) & loading our dataset.
### 1.1. Mounting and importing libraries

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive; the dataset
# and the pre-trained model are later read from /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import os
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
from collections import Counter
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

### 1.2. Installing **`Sent2Vec`**

In [None]:
# Fetch the Sent2Vec sources from the epfml GitHub repository into ./sent2vec.
!git clone https://github.com/epfml/sent2vec.git

Cloning into 'sent2vec'...
remote: Enumerating objects: 396, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 396 (delta 0), reused 1 (delta 0), pack-reused 393[K
Receiving objects: 100% (396/396), 439.62 KiB | 8.97 MiB/s, done.
Resolving deltas: 100% (247/247), done.


In [None]:
# Switch the working directory to the cloned repository so that the build
# commands in the next cell run from its root.
current_path = os.path.abspath(os.getcwd())
if os.path.basename(current_path) == "sent2vec":
    # Already inside the clone (the cell was re-run): stay put instead of
    # descending into a non-existent sent2vec/sent2vec directory.
    new_path = current_path
else:
    new_path = os.path.join(current_path, "sent2vec")
os.chdir(new_path)

In [None]:
!make

!pip install --upgrade cython

!python setup.py build_ext

!pip install .

c++ -pthread -std=c++0x -O3 -funroll-loops -c src/args.cc
c++ -pthread -std=c++0x -O3 -funroll-loops -c src/dictionary.cc
c++ -pthread -std=c++0x -O3 -funroll-loops -c src/productquantizer.cc
c++ -pthread -std=c++0x -O3 -funroll-loops -c src/matrix.cc
c++ -pthread -std=c++0x -O3 -funroll-loops -c src/shmem_matrix.cc
c++ -pthread -std=c++0x -O3 -funroll-loops -c src/qmatrix.cc
c++ -pthread -std=c++0x -O3 -funroll-loops -c src/vector.cc
c++ -pthread -std=c++0x -O3 -funroll-loops -c src/model.cc
c++ -pthread -std=c++0x -O3 -funroll-loops -c src/utils.cc
c++ -pthread -std=c++0x -O3 -funroll-loops -c src/fasttext.cc
c++ -pthread -std=c++0x -O3 -funroll-loops args.o dictionary.o productquantizer.o matrix.o shmem_matrix.o qmatrix.o vector.o model.o utils.o fasttext.o src/main.cc -o fasttext -lrt
Requirement already up-to-date: cython in /usr/local/lib/python3.7/dist-packages (0.29.23)
Compiling src/sent2vec.pyx because it changed.
[1/1] Cythonizing src/sent2vec.pyx
  tree = Parsing.p_module(s

### 1.3. Reading the **Hotel's Features Dataset** and preprocessing it

In [None]:
# Read the hotel dataset from Drive and keep only the columns whose name does
# not start with "Unnamed".
df = pd.read_csv('/content/gdrive/MyDrive/hotel_features_dataset.csv').loc[
    :, lambda frame: ~frame.columns.str.contains('^Unnamed')
]

In [None]:
# Preview the first three rows of the loaded frame.
df.head(3)

Unnamed: 0,name,country,street,region,rating,reviews,amenities,rooms,types,price,official_description
0,WH Hotel,Lebanon,Lyon Street Hamra Emille Edde,,4.0,"{'Excellent': 104, 'Good': 102, 'Average': 51,...","Free High Speed Internet (WiFi),Free breakfast...","Air conditioning,Fireplace,Housekeeping,Room s...","Non-smoking rooms,Suites,Family rooms,Smoking ...",,
1,Le Patio Boutique Hotel,Lebanon,"1144 Marfaa, Uruguay Street Solidere",,4.5,"{'Excellent': 213, 'Good': 78, 'Average': 31, ...","Paid private parking nearby,Free High Speed In...","Air conditioning,Housekeeping,Room service,Saf...","Non-smoking rooms,Suites,Family rooms,Smoking ...",121.0,
2,Riviera Hotel Beirut,Lebanon,"Avenue De Paris, Corniche El Manara Riad El Solh",,3.5,"{'Excellent': 90, 'Good': 101, 'Average': 69, ...","Valet parking,Free High Speed Internet (WiFi),...","Air conditioning,Private balcony,Room service,...","Ocean view,Pool view,Non-smoking rooms,Suites,...",,


In [None]:
# Columns kept for the recommender; every other column is discarded.
wanted_columns = [
    'name',
    'country',
    'rating',
    'amenities',
    'rooms',
    'types',
]
df = df.loc[:, wanted_columns]

In [None]:
# Keep only the rows that have no missing value in any column of df.
df = df.dropna()

In [None]:
# Number of rows currently in df.
len(df.index)

62617

In [None]:
# One feature string per hotel: amenities, rooms and types joined with commas.
features = (
    df['amenities']
    .add(',')
    .add(df['rooms'])
    .add(',')
    .add(df['types'])
)

In [None]:
def preprocess_text(sen):
    """Reduce a raw feature string to alphabetic words separated by single spaces.

    The steps run in this order: remove anything shaped like an HTML tag,
    replace every non-letter character with a space, drop single letters that
    are surrounded by whitespace, and collapse each whitespace run to one
    space. Letter case is left untouched and leading/trailing spaces are not
    trimmed.
    """
    without_tags = re.sub(r'<[^>]+>', '', sen)
    letters_only = re.sub('[^a-zA-Z]', ' ', without_tags)
    no_lone_letters = re.sub(r"\s+[a-zA-Z]\s+", ' ', letters_only)
    return re.sub(r'\s+', ' ', no_lone_letters)

In [None]:
# Clean every feature string and lower-case the result, preserving order.
cleaned_features = [preprocess_text(raw_text).lower() for raw_text in features]

## 2. Building the recommender
### 2.1. Use Sent2vec model to convert our features to embeddings.

In [None]:
# sent2vec is imported here rather than in the imports cell because it only
# becomes importable once the build/install steps earlier in the notebook ran.
import sent2vec
model = sent2vec.Sent2vecModel()
# Load the pre-trained model file stored on Drive.
# NOTE(review): the file name suggests the Wikipedia unigram model, and
# inference_mode presumably loads it for embedding only — confirm against the
# sent2vec documentation.
model.load_model('/content/gdrive/MyDrive/wiki_unigrams.bin', inference_mode = True)

In [None]:
# Embed all cleaned feature strings with a single batch call.
# Single-sentence variant, kept for reference:
# emb = model.embed_sentence("once upon a time .") 
embs = model.embed_sentences(cleaned_features)

In [None]:
# Shape check: one row per feature string, one column per embedding dimension.
embs.shape

(62617, 600)

In [None]:
# Pair every hotel name with its cleaned feature text; the row index is
# inherited from df.
features_df = df[['name']].assign(features=cleaned_features)

In [None]:
# Preview the first rows of the name/features table.
features_df.head()

Unnamed: 0,name,features
0,WH Hotel,free high speed internet wifi free breakfast a...
1,Le Patio Boutique Hotel,paid private parking nearby free high speed in...
2,Riviera Hotel Beirut,valet parking free high speed internet wifi po...
3,Le Bristol Beyrouth,free high speed internet wifi pool fitness cen...
4,Golden Tulip Midtown Hotel And Suites,free parking free high speed internet wifi poo...


### 2.2. Calculate the similarities between hotels using cosine similarity.

In [None]:
def most_similar(idx, similarity_matrix, count, names=None):
    """Print the `count` hotels most similar to the hotel at row position `idx`.

    Args:
        idx: Row position of the query hotel in `similarity_matrix`.
        similarity_matrix: Square array-like where entry [i][j] is the
            similarity between the hotels at positions i and j.
        count: Number of similar hotels to print.
        names: Optional sequence of hotel names, addressed by position.
            Defaults to the "name" column of the notebook-level `features_df`.

    Returns:
        None. The result is written to standard output.
    """
    if names is None:
        names = features_df["name"]
    # Convert to an array so that lookups are positional regardless of the
    # index labels carried by a pandas Series.
    names = np.asarray(names)

    print (f'Similar Hotels to {names[idx]}:')
    ranked = np.argsort(np.asarray(similarity_matrix[idx]))[::-1]
    # Drop the query hotel BEFORE truncating. Slicing count+1 entries first
    # prints one hotel too many whenever the query is not among them (e.g. a
    # row whose self-similarity is not the maximum, or exact duplicates).
    neighbours = ranked[ranked != idx][:max(count, 0)]
    for ix in neighbours:
        print (f'Hotel Name : {names[ix]} Similarity: {similarity_matrix[idx][ix]}')

In [None]:
# Pairwise cosine similarity over the first 40,000 embeddings only, so the
# resulting matrix can be queried with row positions 0..39999.
# NOTE(review): the cap is presumably there to bound the memory of the dense
# square result — confirm.
pairwise_similarities =cosine_similarity(embs[:40000])

In [None]:
# Example query: hotels most similar to the hotel at row position 5.
most_similar(5, pairwise_similarities, 6)

Similar Hotels to The Mayflower Hotel:
Hotel Name : Catina Hotel Similarity: 0.9822630882263184
Hotel Name : Erboy Hotel Similarity: 0.9741029739379883
Hotel Name : Treebo Trend Singh Sons Similarity: 0.9699388742446899
Hotel Name : Villa Hue Similarity: 0.9699212312698364
Hotel Name : Hotel Le Carnot Similarity: 0.9684607982635498
Hotel Name : Hotel Sagar Plaza Similarity: 0.9671428203582764
