# NLP Project 
##### Santiago Martin & Léo Ringeissen

## Data loading

In [60]:
import pandas as pd
# Load the hotel listings and the user reviews. Both files are plain
# comma-separated CSVs with a header row, which is what read_csv assumes by default.
offerings = pd.read_csv('data/offerings.csv')
reviews = pd.read_csv('data/reviews.csv')

In [61]:
# Preview the first rows of the hotel listings to see which columns are available.
offerings.head()

Unnamed: 0,hotel_class,region_id,url,phone,details,address,type,id,name
0,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '147 West 4...",hotel,113317,Casablanca Hotel Times Square
1,5.0,32655,http://www.tripadvisor.com/Hotel_Review-g32655...,,,"{'region': 'CA', 'street-address': '300 S Dohe...",hotel,76049,Four Seasons Hotel Los Angeles at Beverly Hills
2,3.5,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '790 Eighth...",hotel,99352,Hilton Garden Inn Times Square
3,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '152 West 5...",hotel,93589,The Michelangelo Hotel
4,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '130 West 4...",hotel,217616,The Muse Hotel New York


In [62]:
# Preview the first rows of the reviews; note that `ratings` holds a dict encoded as a string.
reviews.head()

Unnamed: 0,ratings,title,text,author,date_stayed,offering_id,num_helpful_votes,date,id,via_mobile
0,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...","“Truly is ""Jewel of the Upper Wets Side""”",Stayed in a king suite for 11 nights and yes i...,"{'username': 'Papa_Panda', 'num_cities': 22, '...",December 2012,93338,0,2012-12-17,147643103,False
1,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“My home away from home!”,"On every visit to NYC, the Hotel Beacon is the...","{'username': 'Maureen V', 'num_reviews': 2, 'n...",December 2012,93338,0,2012-12-17,147639004,False
2,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Great Stay”,This is a great property in Midtown. We two di...,"{'username': 'vuguru', 'num_cities': 12, 'num_...",December 2012,1762573,0,2012-12-18,147697954,False
3,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“Modern Convenience”,The Andaz is a nice hotel in a central locatio...,"{'username': 'Hotel-Designer', 'num_cities': 5...",August 2012,1762573,0,2012-12-17,147625723,False
4,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Its the best of the Andaz Brand in the US....”,I have stayed at each of the US Andaz properti...,"{'username': 'JamesE339', 'num_cities': 34, 'n...",December 2012,1762573,0,2012-12-17,147612823,False


## Data Preprocessing

#### Drop unused columns

In [63]:
# Hotel metadata that is not used by the rest of the notebook.
cols_to_drop_offerings = [
    'hotel_class', 'region_id', 'url', 'phone', 'details', 'address', 'type',
]
# Review metadata that is not used by the rest of the notebook.
cols_to_drop_review = [
    'author', 'date_stayed', 'num_helpful_votes', 'date', 'id', 'via_mobile',
]

# Both frames are rebound to their trimmed versions, so this cell can only run once
# per load: a second run would not find the columns any more.
offerings = offerings.drop(columns=cols_to_drop_offerings)
reviews = reviews.drop(columns=cols_to_drop_review)

#### Create rating columns

In [64]:
import pandas as pd
import ast
# Rating categories a review must provide. The same list drives the ratings
# expansion, the per-hotel averages and the MSE evaluation.
required_keys = [
    'service',
    'cleanliness',
    'overall',
    'value',
    'location',
    'sleep_quality',
    'rooms',
]

# Function to preprocess the ratings column
def preprocess_ratings(dataframe, keys=None):
    """Expand the stringified ``ratings`` dicts into one column per rating category.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``ratings`` column whose values are Python-literal dict
        strings, e.g. ``"{'service': 5.0, 'overall': 4.0}"``. The frame passed
        in is not modified.
    keys : sequence of str, optional
        Rating categories to extract. Defaults to the module-level
        ``required_keys``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``dataframe``, without the
        ``ratings`` column and with one column per entry of ``keys`` (in that
        order). Rows whose review lacks at least one category are kept, but all
        of their rating columns are NaN, so the caller can remove them with
        ``dropna``.

    Raises
    ------
    KeyError
        If ``dataframe`` has no ``ratings`` column.
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a ``ratings`` value is not a
        valid Python literal.
    """
    keys = list(required_keys if keys is None else keys)

    # Parse into a local Series instead of writing helper columns into the
    # caller's frame. literal_eval only evaluates literals, never arbitrary code.
    parsed = dataframe['ratings'].apply(ast.literal_eval)

    # Keep only the requested categories; absent ones simply stay missing.
    filtered = [{key: d[key] for key in keys if key in d} for d in parsed]

    # reindex guarantees every requested column exists (all-NaN if the category
    # never occurs) and fixes the column order.
    ratings_expanded = pd.json_normalize(filtered).reindex(columns=keys)

    # json_normalize numbers its rows 0..n-1; restore the original labels so the
    # join below lines up row by row even when the input index is not a
    # default RangeIndex.
    ratings_expanded.index = dataframe.index

    # Only fully rated reviews contribute values; the left join below leaves the
    # rating columns of every other row as NaN.
    complete = ratings_expanded.dropna(subset=keys)

    return dataframe.drop(columns=['ratings']).join(complete)

# Apply the preprocessing function and rebind `reviews` to the expanded frame.
# Note: this cell is not re-runnable as-is. The returned frame no longer has a
# `ratings` column, so calling the function again on it raises KeyError.
reviews = preprocess_ratings(reviews)


In [65]:
# Size after expanding the ratings; reviews with incomplete ratings are still included here.
reviews.shape

(878561, 10)

In [66]:
# Missing values per column: shows how many reviews lack a complete set of ratings.
reviews.isnull().sum()


title                 0
text                  0
offering_id           0
service          442170
cleanliness      442170
overall          442170
value            442170
location         442170
sleep_quality    442170
rooms            442170
dtype: int64

In [67]:
# Keep only rows without any missing value (in practice: fully rated reviews).
# Rebinding instead of `inplace=True` avoids mutating a frame that earlier cells
# already displayed, and yields the same resulting `reviews`.
reviews = reviews.dropna()

In [68]:
# Size once the rows with missing values have been removed.
reviews.shape

(436391, 10)

#### Joining offerings and reviews

In [69]:
# Attach every review to its hotel: `reviews.offering_id` matches `offerings.id`.
# The default inner join drops reviews without a matching hotel.
merged_df = reviews.merge(offerings, left_on='offering_id', right_on='id')

# One row per hotel: the mean of each rating category, all review texts
# concatenated with single spaces, and the number of reviews.
rating_means = {category: (category, 'mean') for category in required_keys}
grouped_df = (
    merged_df
    .groupby(['id', 'name'])
    .agg(
        **rating_means,
        text=('text', ' '.join),
        num_reviews=('offering_id', 'count'),
    )
    .reset_index()
)

# Display the resulting dataframe
grouped_df.head()

Unnamed: 0,id,name,service,cleanliness,overall,value,location,sleep_quality,rooms,text,num_reviews
0,72572,BEST WESTERN PLUS Pioneer Square Hotel,4.60101,4.636364,4.388889,4.323232,4.570707,4.333333,4.282828,I had to make fast visit to seattle and I foun...,198
1,72579,BEST WESTERN Loyal Inn,4.232,4.24,3.888,4.152,4.192,3.768,3.856,"Great service, rooms were clean, could use som...",125
2,72586,BEST WESTERN PLUS Executive Inn,4.25,4.287879,4.045455,4.05303,4.537879,4.113636,3.992424,Beautiful views of the space needle - especial...,132
3,72598,Comfort Inn & Suites Seattle,3.243243,3.243243,2.918919,3.054054,3.027027,3.27027,3.189189,This hotel is in need of some serious updates....,37
4,73236,Days Inn San Antonio/Near Lackland AFB,4.277778,3.111111,3.388889,3.777778,4.111111,3.722222,3.222222,My experience at this days inn was perfect. th...,18


In [70]:
# Number of (id, name) hotel groups and number of columns in the aggregated table.
grouped_df.shape

(3754, 11)

In [71]:
# Preview of the aggregated table (same preview as shown right after the aggregation).
grouped_df.head()

Unnamed: 0,id,name,service,cleanliness,overall,value,location,sleep_quality,rooms,text,num_reviews
0,72572,BEST WESTERN PLUS Pioneer Square Hotel,4.60101,4.636364,4.388889,4.323232,4.570707,4.333333,4.282828,I had to make fast visit to seattle and I foun...,198
1,72579,BEST WESTERN Loyal Inn,4.232,4.24,3.888,4.152,4.192,3.768,3.856,"Great service, rooms were clean, could use som...",125
2,72586,BEST WESTERN PLUS Executive Inn,4.25,4.287879,4.045455,4.05303,4.537879,4.113636,3.992424,Beautiful views of the space needle - especial...,132
3,72598,Comfort Inn & Suites Seattle,3.243243,3.243243,2.918919,3.054054,3.027027,3.27027,3.189189,This hotel is in need of some serious updates....,37
4,73236,Days Inn San Antonio/Near Lackland AFB,4.277778,3.111111,3.388889,3.777778,4.111111,3.722222,3.222222,My experience at this days inn was perfect. th...,18


In [72]:
# Inspect one hotel's aggregated text. Individual reviews are joined by a single
# space, so `text` reads as one long essay with no marker between reviews.
print(grouped_df.loc[4, 'text'])

My experience at this days inn was perfect. the staff were great and the manager (ted angel) was very helpful. The complimentary breakfast was always hot and they also provided bbq grills. I will really recommend this place to others that are planning on staying in san antonio The staff at the front desk were extremely helpful and went out of their way to ensure our trip was enjoyable ! We were attending my nephew's Air Force graduation and the staff gave us very useful information to make the navigation of the base easier. A special thanks to Mr. Angel who worked the front desk the morning of our departure ! The location is great because its close to the base however the hotel is disgusting! It's dirty and there are roaches!!! I'm not sure how they are still open. I will never ever stay at that location or any of their locations because I don't risk having to deal with that environment. I wish people would have mentioned that in their reviews so I could have selected a different hotel

## Sample test of the BM25 baseline (first 1,000 hotels)

### Initializing BM25

In [73]:
from rank_bm25 import BM25Okapi

# One document per hotel: its concatenated review text, in grouped_df row order.
corpus = list(grouped_df['text'])

# Index only the first 1000 documents for this sample run.
# NOTE(review): split(" ") cuts on single space characters only, so newlines stay
# inside tokens and consecutive spaces produce empty-string tokens. Queries must
# be tokenised the same way.
tokenized_corpus = [document.split(" ") for document in corpus[:1000]]

bm25 = BM25Okapi(tokenized_corpus)

### Ranking the documents

In [None]:
# Position (in `corpus` / grouped_df row order) of the hotel used as the query.
# It must be one of the documents indexed by `bm25` for the self-match
# handling in the results cell to make sense.
query_id = 420

# The query is the hotel's full concatenated review text, tokenised exactly
# like the indexed documents (split on single spaces).
query = corpus[query_id]
tokenized_query = query.split(" ")

# One BM25 score per indexed document, in the same order as the indexed corpus.
doc_scores = bm25.get_scores(tokenized_query)

### Results

In [75]:
# Positions of the indexed documents, ordered from highest to lowest BM25 score.
ranking = doc_scores.argsort()[::-1]

# Exclude the query's own document explicitly rather than assuming it is ranked
# first: BM25 does not guarantee that a document is its own best match, so
# blindly taking the second-highest score could return the query document
# itself or skip the true best match.
doc_id = ranking[ranking != query_id][0]

print(doc_id)

# doc_id is a position in corpus / grouped_df row order, so index positionally.
print(grouped_df['text'].iloc[doc_id])

157
This trip was only a 24 hour lay over but wanted to plan a surprise for my boyfriend and was connected to Stephen Dyrus the Front Desk Manager. He not only helped me with the surprise he made it very simple and was just so professional. The Hotel is beautiful and in a great location for the beach cities. I would HIGHLY recommend anyone flying into LAX book your room with the Sheraton Gateway 
Amazing time and would stay there again I stayed here for three days on a recent trip to Los Angeles for work. The hotel itself is nice but the room was not clean. When i checked in it seemed that the counters in the bathroom were not wiped down and there was a small knat in the sink. I killed the bug but the next morning there was another. 
I did not use the transportation to and from the airport so i dont know how well that worked. I have stayed at other hotels near LAX and they were cleaner and more comfortable. I would not stay here again. I stayed overnight in LA for an international flig

In [76]:
# Show the row (name, mean ratings, number of reviews) of the hotel used as the query.
# `query_id` is a position in `corpus`, which follows grouped_df row order, hence iloc.
grouped_df.iloc[query_id]

id                                                           82686
name                                         Hilton Woodland Hills
service                                                   4.367925
cleanliness                                               4.353774
overall                                                   4.179245
value                                                     4.018868
location                                                  4.254717
sleep_quality                                             4.292453
rooms                                                     4.089623
text             Last night the Canoga Park/West Hills Chamber ...
num_reviews                                                    212
Name: 420, dtype: object

In [77]:
# Show the row (name, mean ratings, number of reviews) of the retrieved hotel,
# to compare it with the query hotel's row.
grouped_df.iloc[doc_id]

id                                                           78046
name                                  Sheraton Gateway Los Angeles
service                                                   4.192635
cleanliness                                               4.220963
overall                                                   4.053824
value                                                     3.998584
location                                                   4.38102
sleep_quality                                              4.20255
rooms                                                     4.050992
text             This trip was only a 24 hour lay over but want...
num_reviews                                                    706
Name: 157, dtype: object

### Evaluation

In [79]:
## calculate the mse of the ratings of the hotel we used for the query and the hotel with the highest score
import numpy as np
# Mean squared difference across the rating categories between the query hotel
# and the retrieved hotel; lower means the two hotels are rated more alike.
query_ratings = grouped_df.iloc[query_id][required_keys]
match_ratings = grouped_df.iloc[doc_id][required_keys]
mse = np.mean((query_ratings - match_ratings) ** 2)
print(mse)

0.012862058639548599


## Full run and evaluation of the BM25 baseline (all hotels)

### Initialization

In [None]:
from rank_bm25 import BM25Okapi

# Same construction as the sample run, but indexing every hotel's review text.
corpus = list(grouped_df['text'])

# Tokenise on single spaces, as in the sample run, so results stay comparable.
tokenized_corpus = [document.split(" ") for document in corpus]

bm25 = BM25Okapi(tokenized_corpus)

### Querying with 100 randomly sampled documents

In [None]:
# Selecting 100 random documents as queries
import random
# Fixed seed so the same query documents are drawn on every run.
random.seed(0)
# 100 distinct positions into `corpus` (random.sample draws without replacement).
query_ids = random.sample(range(len(corpus)), 100)