# Natural Language Processing: First Project
# TripAdvisor Recommendation Challenge
# Beating BM25
#### **Santiago Martin & Léo Ringeissen**

## Data loading

In [1]:
import pandas as pd
# Load the two raw tables; a comma separator and a first-row header are
# already what pandas assumes, so no extra arguments are needed.
offerings = pd.read_csv('data/offerings.csv')
reviews = pd.read_csv('data/reviews.csv')

In [2]:
# Preview the first five rows of the offerings table.
offerings.head(n=5)

Unnamed: 0,hotel_class,region_id,url,phone,details,address,type,id,name
0,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '147 West 4...",hotel,113317,Casablanca Hotel Times Square
1,5.0,32655,http://www.tripadvisor.com/Hotel_Review-g32655...,,,"{'region': 'CA', 'street-address': '300 S Dohe...",hotel,76049,Four Seasons Hotel Los Angeles at Beverly Hills
2,3.5,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '790 Eighth...",hotel,99352,Hilton Garden Inn Times Square
3,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '152 West 5...",hotel,93589,The Michelangelo Hotel
4,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '130 West 4...",hotel,217616,The Muse Hotel New York


In [3]:
# Preview the first five rows of the reviews table.
reviews.head(n=5)

Unnamed: 0,ratings,title,text,author,date_stayed,offering_id,num_helpful_votes,date,id,via_mobile
0,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...","“Truly is ""Jewel of the Upper Wets Side""”",Stayed in a king suite for 11 nights and yes i...,"{'username': 'Papa_Panda', 'num_cities': 22, '...",December 2012,93338,0,2012-12-17,147643103,False
1,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“My home away from home!”,"On every visit to NYC, the Hotel Beacon is the...","{'username': 'Maureen V', 'num_reviews': 2, 'n...",December 2012,93338,0,2012-12-17,147639004,False
2,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Great Stay”,This is a great property in Midtown. We two di...,"{'username': 'vuguru', 'num_cities': 12, 'num_...",December 2012,1762573,0,2012-12-18,147697954,False
3,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“Modern Convenience”,The Andaz is a nice hotel in a central locatio...,"{'username': 'Hotel-Designer', 'num_cities': 5...",August 2012,1762573,0,2012-12-17,147625723,False
4,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Its the best of the Andaz Brand in the US....”,I have stayed at each of the US Andaz properti...,"{'username': 'JamesE339', 'num_cities': 34, 'n...",December 2012,1762573,0,2012-12-17,147612823,False


## Data Preprocessing

#### Drop useless columns

In [4]:
# Offerings: remove every listed column (descriptive metadata not used downstream).
cols_to_drop_offerings = ['hotel_class', 'region_id', 'url', 'phone', 'details', 'address', 'type']
offerings = offerings.drop(columns=cols_to_drop_offerings)

# Reviews: remove author / date / bookkeeping columns.
cols_to_drop_review = ['author', 'date_stayed', 'num_helpful_votes', 'date', 'id', 'via_mobile']
reviews = reviews.drop(columns=cols_to_drop_review)

#### Create rating columns

In [5]:
import pandas as pd
import ast
required_keys = ['service', 'cleanliness', 'overall', 'value', 'location', 'sleep_quality', 'rooms']


def preprocess_ratings(dataframe, keys=None):
    """Expand the stringified ``ratings`` column into one column per rating category.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding a ``ratings`` column whose cells are dict literals stored
        as strings (already-parsed dicts are accepted too). The frame itself is
        left untouched.
    keys : sequence of str, optional
        Rating categories to extract. Defaults to the module-level
        ``required_keys``.

    Returns
    -------
    pandas.DataFrame
        A new frame that keeps the caller's index, has ``ratings`` removed and
        one column per key appended. Rows whose ratings lack at least one key
        get NaN in every rating column, so the caller can discard them with
        ``dropna``.

    Raises
    ------
    KeyError
        If ``dataframe`` has no ``ratings`` column.
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a cell is not a valid literal.
    """
    keys = list(required_keys if keys is None else keys)

    def _to_dict(cell):
        # Already-parsed dicts pass through; missing cells count as "no ratings"
        # instead of crashing literal_eval.
        if isinstance(cell, dict):
            return cell
        if pd.isna(cell):
            return {}
        return ast.literal_eval(cell)

    parsed = dataframe['ratings'].map(_to_dict)

    # Build the rating columns on the caller's own index so the join below lines
    # up row-for-row even when that index is not a fresh 0..n-1 range. Passing
    # `columns=keys` guarantees every category exists, even if no row has it.
    ratings_expanded = pd.DataFrame(
        [{key: ratings.get(key) for key in keys} for ratings in parsed],
        index=dataframe.index,
        columns=keys,
    )

    # Keep only fully rated rows; the left join re-introduces the other rows
    # with NaN in every rating column.
    ratings_filtered = ratings_expanded.dropna(subset=keys)

    # drop() returns a copy, so the caller's frame never gains helper columns.
    return dataframe.drop(columns=['ratings']).join(ratings_filtered)

# Replace the raw `ratings` column with one column per rating category.
# Not re-runnable on its own: the result no longer has a `ratings` column.
reviews = preprocess_ratings(dataframe=reviews)


In [6]:
# Missing values per column before and after discarding incomplete reviews.
print(reviews.isna().sum())
reviews = reviews.dropna()
print(reviews.isna().sum())

title                 0
text                  0
offering_id           0
service          442170
cleanliness      442170
overall          442170
value            442170
location         442170
sleep_quality    442170
rooms            442170
dtype: int64
title            0
text             0
offering_id      0
service          0
cleanliness      0
overall          0
value            0
location         0
sleep_quality    0
rooms            0
dtype: int64


#### Joining offerings and reviews

In [7]:
# Attach each review to its hotel (reviews.offering_id == offerings.id).
merged_df = reviews.merge(offerings, left_on='offering_id', right_on='id')

# One row per hotel: average every rating category, concatenate all review
# texts into a single document, and count the reviews.
grouped_df = (
    merged_df
    .groupby(['id', 'name'])
    .agg({
        **{cat: 'mean' for cat in required_keys},
        'text': ' '.join,
        'offering_id': 'count',
    })
    .rename(columns={'offering_id': 'num_reviews'})
    .reset_index()
)

# Preview the per-hotel table.
grouped_df.head()

Unnamed: 0,id,name,service,cleanliness,overall,value,location,sleep_quality,rooms,text,num_reviews
0,72572,BEST WESTERN PLUS Pioneer Square Hotel,4.60101,4.636364,4.388889,4.323232,4.570707,4.333333,4.282828,I had to make fast visit to seattle and I foun...,198
1,72579,BEST WESTERN Loyal Inn,4.232,4.24,3.888,4.152,4.192,3.768,3.856,"Great service, rooms were clean, could use som...",125
2,72586,BEST WESTERN PLUS Executive Inn,4.25,4.287879,4.045455,4.05303,4.537879,4.113636,3.992424,Beautiful views of the space needle - especial...,132
3,72598,Comfort Inn & Suites Seattle,3.243243,3.243243,2.918919,3.054054,3.027027,3.27027,3.189189,This hotel is in need of some serious updates....,37
4,73236,Days Inn San Antonio/Near Lackland AFB,4.277778,3.111111,3.388889,3.777778,4.111111,3.722222,3.222222,My experience at this days inn was perfect. th...,18


In [8]:
# Size of the per-hotel table, followed by a preview of its first rows.
print(grouped_df.shape)
display(grouped_df.head(n=5))

(3754, 11)


Unnamed: 0,id,name,service,cleanliness,overall,value,location,sleep_quality,rooms,text,num_reviews
0,72572,BEST WESTERN PLUS Pioneer Square Hotel,4.60101,4.636364,4.388889,4.323232,4.570707,4.333333,4.282828,I had to make fast visit to seattle and I foun...,198
1,72579,BEST WESTERN Loyal Inn,4.232,4.24,3.888,4.152,4.192,3.768,3.856,"Great service, rooms were clean, could use som...",125
2,72586,BEST WESTERN PLUS Executive Inn,4.25,4.287879,4.045455,4.05303,4.537879,4.113636,3.992424,Beautiful views of the space needle - especial...,132
3,72598,Comfort Inn & Suites Seattle,3.243243,3.243243,2.918919,3.054054,3.027027,3.27027,3.189189,This hotel is in need of some serious updates....,37
4,73236,Days Inn San Antonio/Near Lackland AFB,4.277778,3.111111,3.388889,3.777778,4.111111,3.722222,3.222222,My experience at this days inn was perfect. th...,18


## Testing and evaluating BM25 with 100 random queries

### Initialization

In [9]:
from rank_bm25 import BM25Okapi

# One document per hotel: the concatenation of all its review texts.
corpus = grouped_df['text'].to_list()
# Tokenise on single spaces; queries must be split the same way.
tokenized_corpus = [document.split(" ") for document in corpus]
bm25 = BM25Okapi(tokenized_corpus)

### Querying on 100 documents

In [21]:
import random

n_queries = 100  # hotels used as queries; matches the section title and the recorded outputs
random.seed(0)  # fixed seed so the same hotels are drawn on every run
# Sample without replacement; cap at the corpus size so a small corpus cannot raise ValueError.
query_ids = random.sample(range(len(corpus)), min(n_queries, len(corpus)))
print(query_ids)

[3458, 1577, 3104, 3646, 1722, 165, 1060, 2094, 1990, 1658, 3210, 3399, 1242, 1952, 1466, 2389, 3652, 3719, 894, 2067, 570, 1154, 572, 3095, 388, 2532, 3274, 1026, 3726, 2181, 2888, 3318, 2465, 3695, 601, 1270, 404, 2989, 302, 3681, 3483, 2801, 1352, 1933, 2292, 412, 1449, 1778, 1295, 2502, 2623, 3739, 837, 2263, 1953, 1813, 3544, 2135, 1066, 255, 3297, 2247, 3751, 57, 382, 2947, 3441, 1633, 2909, 3378, 3215, 2736, 2561, 4, 2506, 2021, 3391, 3553, 1364, 999, 2991, 1332, 2882, 3565, 257, 782, 2324, 908, 977, 3290, 583, 3289, 2224, 1834, 373, 329, 1310, 3584, 2080, 2004]


In [11]:
import numpy as np

# Ratings as a float matrix, extracted once instead of re-slicing the frame on every iteration.
ratings_matrix = grouped_df[required_keys].to_numpy(dtype=float)

mses = []
best_doc_ids = []
for query_id in query_ids:
    query = corpus[query_id]
    tokenized_query = query.split(" ")  # same tokenisation as the indexed corpus
    # Copy into our own float array so the masking below cannot touch anything owned by bm25.
    doc_scores = np.array(bm25.get_scores(tokenized_query), dtype=float)
    # Exclude the query hotel explicitly. BM25 does not guarantee that a document
    # ranks first for its own text, so "second-highest score" could return the
    # query itself or skip the true best match.
    doc_scores[query_id] = -np.inf
    doc_id = int(doc_scores.argmax())
    best_doc_ids.append(doc_id)
    # Mean squared difference between the two hotels' average ratings.
    mse = np.mean((ratings_matrix[query_id] - ratings_matrix[doc_id]) ** 2)
    mses.append(mse)

# Calculate the mean mse over all sampled queries
mean_mse = np.mean(mses)
print(mean_mse)
print(best_doc_ids)



### Alternative retrieval: TF-IDF with cosine similarity


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Build TF-IDF matrix: one row per hotel, in the same order as `corpus`
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(corpus)

# Ratings as a float matrix, extracted once instead of re-slicing the frame on every iteration.
ratings_matrix = grouped_df[required_keys].to_numpy(dtype=float)

mses = []
best_doc_ids = []

# Iterate through random queries
for query_id in query_ids:
    query = corpus[query_id]
    # The query is a corpus document, so its TF-IDF row already exists;
    # reuse it instead of vectorising the same text again.
    query_vector = tfidf_matrix[query_id]
    doc_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()
    # Exclude the query hotel explicitly: with tied similarities the
    # second-to-last argsort position may still be the query itself.
    doc_scores[query_id] = -np.inf
    doc_id = int(np.argmax(doc_scores))
    best_doc_ids.append(doc_id)
    # Mean squared difference between the two hotels' average ratings.
    mse = np.mean((ratings_matrix[query_id] - ratings_matrix[doc_id]) ** 2)
    mses.append(mse)

mean_mse = np.mean(mses)
print(mean_mse)
print(best_doc_ids)

0.49517411338830664
[2571, 1041, 2783, 371, 927, 595, 1030, 512, 512, 1660, 3420, 2939, 1640, 963, 464, 960, 3003, 178, 1640, 2965, 571, 100, 573, 572, 2647, 3607, 14, 1622, 410, 95, 494, 181, 3296, 3580, 2143, 2139, 1790, 817, 364, 539, 2902, 179, 3104, 3296, 1083, 464, 3535, 1773, 946, 1217, 783, 2850, 2838, 933, 1640, 95, 636, 130, 464, 349, 3188, 1795, 715, 3253, 274, 3293, 1857, 3473, 115, 512, 394, 2288, 142, 179, 3570, 2305, 634, 213, 962, 2914, 1996, 1647, 3248, 591, 293, 2317, 2032, 3010, 2783, 665, 290, 95, 1984, 2546, 371, 293, 746, 3580, 1704, 3190]
