# Natural Language Processing  : First Project
# TripAdvisor Recommendation Challenge
# Beating BM25
#### **Santiago Martin & Léo Ringeissen**

## Data loading

In [1]:
import pandas as pd
offerings = pd.read_csv('data/offerings.csv',sep=',',header=0)
reviews = pd.read_csv('data/reviews.csv',sep=',',header=0)

In [2]:
offerings.head()

Unnamed: 0,hotel_class,region_id,url,phone,details,address,type,id,name
0,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '147 West 4...",hotel,113317,Casablanca Hotel Times Square
1,5.0,32655,http://www.tripadvisor.com/Hotel_Review-g32655...,,,"{'region': 'CA', 'street-address': '300 S Dohe...",hotel,76049,Four Seasons Hotel Los Angeles at Beverly Hills
2,3.5,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '790 Eighth...",hotel,99352,Hilton Garden Inn Times Square
3,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '152 West 5...",hotel,93589,The Michelangelo Hotel
4,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '130 West 4...",hotel,217616,The Muse Hotel New York


In [3]:
reviews.head()

Unnamed: 0,ratings,title,text,author,date_stayed,offering_id,num_helpful_votes,date,id,via_mobile
0,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...","“Truly is ""Jewel of the Upper Wets Side""”",Stayed in a king suite for 11 nights and yes i...,"{'username': 'Papa_Panda', 'num_cities': 22, '...",December 2012,93338,0,2012-12-17,147643103,False
1,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“My home away from home!”,"On every visit to NYC, the Hotel Beacon is the...","{'username': 'Maureen V', 'num_reviews': 2, 'n...",December 2012,93338,0,2012-12-17,147639004,False
2,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Great Stay”,This is a great property in Midtown. We two di...,"{'username': 'vuguru', 'num_cities': 12, 'num_...",December 2012,1762573,0,2012-12-18,147697954,False
3,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“Modern Convenience”,The Andaz is a nice hotel in a central locatio...,"{'username': 'Hotel-Designer', 'num_cities': 5...",August 2012,1762573,0,2012-12-17,147625723,False
4,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Its the best of the Andaz Brand in the US....”,I have stayed at each of the US Andaz properti...,"{'username': 'JamesE339', 'num_cities': 34, 'n...",December 2012,1762573,0,2012-12-17,147612823,False


## Data Preprocessing

#### Drop useless columns

In [4]:
cols_to_drop_offerings = ['hotel_class', 'region_id', 'url', 'phone', 'details', 'address', 'type']
offerings = offerings.drop(cols_to_drop_offerings, axis=1)

cols_to_drop_review = ['author', 'date_stayed', 'num_helpful_votes', 'date', 'id', 'via_mobile']
reviews = reviews.drop(cols_to_drop_review, axis=1)

#### Create rating columns

In [5]:
import pandas as pd
import ast
required_keys = ['service', 'cleanliness', 'overall', 'value', 'location', 'sleep_quality', 'rooms']

# Function to preprocess the ratings column
def preprocess_ratings(dataframe):
    # Convert the 'ratings' column from string to dictionary using ast.literal_eval
    dataframe['ratings_dict'] = dataframe['ratings'].apply(lambda x: ast.literal_eval(x))
    
    # Define the required keys
    
    # Extract only the required keys from the dictionary
    dataframe['filtered_ratings_dict'] = dataframe['ratings_dict'].apply(
        lambda d: {key: d[key] for key in required_keys if key in d}
    )
    
    # Normalize the filtered dictionary column into separate columns
    ratings_expanded = pd.json_normalize(dataframe['filtered_ratings_dict'])
    
    # Drop rows where any of the required keys are missing
    ratings_filtered = ratings_expanded.dropna(subset=required_keys)
    
    # Add the expanded columns back to the original dataframe
    dataframe = dataframe.join(ratings_filtered)
    
    # Drop the intermediate columns
    dataframe = dataframe.drop(columns=['ratings', 'ratings_dict', 'filtered_ratings_dict'])
    
    return dataframe

# Apply the preprocessing function
reviews = preprocess_ratings(reviews)


In [6]:
print(reviews.isnull().sum())
reviews.dropna(inplace=True)
print(reviews.isnull().sum())

title                 0
text                  0
offering_id           0
service          442170
cleanliness      442170
overall          442170
value            442170
location         442170
sleep_quality    442170
rooms            442170
dtype: int64
title            0
text             0
offering_id      0
service          0
cleanliness      0
overall          0
value            0
location         0
sleep_quality    0
rooms            0
dtype: int64


#### Joining offerings and reviews

In [7]:
# Merge the reviews and offerings dataframes on the offering_id and id columns
merged_df = pd.merge(reviews, offerings, left_on='offering_id', right_on='id')

# Group by the hotel id and name, and calculate the mean for the rating columns and count for the number of reviews
grouped_df = merged_df.groupby(['id', 'name']).agg(
    {cat: 'mean' for cat in required_keys} | {'text': ' '.join} | {'offering_id': 'count'}
).rename(columns={'offering_id': 'num_reviews'}).reset_index()

# Display the resulting dataframe
grouped_df.head()

Unnamed: 0,id,name,service,cleanliness,overall,value,location,sleep_quality,rooms,text,num_reviews
0,72572,BEST WESTERN PLUS Pioneer Square Hotel,4.60101,4.636364,4.388889,4.323232,4.570707,4.333333,4.282828,I had to make fast visit to seattle and I foun...,198
1,72579,BEST WESTERN Loyal Inn,4.232,4.24,3.888,4.152,4.192,3.768,3.856,"Great service, rooms were clean, could use som...",125
2,72586,BEST WESTERN PLUS Executive Inn,4.25,4.287879,4.045455,4.05303,4.537879,4.113636,3.992424,Beautiful views of the space needle - especial...,132
3,72598,Comfort Inn & Suites Seattle,3.243243,3.243243,2.918919,3.054054,3.027027,3.27027,3.189189,This hotel is in need of some serious updates....,37
4,73236,Days Inn San Antonio/Near Lackland AFB,4.277778,3.111111,3.388889,3.777778,4.111111,3.722222,3.222222,My experience at this days inn was perfect. th...,18


In [8]:
print(grouped_df.shape)
display(grouped_df.head())

(3754, 11)


Unnamed: 0,id,name,service,cleanliness,overall,value,location,sleep_quality,rooms,text,num_reviews
0,72572,BEST WESTERN PLUS Pioneer Square Hotel,4.60101,4.636364,4.388889,4.323232,4.570707,4.333333,4.282828,I had to make fast visit to seattle and I foun...,198
1,72579,BEST WESTERN Loyal Inn,4.232,4.24,3.888,4.152,4.192,3.768,3.856,"Great service, rooms were clean, could use som...",125
2,72586,BEST WESTERN PLUS Executive Inn,4.25,4.287879,4.045455,4.05303,4.537879,4.113636,3.992424,Beautiful views of the space needle - especial...,132
3,72598,Comfort Inn & Suites Seattle,3.243243,3.243243,2.918919,3.054054,3.027027,3.27027,3.189189,This hotel is in need of some serious updates....,37
4,73236,Days Inn San Antonio/Near Lackland AFB,4.277778,3.111111,3.388889,3.777778,4.111111,3.722222,3.222222,My experience at this days inn was perfect. th...,18


## Testing and evaluating BM25 with 100 random queries

### Initialization

In [9]:
from rank_bm25 import BM25Okapi

corpus = grouped_df['text'].tolist()
tokenized_corpus = [doc.split(" ") for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

### Querying on N documents

In [10]:
import random

n_queries = 50 # For testing, we will use 10 queries
random.seed(42)
query_ids = random.sample(range(len(corpus)), n_queries)
print(query_ids)

[2619, 456, 102, 3037, 1126, 1003, 914, 571, 3016, 419, 2771, 3033, 3654, 2233, 356, 2418, 1728, 130, 122, 383, 895, 952, 2069, 2465, 108, 2298, 814, 2932, 2661, 2872, 2232, 1718, 902, 1839, 2413, 1139, 3315, 3560, 26, 3108, 3300, 653, 2859, 1731, 1393, 1138, 636, 881, 3127, 1378]


In [64]:
import numpy as np

mses = []
best_doc_ids = []

for query_id in query_ids:
    query = corpus[query_id]
    tokenized_query = query.split(" ")
    doc_scores = bm25.get_scores(tokenized_query)
    doc_id = doc_scores.argsort()[-2]
    best_doc_ids.append(doc_id)
    mse = np.mean((grouped_df.iloc[query_id][required_keys] - grouped_df.iloc[doc_id][required_keys])**2)
    mses.append(mse)
    
# Calculate the mean mse
mean_mse = np.mean(mses)
print(mean_mse)
print(best_doc_ids)

0.5775039596383706
[2727, 867, 546, 718, 598, 232, 2657, 1833, 244, 163, 832, 814, 3020, 2239, 371, 2886, 239, 149, 551, 316, 1619, 156, 1310, 3296, 1761, 601, 1156, 1622, 3192, 3081, 232, 3648, 531, 2023, 316, 1144, 2023, 137, 1010, 2023, 3188, 718, 436, 341, 1622, 2639, 607, 535, 718, 2850]


**Results**
- Score: 0.5775039596383706
- Best IDs: [2727, 867, 546, 718, 598, 232, 2657, 1833, 244, 163, 832, 814, 3020, 2239, 371, 2886, 239, 149, 551, 316, 1619, 156, 1310, 3296, 1761, 601, 1156, 1622, 3192, 3081, 232, 3648, 531, 2023, 316, 1144, 2023, 137, 1010, 2023, 3188, 718, 436, 341, 1622, 2639, 607, 535, 718, 2850]

### With NLP preprocessing

In [19]:
import nltk
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    regex_tokenizer = RegexpTokenizer('\w\w+')
    text = regex_tokenizer.tokenize(text)
    text = [word for word in text if word not in stopwords]
    stemmer = PorterStemmer()
    text = [stemmer.stem(word) for word in text]
    text = ' '.join(text)
    return text

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ringi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ringi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
processed_corpus = [clean_text(doc) for doc in corpus]
#processed_tokenized_corpus = [doc.split(" ") for doc in processed_corpus]
#processed_bm25 = BM25Okapi(processed_tokenized_corpus)

In [67]:
import numpy as np

processed_mses = []
processed_best_doc_ids = []

for query_id in query_ids:
    query = processed_corpus[query_id]
    tokenized_query = query.split(" ")
    doc_scores = processed_bm25.get_scores(tokenized_query)
    doc_id = doc_scores.argsort()[-2]
    processed_best_doc_ids.append(doc_id)
    mse = np.mean((grouped_df.iloc[query_id][required_keys] - grouped_df.iloc[doc_id][required_keys])**2)
    processed_mses.append(mse)
    
# Calculate the mean mse
processed_mean_mse = np.mean(processed_mses)
print(processed_mean_mse)
print(processed_best_doc_ids)

0.4264256713920574
[2727, 960, 546, 3603, 615, 239, 1874, 1833, 244, 2023, 832, 1276, 3020, 714, 371, 2886, 239, 149, 551, 316, 2618, 383, 1310, 3296, 1761, 1622, 1156, 1761, 1683, 3081, 1010, 3648, 1761, 573, 115, 1144, 1622, 115, 1010, 539, 3188, 718, 3258, 341, 1622, 2639, 3603, 1606, 3020, 3656]


**Results**
- Score: 0.4264256713920574
- Best IDs: [2727, 960, 546, 3603, 615, 239, 1874, 1833, 244, 2023, 832, 1276, 3020, 714, 371, 2886, 239, 149, 551, 316, 2618, 383, 1310, 3296, 1761, 1622, 1156, 1761, 1683, 3081, 1010, 3648, 1761, 573, 115, 1144, 1622, 115, 1010, 539, 3188, 718, 3258, 341, 1622, 2639, 3603, 1606, 3020, 3656]

# T5 TEST

In [30]:
from transformers import T5Tokenizer, T5Model
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random
import sentencepiece
import torch

# Load T5 model and tokenizer
model_name = "t5-base"  # Change to "t5-base" or "t5-large" for better results
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5Model.from_pretrained(model_name)

# Ensure reproducibility
random.seed(42)
np.random.seed(42)

# Function to encode text into embeddings using T5
def encode_text(text, max_length=512):
    # Tokenize and prepare inputs
    inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True, padding="max_length")
    # Pass inputs through T5 encoder to get embeddings
    with torch.no_grad():
        outputs = model.encoder(**inputs)
    # Mean pooling of the token embeddings
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Preprocess corpus to get embeddings
print("Encoding documents into embeddings...")
document_embeddings = np.array([encode_text(doc) for doc in corpus])




spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Encoding documents into embeddings...


In [32]:
# Generate random queries
n_queries = 100 # For testing, we will use 10 queries
random.seed(42)
query_ids = random.sample(range(len(corpus)), n_queries)
print(query_ids)

mses = []
best_doc_ids = []

# Iterate through random queries
print("Processing queries...")
for query_id in query_ids:
    query = corpus[query_id]
    query_embedding = encode_text(query)
    
    # Compute cosine similarities
    doc_scores = cosine_similarity([query_embedding], document_embeddings).flatten()
    
    # Get the second-highest score
    doc_id = np.argsort(doc_scores)[-2]  # Second-highest score
    best_doc_ids.append(doc_id)
    
    # Calculate MSE for evaluation
    mse = np.mean((grouped_df.iloc[query_id][required_keys] - grouped_df.iloc[doc_id][required_keys])**2)
    mses.append(mse)

# Calculate the mean MSE
mean_mse = np.mean(mses)
print(f"Mean MSE: {mean_mse}")
print("Best doc IDs:", best_doc_ids)

[2619, 456, 102, 3037, 1126, 1003, 914, 571, 3016, 419, 2771, 3033, 3654, 2233, 356, 2418, 1728, 130, 122, 383, 895, 952, 2069, 2465, 108, 2298, 814, 2932, 2661, 2872, 2232, 1718, 902, 1839, 2413, 1139, 3315, 3560, 26, 3108, 3300, 653, 2859, 1731, 1393, 1138, 636, 881, 3127, 1378, 418, 379, 1556, 396, 1470, 3471, 1408, 2472, 1083, 3305, 177, 2988, 1881, 2196, 511, 1550, 322, 2261, 1200, 3397, 2574, 2533, 3626, 3529, 1481, 2364, 787, 2885, 284, 187, 2708, 933, 3166, 1185, 326, 3503, 953, 3549, 413, 1857, 2603, 3416, 1494, 666, 1516, 1455, 858, 2745, 1093, 2874]
Processing queries...
Mean MSE: 0.5515936991289846
Best doc IDs: [2639, 1433, 1945, 504, 2100, 1801, 2631, 3638, 1732, 3606, 3084, 1491, 639, 1887, 200, 3080, 1028, 2957, 1711, 354, 3584, 2978, 3313, 2750, 1182, 644, 114, 2772, 2541, 470, 1417, 2348, 305, 1053, 430, 2307, 2655, 2530, 628, 2928, 0, 2141, 83, 249, 181, 436, 594, 557, 754, 126, 1303, 530, 1404, 2962, 1523, 1971, 372, 3191, 2969, 3503, 1438, 3521, 645, 594, 165, 1317

- T5 small no preprocess Mean MSE: 0.47295596331040385
- T5 small with preprocess Mean MSE: 0.48910285125897723
- T5 base with no preprocess Mean MSE: 0.51


100 queries :
- BM25 baseline
- BM25 preprocessed
- TF-IDF baseline 
- TF-IDF preprocessed 
- BERT baseline
- BERT preprocessed
- T5 small baseline
- T5 small preprocessed
- T5 base  : 0.5515936991289846
- T5 base preprocessed : no



In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Build TF-IDF matrix
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(corpus)

mses = []
best_doc_ids = []

# Iterate through random queries
for query_id in query_ids:
    query = corpus[query_id]
    query_vector = tfidf.transform([query])
    doc_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()
    doc_id = np.argsort(doc_scores)[-2]  # Second-highest score
    best_doc_ids.append(doc_id)
    mse = np.mean((grouped_df.iloc[query_id][required_keys] - grouped_df.iloc[doc_id][required_keys])**2)
    mses.append(mse)

mean_mse = np.mean(mses)
print(mean_mse)
print(best_doc_ids)


0.5361740321084517
[1602, 704, 2365, 767, 598, 2735, 1874, 1833, 2610, 179, 2838, 3529, 3351, 2827, 371, 178, 110, 1640, 3211, 1774, 1277, 253, 95, 3296, 405, 2580, 3484, 3077, 3247, 3081, 2727, 3648, 1646, 1831, 3356, 1144, 3231, 2662, 2889, 179, 3188, 756, 591, 364, 962, 1140, 640, 1277, 179, 2850]
