# Natural Language Processing  : First Project
# TripAdvisor Recommendation Challenge
# Beating BM25
#### **Santiago Martin & Léo Ringeissen**

## Data loading

In [2]:
import pandas as pd
offerings = pd.read_csv('data/offerings.csv',sep=',',header=0)
reviews = pd.read_csv('data/reviews.csv',sep=',',header=0)

In [3]:
offerings.head()

Unnamed: 0,hotel_class,region_id,url,phone,details,address,type,id,name
0,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '147 West 4...",hotel,113317,Casablanca Hotel Times Square
1,5.0,32655,http://www.tripadvisor.com/Hotel_Review-g32655...,,,"{'region': 'CA', 'street-address': '300 S Dohe...",hotel,76049,Four Seasons Hotel Los Angeles at Beverly Hills
2,3.5,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '790 Eighth...",hotel,99352,Hilton Garden Inn Times Square
3,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '152 West 5...",hotel,93589,The Michelangelo Hotel
4,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '130 West 4...",hotel,217616,The Muse Hotel New York


In [4]:
reviews.head()

Unnamed: 0,ratings,title,text,author,date_stayed,offering_id,num_helpful_votes,date,id,via_mobile
0,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...","“Truly is ""Jewel of the Upper Wets Side""”",Stayed in a king suite for 11 nights and yes i...,"{'username': 'Papa_Panda', 'num_cities': 22, '...",December 2012,93338,0,2012-12-17,147643103,False
1,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“My home away from home!”,"On every visit to NYC, the Hotel Beacon is the...","{'username': 'Maureen V', 'num_reviews': 2, 'n...",December 2012,93338,0,2012-12-17,147639004,False
2,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Great Stay”,This is a great property in Midtown. We two di...,"{'username': 'vuguru', 'num_cities': 12, 'num_...",December 2012,1762573,0,2012-12-18,147697954,False
3,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“Modern Convenience”,The Andaz is a nice hotel in a central locatio...,"{'username': 'Hotel-Designer', 'num_cities': 5...",August 2012,1762573,0,2012-12-17,147625723,False
4,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Its the best of the Andaz Brand in the US....”,I have stayed at each of the US Andaz properti...,"{'username': 'JamesE339', 'num_cities': 34, 'n...",December 2012,1762573,0,2012-12-17,147612823,False


## Data Preprocessing

#### Drop useless columns

In [5]:
cols_to_drop_offerings = ['hotel_class', 'region_id', 'url', 'phone', 'details', 'address', 'type']
offerings = offerings.drop(cols_to_drop_offerings, axis=1)

cols_to_drop_review = ['author', 'date_stayed', 'num_helpful_votes', 'date', 'id', 'via_mobile']
reviews = reviews.drop(cols_to_drop_review, axis=1)

#### Create rating columns

In [6]:
import pandas as pd
import ast
required_keys = ['service', 'cleanliness', 'overall', 'value', 'location', 'sleep_quality', 'rooms']

# Function to preprocess the ratings column
def preprocess_ratings(dataframe):
    # Convert the 'ratings' column from string to dictionary using ast.literal_eval
    dataframe['ratings_dict'] = dataframe['ratings'].apply(lambda x: ast.literal_eval(x))
    
    # Define the required keys
    
    # Extract only the required keys from the dictionary
    dataframe['filtered_ratings_dict'] = dataframe['ratings_dict'].apply(
        lambda d: {key: d[key] for key in required_keys if key in d}
    )
    
    # Normalize the filtered dictionary column into separate columns
    ratings_expanded = pd.json_normalize(dataframe['filtered_ratings_dict'])
    
    # Drop rows where any of the required keys are missing
    ratings_filtered = ratings_expanded.dropna(subset=required_keys)
    
    # Add the expanded columns back to the original dataframe
    dataframe = dataframe.join(ratings_filtered)
    
    # Drop the intermediate columns
    dataframe = dataframe.drop(columns=['ratings', 'ratings_dict', 'filtered_ratings_dict'])
    
    return dataframe

# Apply the preprocessing function
reviews = preprocess_ratings(reviews)


In [7]:
print(reviews.isnull().sum())
reviews.dropna(inplace=True)
print(reviews.isnull().sum())

title                 0
text                  0
offering_id           0
service          442170
cleanliness      442170
overall          442170
value            442170
location         442170
sleep_quality    442170
rooms            442170
dtype: int64
title            0
text             0
offering_id      0
service          0
cleanliness      0
overall          0
value            0
location         0
sleep_quality    0
rooms            0
dtype: int64


#### Joining offerings and reviews

In [8]:
# Merge the reviews and offerings dataframes on the offering_id and id columns
merged_df = pd.merge(reviews, offerings, left_on='offering_id', right_on='id')

# Group by the hotel id and name, and calculate the mean for the rating columns and count for the number of reviews
grouped_df = merged_df.groupby(['id', 'name']).agg(
    {cat: 'mean' for cat in required_keys} | {'text': ' '.join} | {'offering_id': 'count'}
).rename(columns={'offering_id': 'num_reviews'}).reset_index()

# Display the resulting dataframe
grouped_df.head()

Unnamed: 0,id,name,service,cleanliness,overall,value,location,sleep_quality,rooms,text,num_reviews
0,72572,BEST WESTERN PLUS Pioneer Square Hotel,4.60101,4.636364,4.388889,4.323232,4.570707,4.333333,4.282828,I had to make fast visit to seattle and I foun...,198
1,72579,BEST WESTERN Loyal Inn,4.232,4.24,3.888,4.152,4.192,3.768,3.856,"Great service, rooms were clean, could use som...",125
2,72586,BEST WESTERN PLUS Executive Inn,4.25,4.287879,4.045455,4.05303,4.537879,4.113636,3.992424,Beautiful views of the space needle - especial...,132
3,72598,Comfort Inn & Suites Seattle,3.243243,3.243243,2.918919,3.054054,3.027027,3.27027,3.189189,This hotel is in need of some serious updates....,37
4,73236,Days Inn San Antonio/Near Lackland AFB,4.277778,3.111111,3.388889,3.777778,4.111111,3.722222,3.222222,My experience at this days inn was perfect. th...,18


In [9]:
print(grouped_df.shape)
display(grouped_df.head())

(3754, 11)


Unnamed: 0,id,name,service,cleanliness,overall,value,location,sleep_quality,rooms,text,num_reviews
0,72572,BEST WESTERN PLUS Pioneer Square Hotel,4.60101,4.636364,4.388889,4.323232,4.570707,4.333333,4.282828,I had to make fast visit to seattle and I foun...,198
1,72579,BEST WESTERN Loyal Inn,4.232,4.24,3.888,4.152,4.192,3.768,3.856,"Great service, rooms were clean, could use som...",125
2,72586,BEST WESTERN PLUS Executive Inn,4.25,4.287879,4.045455,4.05303,4.537879,4.113636,3.992424,Beautiful views of the space needle - especial...,132
3,72598,Comfort Inn & Suites Seattle,3.243243,3.243243,2.918919,3.054054,3.027027,3.27027,3.189189,This hotel is in need of some serious updates....,37
4,73236,Days Inn San Antonio/Near Lackland AFB,4.277778,3.111111,3.388889,3.777778,4.111111,3.722222,3.222222,My experience at this days inn was perfect. th...,18


## Preparing 100 random queries and preprocessing

### Initialization

In [10]:

corpus = grouped_df['text'].tolist()
tokenized_corpus = [doc.split(" ") for doc in corpus]

### Querying on N documents

In [11]:
import random

n_queries = 100 # For testing, we will use 10 queries
random.seed(42)
query_ids = random.sample(range(len(corpus)), n_queries)
print(query_ids)

[2619, 456, 102, 3037, 1126, 1003, 914, 571, 3016, 419, 2771, 3033, 3654, 2233, 356, 2418, 1728, 130, 122, 383, 895, 952, 2069, 2465, 108, 2298, 814, 2932, 2661, 2872, 2232, 1718, 902, 1839, 2413, 1139, 3315, 3560, 26, 3108, 3300, 653, 2859, 1731, 1393, 1138, 636, 881, 3127, 1378, 418, 379, 1556, 396, 1470, 3471, 1408, 2472, 1083, 3305, 177, 2988, 1881, 2196, 511, 1550, 322, 2261, 1200, 3397, 2574, 2533, 3626, 3529, 1481, 2364, 787, 2885, 284, 187, 2708, 933, 3166, 1185, 326, 3503, 953, 3549, 413, 1857, 2603, 3416, 1494, 666, 1516, 1455, 858, 2745, 1093, 2874]


### NLP preprocessing

In [None]:
import nltk
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    regex_tokenizer = RegexpTokenizer('\w\w+')
    text = regex_tokenizer.tokenize(text)
    text = [word for word in text if word not in stopwords]
    stemmer = PorterStemmer()
    text = [stemmer.stem(word) for word in text]
    text = ' '.join(text)
    return text

In [None]:
processed_corpus = [clean_text(doc) for doc in corpus]
processed_tokenized_corpus = [doc.split(" ") for doc in processed_corpus]

## BM25

### Baseline

In [None]:
import numpy as np
from rank_bm25 import BM25Okapi


bm25 = BM25Okapi(tokenized_corpus)

mses = []
best_doc_ids = []

for query_id in query_ids:
    query = corpus[query_id]
    tokenized_query = query.split(" ")
    doc_scores = bm25.get_scores(tokenized_query)
    doc_id = doc_scores.argsort()[-2]
    best_doc_ids.append(doc_id)
    mse = np.mean((grouped_df.iloc[query_id][required_keys] - grouped_df.iloc[doc_id][required_keys])**2)
    mses.append(mse)
    
# Calculate the mean mse
mean_mse_bm25_baseline = np.mean(mses)
print(mean_mse_bm25_baseline)
print(best_doc_ids)

0.5390851523430844
[2727, 867, 546, 718, 598, 232, 2657, 1833, 244, 163, 832, 814, 3020, 2239, 371, 2886, 239, 149, 551, 316, 1619, 156, 1310, 3296, 1761, 601, 1156, 1622, 3192, 3081, 232, 3648, 531, 2023, 316, 1144, 2023, 137, 1010, 2023, 3188, 718, 436, 341, 1622, 2639, 607, 535, 718, 2850, 489, 208, 1622, 371, 404, 1717, 3081, 464, 3140, 1222, 460, 1691, 239, 607, 509, 426, 464, 1619, 239, 157, 239, 1334, 2639, 1761, 2023, 382, 1156, 2032, 323, 3656, 1606, 607, 179, 1222, 221, 1622, 1606, 3579, 555, 489, 3020, 1640, 601, 1334, 1010, 1192, 841, 2431, 323, 804]


**Results**
- Score: 0.5390851523430844
- Best IDs: [2727, 867, 546, 718, 598, 232, 2657, 1833, 244, 163, 832, 814, 3020, 2239, 371, 2886, 239, 149, 551, 316, 1619, 156, 1310, 3296, 1761, 601, 1156, 1622, 3192, 3081, 232, 3648, 531, 2023, 316, 1144, 2023, 137, 1010, 2023, 3188, 718, 436, 341, 1622, 2639, 607, 535, 718, 2850, 489, 208, 1622, 371, 404, 1717, 3081, 464, 3140, 1222, 460, 1691, 239, 607, 509, 426, 464, 1619, 239, 157, 239, 1334, 2639, 1761, 2023, 382, 1156, 2032, 323, 3656, 1606, 607, 179, 1222, 221, 1622, 1606, 3579, 555, 489, 3020, 1640, 601, 1334, 1010, 1192, 841, 2431, 323, 804]


### With NLP preprocessing

In [None]:
import numpy as np

processed_bm25 = BM25Okapi(processed_tokenized_corpus)

processed_mses = []
processed_best_doc_ids = []

for query_id in query_ids:
    query = processed_corpus[query_id]
    tokenized_query = query.split(" ")
    doc_scores = processed_bm25.get_scores(tokenized_query)
    doc_id = doc_scores.argsort()[-2]
    processed_best_doc_ids.append(doc_id)
    mse = np.mean((grouped_df.iloc[query_id][required_keys] - grouped_df.iloc[doc_id][required_keys])**2)
    processed_mses.append(mse)
    
# Calculate the mean mse
mean_mse_bm25_preprocessed = np.mean(processed_mses)
print(mean_mse_bm25_preprocessed)
print(processed_best_doc_ids)

0.418855258103762
[2727, 960, 546, 3603, 615, 239, 1874, 1833, 244, 2023, 832, 1276, 3020, 714, 371, 2886, 239, 149, 551, 316, 2618, 383, 1310, 3296, 1761, 1622, 1156, 1761, 1683, 3081, 1010, 3648, 1761, 573, 115, 1144, 1622, 115, 1010, 539, 3188, 718, 3258, 341, 1622, 2639, 3603, 1606, 3020, 3656, 3442, 208, 1622, 371, 867, 2105, 601, 1310, 1719, 1222, 460, 1691, 1691, 895, 506, 573, 464, 239, 2882, 157, 867, 1334, 1622, 1761, 817, 260, 824, 685, 323, 3656, 1606, 607, 2239, 1222, 221, 1622, 867, 1266, 555, 489, 3020, 3188, 1622, 1334, 1010, 3156, 841, 2431, 1619, 804]


**Results**
- Score: 0.418855258103762

- Best IDs: [2727, 960, 546, 3603, 615, 239, 1874, 1833, 244, 2023, 832, 1276, 3020, 714, 371, 2886, 239, 149, 551, 316, 2618, 383, 1310, 3296, 1761, 1622, 1156, 1761, 1683, 3081, 1010, 3648, 1761, 573, 115, 1144, 1622, 115, 1010, 539, 3188, 718, 3258, 341, 1622, 2639, 3603, 1606, 3020, 3656, 3442, 208, 1622, 371, 867, 2105, 601, 1310, 1719, 1222, 460, 1691, 1691, 895, 506, 573, 464, 239, 2882, 157, 867, 1334, 1622, 1761, 817, 260, 824, 685, 323, 3656, 1606, 607, 2239, 1222, 221, 1622, 867, 1266, 555, 489, 3020, 3188, 1622, 1334, 1010, 3156, 841, 2431, 1619, 804]


## Testing TF-IDF

### No Preprocessing

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Build TF-IDF matrix
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(corpus)

mses = []
best_doc_ids = []

# Iterate through random queries
for query_id in query_ids:
    query = corpus[query_id]
    query_vector = tfidf.transform([query])
    doc_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()
    doc_id = np.argsort(doc_scores)[-2]  # Second-highest score
    best_doc_ids.append(doc_id)
    mse = np.mean((grouped_df.iloc[query_id][required_keys] - grouped_df.iloc[doc_id][required_keys])**2)
    mses.append(mse)

mean_mse_tf_baseline = np.mean(mses)
print(mean_mse_tf_baseline)
print(best_doc_ids)


0.4926256741935092
[1602, 704, 2365, 767, 598, 2735, 1874, 1833, 2610, 179, 2838, 3529, 3351, 2827, 371, 178, 110, 1640, 3211, 1774, 1277, 253, 95, 3296, 405, 2580, 3484, 3077, 3247, 3081, 2727, 3648, 1646, 1831, 3356, 1144, 3231, 2662, 2889, 179, 3188, 756, 591, 364, 962, 1140, 640, 1277, 179, 2850, 962, 378, 2139, 371, 3607, 2105, 179, 464, 1536, 2735, 1790, 1996, 1640, 892, 512, 464, 95, 3124, 2226, 1749, 464, 1053, 1984, 1413, 2370, 285, 2437, 3391, 298, 100, 2258, 636, 3433, 1189, 221, 2570, 3607, 1273, 128, 512, 3351, 3188, 3209, 1640, 3190, 3572, 100, 1606, 1028, 2032]


**Results**
- Score: 0.4926256741935092

- Best IDs: [1602, 704, 2365, 767, 598, 2735, 1874, 1833, 2610, 179, 2838, 3529, 3351, 2827, 371, 178, 110, 1640, 3211, 1774, 1277, 253, 95, 3296, 405, 2580, 3484, 3077, 3247, 3081, 2727, 3648, 1646, 1831, 3356, 1144, 3231, 2662, 2889, 179, 3188, 756, 591, 364, 962, 1140, 640, 1277, 179, 2850, 962, 378, 2139, 371, 3607, 2105, 179, 464, 1536, 2735, 1790, 1996, 1640, 892, 512, 464, 95, 3124, 2226, 1749, 464, 1053, 1984, 1413, 2370, 285, 2437, 3391, 298, 100, 2258, 636, 3433, 1189, 221, 2570, 3607, 1273, 128, 512, 3351, 3188, 3209, 1640, 3190, 3572, 100, 1606, 1028, 2032]


### With preprocessing

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Build TF-IDF matrix
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(processed_corpus)

mses = []
best_doc_ids = []

# Iterate through random queries
for query_id in query_ids:
    query = processed_corpus[query_id]
    query_vector = tfidf.transform([query])
    doc_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()
    doc_id = np.argsort(doc_scores)[-2]  # Second-highest score
    best_doc_ids.append(doc_id)
    mse = np.mean((grouped_df.iloc[query_id][required_keys] - grouped_df.iloc[doc_id][required_keys])**2)
    mses.append(mse)

mean_mse_tf_preprocessed = np.mean(mses)
print(mean_mse_tf_preprocessed)
print(best_doc_ids)


0.4355368491613478
[1602, 937, 1094, 767, 2906, 1006, 1874, 1833, 2610, 178, 2838, 3529, 3351, 2571, 371, 221, 110, 817, 3211, 1774, 892, 253, 2107, 3296, 1597, 2580, 3484, 3077, 3247, 3081, 2727, 3648, 1646, 1831, 825, 1144, 3145, 3108, 2107, 1795, 3188, 756, 290, 3125, 1087, 2730, 1041, 1277, 3336, 2850, 2602, 378, 2283, 371, 3607, 2279, 2426, 943, 1536, 2244, 1790, 1749, 1702, 892, 512, 95, 2643, 3124, 2244, 1884, 95, 1882, 1984, 1413, 3389, 2428, 2437, 1225, 1982, 100, 19, 636, 3433, 1189, 248, 2570, 2643, 2362, 128, 512, 3351, 3188, 2577, 3145, 3190, 3572, 833, 2431, 1028, 2032]


**Results**
- Score: 0.4355368491613478

- Best IDs: [1602, 937, 1094, 767, 2906, 1006, 1874, 1833, 2610, 178, 2838, 3529, 3351, 2571, 371, 221, 110, 817, 3211, 1774, 892, 253, 2107, 3296, 1597, 2580, 3484, 3077, 3247, 3081, 2727, 3648, 1646, 1831, 825, 1144, 3145, 3108, 2107, 1795, 3188, 756, 290, 3125, 1087, 2730, 1041, 1277, 3336, 2850, 2602, 378, 2283, 371, 3607, 2279, 2426, 943, 1536, 2244, 1790, 1749, 1702, 892, 512, 95, 2643, 3124, 2244, 1884, 95, 1882, 1984, 1413, 3389, 2428, 2437, 1225, 1982, 100, 19, 636, 3433, 1189, 248, 2570, 2643, 2362, 128, 512, 3351, 3188, 2577, 3145, 3190, 3572, 833, 2431, 1028, 2032]



# Flan T5

## Small model

In [13]:
from transformers import T5Tokenizer, T5Model
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random
import sentencepiece
import torch




In [14]:
from transformers import T5Tokenizer, T5Model
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random
import sentencepiece
import torch

# Load T5 model and tokenizer
model_name = "t5-small"  # Change to "t5-base" or "t5-large" for better results
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5Model.from_pretrained(model_name)

# Ensure reproducibility
random.seed(42)
np.random.seed(42)

# Function to encode text into embeddings using T5
def encode_text(text, max_length=512):
    # Tokenize and prepare inputs
    inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True, padding="max_length")
    # Pass inputs through T5 encoder to get embeddings
    with torch.no_grad():
        outputs = model.encoder(**inputs)
    # Mean pooling of the token embeddings
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Preprocess corpus to get embeddings
print("Encoding documents into embeddings...")
document_embeddings = np.array([encode_text(doc) for doc in corpus])

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Encoding documents into embeddings...


In [15]:
mses = []
best_doc_ids = []

# Iterate through random queries
print("Processing queries...")
for query_id in query_ids:
    query = corpus[query_id]
    query_embedding = encode_text(query)
    
    # Compute cosine similarities
    doc_scores = cosine_similarity([query_embedding], document_embeddings).flatten()
    
    # Get the second-highest score
    doc_id = np.argsort(doc_scores)[-2]  # Second-highest score
    best_doc_ids.append(doc_id)
    
    # Calculate MSE for evaluation
    mse = np.mean((grouped_df.iloc[query_id][required_keys] - grouped_df.iloc[doc_id][required_keys])**2)
    mses.append(mse)

# Calculate the mean MSE
mean_mse_t5s_baseline = np.mean(mses)
print(f"Mean MSE: {mean_mse_t5s_baseline}")
print("Best doc IDs:", best_doc_ids)

Processing queries...
Mean MSE: 0.6340350221023886
Best doc IDs: [2333, 560, 1016, 945, 1796, 594, 657, 1619, 163, 1488, 3702, 670, 3454, 1122, 310, 141, 2303, 2229, 540, 1724, 1713, 1369, 48, 2208, 1182, 3462, 114, 1599, 2541, 2344, 1625, 2348, 1919, 3702, 62, 2074, 3643, 2530, 504, 364, 623, 856, 1978, 2788, 1126, 287, 3421, 197, 713, 38, 2038, 2829, 2125, 69, 951, 170, 832, 821, 3437, 2998, 2663, 3521, 461, 3659, 1904, 864, 3208, 2510, 2880, 754, 442, 2605, 3463, 499, 1544, 1161, 830, 3748, 695, 699, 1791, 2681, 2434, 1974, 1475, 2003, 1070, 978, 2679, 1796, 2702, 3404, 998, 1872, 2151, 1494, 2581, 37, 2062, 1550]


**Results**
- Score: 0.6340350221023886

- Best IDs: [2333, 560, 1016, 945, 1796, 594, 657, 1619, 163, 1488, 3702, 670, 3454, 1122, 310, 141, 2303, 2229, 540, 1724, 1713, 1369, 48, 2208, 1182, 3462, 114, 1599, 2541, 2344, 1625, 2348, 1919, 3702, 62, 2074, 3643, 2530, 504, 364, 623, 856, 1978, 2788, 1126, 287, 3421, 197, 713, 38, 2038, 2829, 2125, 69, 951, 170, 832, 821, 3437, 2998, 2663, 3521, 461, 3659, 1904, 864, 3208, 2510, 2880, 754, 442, 2605, 3463, 499, 1544, 1161, 830, 3748, 695, 699, 1791, 2681, 2434, 1974, 1475, 2003, 1070, 978, 2679, 1796, 2702, 3404, 998, 1872, 2151, 1494, 2581, 37, 2062, 1550]



### Preprocessed

In [16]:

# Load T5 model and tokenizer
model_name = "t5-small"  # Change to "t5-base" or "t5-large" for better results
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5Model.from_pretrained(model_name)

# Ensure reproducibility
random.seed(42)
np.random.seed(42)

# Function to encode text into embeddings using T5
def encode_text(text, max_length=512):
    # Tokenize and prepare inputs
    inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True, padding="max_length")
    # Pass inputs through T5 encoder to get embeddings
    with torch.no_grad():
        outputs = model.encoder(**inputs)
    # Mean pooling of the token embeddings
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Preprocess corpus to get embeddings
print("Encoding documents into embeddings...")
document_embeddings = np.array([encode_text(doc) for doc in processed_corpus])

Encoding documents into embeddings...


In [17]:
mses = []
best_doc_ids = []

# Iterate through random queries
print("Processing queries...")
for query_id in query_ids:
    query = processed_corpus[query_id]
    query_embedding = encode_text(query)
    
    # Compute cosine similarities
    doc_scores = cosine_similarity([query_embedding], document_embeddings).flatten()
    
    # Get the second-highest score
    doc_id = np.argsort(doc_scores)[-2]  # Second-highest score
    best_doc_ids.append(doc_id)
    
    # Calculate MSE for evaluation
    mse = np.mean((grouped_df.iloc[query_id][required_keys] - grouped_df.iloc[doc_id][required_keys])**2)
    mses.append(mse)

# Calculate the mean MSE
mean_mse_t5s_preprocessed = np.mean(mses)
print(f"Mean MSE: {mean_mse_t5s_preprocessed}")
print("Best doc IDs:", best_doc_ids)

Processing queries...
Mean MSE: 0.5792905873745835
Best doc IDs: [2242, 1123, 406, 2810, 2955, 2438, 1446, 2343, 167, 45, 1155, 2084, 3564, 663, 639, 3169, 1604, 1148, 551, 959, 1606, 1369, 943, 2304, 1239, 1262, 1228, 3281, 2184, 3453, 902, 3139, 2232, 3596, 3526, 959, 2665, 762, 2122, 962, 1027, 1846, 1831, 607, 3314, 1262, 110, 1288, 3092, 3052, 2460, 3226, 2086, 2010, 766, 2783, 1264, 2928, 596, 159, 2983, 2082, 1559, 3029, 800, 936, 430, 820, 1004, 3438, 853, 1820, 3682, 458, 1439, 3450, 830, 3549, 493, 1156, 904, 2086, 2810, 2783, 1850, 3564, 2204, 3326, 1997, 2783, 2153, 1735, 1496, 2508, 506, 581, 935, 698, 2092, 2435]


**Results**
- Score: 0.5792905873745835

- Best IDs: [2242, 1123, 406, 2810, 2955, 2438, 1446, 2343, 167, 45, 1155, 2084, 3564, 663, 639, 3169, 1604, 1148, 551, 959, 1606, 1369, 943, 2304, 1239, 1262, 1228, 3281, 2184, 3453, 902, 3139, 2232, 3596, 3526, 959, 2665, 762, 2122, 962, 1027, 1846, 1831, 607, 3314, 1262, 110, 1288, 3092, 3052, 2460, 3226, 2086, 2010, 766, 2783, 1264, 2928, 596, 159, 2983, 2082, 1559, 3029, 800, 936, 430, 820, 1004, 3438, 853, 1820, 3682, 458, 1439, 3450, 830, 3549, 493, 1156, 904, 2086, 2810, 2783, 1850, 3564, 2204, 3326, 1997, 2783, 2153, 1735, 1496, 2508, 506, 581, 935, 698, 2092, 2435]



## T5 Base

In [18]:
from transformers import T5Tokenizer, T5Model
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random
import sentencepiece
import torch

# Load T5 model and tokenizer
model_name = "t5-base"  # Change to "t5-base" or "t5-large" for better results
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5Model.from_pretrained(model_name)

# Ensure reproducibility
random.seed(42)
np.random.seed(42)

# Function to encode text into embeddings using T5
def encode_text(text, max_length=512):
    # Tokenize and prepare inputs
    inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True, padding="max_length")
    # Pass inputs through T5 encoder to get embeddings
    with torch.no_grad():
        outputs = model.encoder(**inputs)
    # Mean pooling of the token embeddings
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Preprocess corpus to get embeddings
print("Encoding documents into embeddings...")
document_embeddings = np.array([encode_text(doc) for doc in corpus])

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Encoding documents into embeddings...


In [19]:
mses = []
best_doc_ids = []

# Iterate through random queries
print("Processing queries...")
for query_id in query_ids:
    query = corpus[query_id]
    query_embedding = encode_text(query)
    
    # Compute cosine similarities
    doc_scores = cosine_similarity([query_embedding], document_embeddings).flatten()
    
    # Get the second-highest score
    doc_id = np.argsort(doc_scores)[-2]  # Second-highest score
    best_doc_ids.append(doc_id)
    
    # Calculate MSE for evaluation
    mse = np.mean((grouped_df.iloc[query_id][required_keys] - grouped_df.iloc[doc_id][required_keys])**2)
    mses.append(mse)

# Calculate the mean MSE
mean_mse_t5b_baseline = np.mean(mses)
print(f"Mean MSE: {mean_mse_t5b_baseline}")
print("Best doc IDs:", best_doc_ids)

Processing queries...
Mean MSE: 0.5515936991289846
Best doc IDs: [2639, 1433, 1945, 504, 2100, 1801, 2631, 3638, 1732, 3606, 3084, 1491, 639, 1887, 200, 3080, 1028, 2957, 1711, 354, 3584, 2978, 3313, 2750, 1182, 644, 114, 2772, 2541, 470, 1417, 2348, 305, 1053, 430, 2307, 2655, 2530, 628, 2928, 0, 2141, 83, 249, 181, 436, 594, 557, 754, 126, 1303, 530, 1404, 2962, 1523, 1971, 372, 3191, 2969, 3503, 1438, 3521, 645, 594, 165, 1317, 3208, 622, 3502, 230, 812, 3100, 3135, 1352, 3282, 3387, 3362, 3595, 903, 3488, 2064, 639, 1935, 1319, 1475, 2628, 79, 1392, 919, 2745, 181, 319, 1208, 2778, 359, 1573, 2581, 2393, 827, 1317]


**Results**
- Score: 0.5515936991289846

- Best IDs: [2639, 1433, 1945, 504, 2100, 1801, 2631, 3638, 1732, 3606, 3084, 1491, 639, 1887, 200, 3080, 1028, 2957, 1711, 354, 3584, 2978, 3313, 2750, 1182, 644, 114, 2772, 2541, 470, 1417, 2348, 305, 1053, 430, 2307, 2655, 2530, 628, 2928, 0, 2141, 83, 249, 181, 436, 594, 557, 754, 126, 1303, 530, 1404, 2962, 1523, 1971, 372, 3191, 2969, 3503, 1438, 3521, 645, 594, 165, 1317, 3208, 622, 3502, 230, 812, 3100, 3135, 1352, 3282, 3387, 3362, 3595, 903, 3488, 2064, 639, 1935, 1319, 1475, 2628, 79, 1392, 919, 2745, 181, 319, 1208, 2778, 359, 1573, 2581, 2393, 827, 1317]



### Preprocessed

In [20]:
from transformers import T5Tokenizer, T5Model
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random
import sentencepiece
import torch

# Load T5 model and tokenizer
model_name = "t5-base"  # Change to "t5-base" or "t5-large" for better results
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5Model.from_pretrained(model_name)

# Ensure reproducibility
random.seed(42)
np.random.seed(42)

# Function to encode text into embeddings using T5
def encode_text(text, max_length=512):
    # Tokenize and prepare inputs
    inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True, padding="max_length")
    # Pass inputs through T5 encoder to get embeddings
    with torch.no_grad():
        outputs = model.encoder(**inputs)
    # Mean pooling of the token embeddings
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Preprocess corpus to get embeddings
print("Encoding documents into embeddings...")
document_embeddings = np.array([encode_text(doc) for doc in processed_corpus])

Encoding documents into embeddings...


In [21]:
mses = []
best_doc_ids = []

# Iterate through random queries
print("Processing queries...")
for query_id in query_ids:
    query = processed_corpus[query_id]
    query_embedding = encode_text(query)
    
    # Compute cosine similarities
    doc_scores = cosine_similarity([query_embedding], document_embeddings).flatten()
    
    # Get the second-highest score
    doc_id = np.argsort(doc_scores)[-2]  # Second-highest score
    best_doc_ids.append(doc_id)
    
    # Calculate MSE for evaluation
    mse = np.mean((grouped_df.iloc[query_id][required_keys] - grouped_df.iloc[doc_id][required_keys])**2)
    mses.append(mse)

# Calculate the mean MSE
mean_mse_t5b_preprocessed = np.mean(mses)
print(f"Mean MSE: {mean_mse_t5b_preprocessed}")
print("Best doc IDs:", best_doc_ids)

Processing queries...
Mean MSE: 0.49376989779975416
Best doc IDs: [2242, 1299, 3096, 1293, 216, 1918, 1108, 958, 808, 1941, 2119, 3407, 382, 3153, 3452, 2254, 3021, 1625, 551, 959, 2201, 1369, 943, 2352, 1087, 1353, 3657, 2481, 3557, 3453, 2572, 3333, 133, 1404, 390, 165, 1850, 1142, 2160, 309, 2105, 1290, 3565, 3122, 2730, 1262, 1515, 1005, 1379, 1465, 1218, 1435, 2665, 1318, 2483, 597, 1494, 582, 3538, 3573, 1029, 1432, 1846, 2782, 478, 1592, 1970, 867, 3358, 1321, 2887, 1877, 3295, 1321, 168, 283, 2321, 560, 1028, 117, 2259, 1262, 1505, 604, 2003, 3144, 1364, 2672, 549, 2393, 1133, 1979, 1408, 271, 3114, 2393, 2977, 778, 1454, 2552]


**Results**
- Score: 0.49376989779975416

- Best IDs: [2242, 1299, 3096, 1293, 216, 1918, 1108, 958, 808, 1941, 2119, 3407, 382, 3153, 3452, 2254, 3021, 1625, 551, 959, 2201, 1369, 943, 2352, 1087, 1353, 3657, 2481, 3557, 3453, 2572, 3333, 133, 1404, 390, 165, 1850, 1142, 2160, 309, 2105, 1290, 3565, 3122, 2730, 1262, 1515, 1005, 1379, 1465, 1218, 1435, 2665, 1318, 2483, 597, 1494, 582, 3538, 3573, 1029, 1432, 1846, 2782, 478, 1592, 1970, 867, 3358, 1321, 2887, 1877, 3295, 1321, 168, 283, 2321, 560, 1028, 117, 2259, 1262, 1505, 604, 2003, 3144, 1364, 2672, 549, 2393, 1133, 1979, 1408, 271, 3114, 2393, 2977, 778, 1454, 2552]



## BERT

### Baseline

In [13]:
import numpy as np

In [14]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Encode documents into embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight BERT-based model
embeddings = model.encode(corpus)

mses = []
best_doc_ids = []

# Iterate through random queries
for query_id in query_ids:
    query = corpus[query_id]
    query_embedding = model.encode([query])
    doc_scores = cosine_similarity(query_embedding, embeddings).flatten()
    doc_id = np.argsort(doc_scores)[-2]  # Second-highest score
    best_doc_ids.append(doc_id)
    mse = np.mean((grouped_df.iloc[query_id][required_keys] - grouped_df.iloc[doc_id][required_keys])**2)
    mses.append(mse)

mean_mse_bert_baseline = np.mean(mses)
print(mean_mse_bert_baseline)
print(best_doc_ids)

0.6713620455658073
[1498, 2511, 115, 3603, 1125, 2355, 916, 655, 2124, 3435, 833, 3645, 2666, 209, 367, 2218, 2166, 2120, 1759, 558, 390, 2351, 1482, 3692, 2262, 2218, 3484, 1425, 3592, 2336, 1393, 1519, 3428, 3361, 992, 1095, 655, 1482, 3551, 3383, 1503, 3695, 3242, 2534, 1416, 564, 2485, 378, 1254, 1040, 2203, 2642, 3660, 2950, 1963, 1355, 2295, 987, 1193, 1075, 1663, 2666, 393, 2299, 1013, 402, 897, 3124, 733, 67, 476, 1932, 2457, 3115, 3155, 915, 2437, 979, 2120, 2850, 2115, 2846, 2265, 3240, 1788, 3660, 2775, 1, 2547, 671, 1047, 3219, 1487, 3013, 2727, 1203, 783, 3309, 2912, 1523]


**Results**
- Score: 0.6713620455658073

- Best IDs: [1498, 2511, 115, 3603, 1125, 2355, 916, 655, 2124, 3435, 833, 3645, 2666, 209, 367, 2218, 2166, 2120, 1759, 558, 390, 2351, 1482, 3692, 2262, 2218, 3484, 1425, 3592, 2336, 1393, 1519, 3428, 3361, 992, 1095, 655, 1482, 3551, 3383, 1503, 3695, 3242, 2534, 1416, 564, 2485, 378, 1254, 1040, 2203, 2642, 3660, 2950, 1963, 1355, 2295, 987, 1193, 1075, 1663, 2666, 393, 2299, 1013, 402, 897, 3124, 733, 67, 476, 1932, 2457, 3115, 3155, 915, 2437, 979, 2120, 2850, 2115, 2846, 2265, 3240, 1788, 3660, 2775, 1, 2547, 671, 1047, 3219, 1487, 3013, 2727, 1203, 783, 3309, 2912, 1523]



### Preprocessed

In [16]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Encode documents into embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight BERT-based model
embeddings = model.encode(processed_corpus)

mses = []
best_doc_ids = []

# Iterate through random queries
for query_id in query_ids:
    query = processed_corpus[query_id]
    query_embedding = model.encode([query])
    doc_scores = cosine_similarity(query_embedding, embeddings).flatten()
    doc_id = np.argsort(doc_scores)[-2]  # Second-highest score
    best_doc_ids.append(doc_id)
    mse = np.mean((grouped_df.iloc[query_id][required_keys] - grouped_df.iloc[doc_id][required_keys])**2)
    mses.append(mse)

mean_mse_bert_preprocessed = np.mean(mses)
print(mean_mse_bert_preprocessed)
print(best_doc_ids)

0.6309224002773224
[3625, 3712, 1641, 1547, 2495, 670, 2650, 976, 167, 5, 2436, 2607, 996, 2503, 2742, 1800, 1561, 1626, 121, 2999, 21, 1440, 927, 1257, 2357, 3584, 3657, 366, 1889, 1567, 3033, 1519, 3242, 774, 629, 2740, 1848, 2640, 458, 3699, 1288, 315, 893, 3122, 2644, 1053, 670, 3142, 3092, 3094, 10, 884, 1341, 1578, 2318, 3539, 2905, 437, 1446, 1259, 166, 3279, 566, 1214, 1257, 1852, 1918, 1389, 2880, 2686, 3556, 3147, 3625, 3429, 1117, 3420, 3477, 1073, 305, 815, 3571, 1257, 820, 1035, 1799, 3625, 45, 47, 38, 3606, 867, 2241, 996, 1551, 3013, 3279, 3029, 3387, 2092, 34]


**Results**
- Score: 0.6309224002773224

- Best IDs: [3625, 3712, 1641, 1547, 2495, 670, 2650, 976, 167, 5, 2436, 2607, 996, 2503, 2742, 1800, 1561, 1626, 121, 2999, 21, 1440, 927, 1257, 2357, 3584, 3657, 366, 1889, 1567, 3033, 1519, 3242, 774, 629, 2740, 1848, 2640, 458, 3699, 1288, 315, 893, 3122, 2644, 1053, 670, 3142, 3092, 3094, 10, 884, 1341, 1578, 2318, 3539, 2905, 437, 1446, 1259, 166, 3279, 566, 1214, 1257, 1852, 1918, 1389, 2880, 2686, 3556, 3147, 3625, 3429, 1117, 3420, 3477, 1073, 305, 815, 3571, 1257, 820, 1035, 1799, 3625, 45, 47, 38, 3606, 867, 2241, 996, 1551, 3013, 3279, 3029, 3387, 2092, 34]