# TripAdvisor Recommendation System


This notebook implements and evaluates different recommendation systems for hotel reviews.
It compares traditional information retrieval methods (BM25) with modern transformer-based approaches.
The goal is to find hotels with similar characteristics based on review text and ratings.

In [1]:
import pandas as pd
import numpy as np
import ast
from rank_bm25 import BM25Okapi
from sklearn.metrics import mean_squared_error
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer
from transformers import AutoModel
import torch
from tqdm import tqdm
import time
import re
from nltk.stem import WordNetLemmatizer



  from tqdm.autonotebook import tqdm, trange


## Download NLTK resources and implement device handling

In [2]:

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/alexs/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/alexs/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/alexs/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/alexs/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device =  torch.device('mps') if torch.backends.mps.is_available() else device
print(f"Using device: {device}")

Using device: mps



## Data Loading and Preprocessing

1. Load TripAdvisor reviews from CSV
2. Filter reviews to ensure all required rating aspects are present
3. Group reviews by hotel (offering_id)
4. Calculate average ratings for each aspect
5. Prepare text data for analysis

In [4]:
# Load the CSV file
df = pd.read_csv('reviews.csv')

print(f"Original DataFrame shape: {df.shape}")

# Convert the 'ratings' column from string to dictionary
df['ratings'] = df['ratings'].apply(ast.literal_eval)

# Define required aspects
required_aspects = ["service", "cleanliness", "overall", "value", "location", "sleep_quality", "rooms"]

# Filter rows with at least the required aspects
df_filtered = df[df['ratings'].apply(lambda x: all(aspect in x for aspect in required_aspects))]

print(f"DataFrame shape after filtering: {df_filtered.shape}")

if df_filtered.empty:
    print("No reviews found with all required aspects. Printing unique aspects found in the dataset:")
    all_aspects = set()
    for rating in df['ratings']:
        all_aspects.update(rating.keys())
    print(sorted(all_aspects))
    data = pd.DataFrame(columns=['offering_id'] + required_aspects + ['reviews'])
else:
    # Group by offering_id
    data = df_filtered.groupby('offering_id').agg({
        'text': ' '.join,  
        'ratings': list  
    }).reset_index()

    # Calculate average ratings for each aspect
    for aspect in required_aspects:
        data[aspect] = data['ratings'].apply(lambda x: np.mean([review.get(aspect, np.nan) for review in x]))

    # Rename 'text' column to 'reviews'
    data = data.rename(columns={'text': 'reviews'})

    # Select and order the final columns
    final_columns = ['offering_id'] + required_aspects + ['reviews']
    data = data[final_columns]

print(data.head())
print("\nDataFrame shape:", data.shape)

print("\nColumn names:", data.columns.tolist())

Original DataFrame shape: (878561, 10)
DataFrame shape after filtering: (436391, 10)
   offering_id   service  cleanliness   overall     value  location  \
0        72572  4.601010     4.636364  4.388889  4.323232  4.570707   
1        72579  4.232000     4.240000  3.888000  4.152000  4.192000   
2        72586  4.250000     4.287879  4.045455  4.053030  4.537879   
3        72598  3.243243     3.243243  2.918919  3.054054  3.027027   
4        73236  4.277778     3.111111  3.388889  3.777778  4.111111   

   sleep_quality     rooms                                            reviews  
0       4.333333  4.282828  I had to make fast visit to seattle and I foun...  
1       3.768000  3.856000  Great service, rooms were clean, could use som...  
2       4.113636  3.992424  Beautiful views of the space needle - especial...  
3       3.270270  3.189189  This hotel is in need of some serious updates....  
4       3.722222  3.222222  My experience at this days inn was perfect. th...  

DataFra


### Text Preprocessing Function

Comprehensive text cleaning pipeline that:
1. Converts to lowercase
2. Removes URLs, email addresses, numbers, and special characters
3. Tokenizes text and removes stopwords
4. Removes short words and applies lemmatization
5. Removes domain-specific stopwords (hotel-related common terms)
6. Ensures consistent formatting

In [5]:
def preprocess_text(text):
    """
    Enhanced text preprocessing function with multiple cleaning steps
    """
    # Convert to lowercase and handle basic cleaning
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    
    # Remove numbers and special characters, keeping only letters and spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Remove short words (length < 4)
    tokens = [token for token in tokens if len(token) > 3]
    
    # Lemmatization (convert words to their base form)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Remove common hotel-related words that might not add value
    hotel_stopwords = {
        'hotel', 'room', 'stay', 'stayed', 'night', 'day', 
        'would', 'could', 'really', 'get', 'got', 'one',
        'also', 'us', 'back', 'even', 'well'
    }
    tokens = [token for token in tokens if token not in hotel_stopwords]
    
    # Join tokens back into text
    processed_text = ' '.join(tokens)
    
    # Remove extra whitespace
    processed_text = ' '.join(processed_text.split())
    
    return processed_text

data['processed_reviews'] = data['reviews'].apply(preprocess_text)

#### Calculate word counts to understrand the impact of the preprocessing

In [6]:
# Calculate word counts
reviews_word_counts = data['reviews'].str.split().str.len()
processed_word_counts = data['processed_reviews'].str.split().str.len()

print("Reviews Statistics:")
print(f"Average words per hotel: {reviews_word_counts.mean():.2f}")
print(f"Median words per hotel: {reviews_word_counts.median():.2f}")
print(f"Min words: {reviews_word_counts.min()}")
print(f"Max words: {reviews_word_counts.max()}")
print(f"\nPercentiles:")
print(reviews_word_counts.describe())

print("\n" + "="*50 + "\n")

print("Processed Reviews Statistics:")
print(f"Average words per hotel: {processed_word_counts.mean():.2f}")
print(f"Median words per hotel: {processed_word_counts.median():.2f}")
print(f"Min words: {processed_word_counts.min()}")
print(f"Max words: {processed_word_counts.max()}")
print(f"\nPercentiles:")
print(processed_word_counts.describe())



Reviews Statistics:
Average words per hotel: 16916.49
Median words per hotel: 5509.50
Min words: 1
Max words: 294452

Percentiles:
count      3754.000000
mean      16916.485882
std       28293.752907
min           1.000000
25%        1682.750000
50%        5509.500000
75%       20139.500000
max      294452.000000
Name: reviews, dtype: float64


Processed Reviews Statistics:
Average words per hotel: 6976.72
Median words per hotel: 2232.00
Min words: 0
Max words: 124229

Percentiles:
count      3754.000000
mean       6976.724294
std       11747.965301
min           0.000000
25%         674.250000
50%        2232.000000
75%        8372.500000
max      124229.000000
Name: processed_reviews, dtype: float64


## Evaluation functions

Model Evaluation Functions:
Two main evaluation functions:
1. evaluate_model: General evaluation for transformer-based models
2. evaluate_model_bm25: Specific evaluation for BM25 models
   
Both calculate:
- Mean Squared Error (MSE) between predicted and actual ratings
- Processing time metrics
- Success rate of query evaluation

In [7]:
def evaluate_model(model, query_data, full_data):

    mse_scores = []
    aspects = ["service", "cleanliness", "overall", "value", "location", "sleep_quality", "rooms"]
    total_scoring_time = 0
    
    query_doc_ids = {idx: query['offering_id'] for idx, query in query_data.iterrows()}
    
    for idx, query in tqdm(query_data.iterrows(), total=len(query_data), desc="Evaluating queries"):
        try:
            # Time the scoring
            start_time = time.time()
            scores = model(query['processed_reviews'])
            scoring_time = time.time() - start_time
            total_scoring_time += scoring_time
            
            # Get indices of top 2 scores
            top_2_indices = np.argpartition(scores, -2)[-2:]
            top_2_indices = top_2_indices[np.argsort(scores[top_2_indices])][::-1]  
            
            # If the first best match is the query document itself, use the second best
            if full_data.iloc[top_2_indices[0]]['offering_id'] == query_doc_ids[idx]:
                best_index = top_2_indices[1]
            else:
                best_index = top_2_indices[0]
            
            # Calculate MSE
            query_ratings = query[aspects].values
            best_doc_ratings = full_data.iloc[best_index][aspects].values
            mse = mean_squared_error(query_ratings, best_doc_ratings)
            mse_scores.append(mse)
            
        except Exception as e:
            print(f"Error processing query {idx}: {str(e)}")
            continue

    # Print summary statistics
    avg_scoring_time = total_scoring_time / len(query_data) if query_data.shape[0] > 0 else 0
    print(f"\nAverage scoring time per query: {avg_scoring_time:.4f} seconds")
    print(f"Total scoring time: {total_scoring_time:.4f} seconds")
    print(f"Successfully evaluated queries: {len(mse_scores)}/{len(query_data)}")
    
    if not mse_scores:
        print("Warning: No valid evaluations were performed")
        return float('inf')
    
    avg_mse = np.mean(mse_scores)
    print(f"MSE Average: {avg_mse:.4f}")
    
    return avg_mse

In [8]:
def evaluate_model_bm25(model, query_data, full_data, column_name='processed_reviews'):

    mse_scores = []
    aspects = ["service", "cleanliness", "overall", "value", "location", "sleep_quality", "rooms"]
    total_scoring_time = 0
    
    query_doc_ids = {idx: query['offering_id'] for idx, query in query_data.iterrows()}
    
    for idx, query in tqdm(query_data.iterrows(), total=len(query_data), desc="Evaluating queries"):
        try:
            query_tokens = query[column_name].split()
            
            # Time the scoring
            start_time = time.time()
            
            scores = model.get_scores(query_tokens)
            
            scoring_time = time.time() - start_time
            total_scoring_time += scoring_time
            
            # Get indices of top 2 scores
            top_2_indices = np.argpartition(scores, -2)[-2:]
            top_2_indices = top_2_indices[np.argsort(scores[top_2_indices])][::-1]
            
            # If the first best match is the query document itself, use the second best
            if full_data.iloc[top_2_indices[0]]['offering_id'] == query_doc_ids[idx]:
                best_index = top_2_indices[1]
            else:
                best_index = top_2_indices[0]
            
            # Calculate MSE
            query_ratings = query[aspects].values
            best_doc_ratings = full_data.iloc[best_index][aspects].values
            mse = mean_squared_error(query_ratings, best_doc_ratings)
            mse_scores.append(mse)
            
        except Exception as e:
            print(f"Error processing query {idx}: {str(e)}")
            continue

    # Print summary statistics
    avg_scoring_time = total_scoring_time / len(query_data) if query_data.shape[0] > 0 else 0
    print(f"\nAverage scoring time per query: {avg_scoring_time:.4f} seconds")
    print(f"Total scoring time: {total_scoring_time:.4f} seconds")
    print(f"Successfully evaluated queries: {len(mse_scores)}/{len(query_data)}")
    
    if not mse_scores:
        print("Warning: No valid evaluations were performed")
        return float('inf')
    
    avg_mse = np.mean(mse_scores)
    print(f"MSE Average: {avg_mse:.4f}")
    
    return avg_mse

## Model Creation Functions



create_bm25_model:
- Creates traditional BM25 retrieval model
- Uses exact keyword matching with TF-IDF principles

create_hybrid_model_with_dense_retriever:
- Combines BM25 with neural embeddings
- Balances keyword matching with semantic understanding

create_mpnet_domain_model:
- Implements MPNet transformer architecture
- Optimized for domain-specific semantic similarity

create_simcse_model:
- Uses contrastive learning approach
- Generates robust sentence embeddings

create_mxbai_embed_model:
- Implements MXBAI's embedding model
- Optimized for efficient retrieval

create_mxbai_colbert_model:
- Implements MXBAI's ColBERT architecture
- Uses late interaction for precise matching


In [9]:

def create_bm25_model(corpus):
    try:
        tokenized_corpus = [doc.split(' ') for doc in corpus if isinstance(doc, str)]
        bm25 = BM25Okapi(tokenized_corpus)
        return bm25
    except Exception as e:
        print(f"Error creating BM25 model: {e}")
        return None

In [10]:

def create_hybrid_model_with_dense_retriever(corpus, alpha=0.6, beta=0.4):
    """
    Creates a hybrid model combining BM25 and SentenceTransformer with GPU support
    """
    tokenized_corpus = [doc.split() for doc in corpus if isinstance(doc, str)]
    bm25 = BM25Okapi(tokenized_corpus)
    dense_model = SentenceTransformer('all-mpnet-base-v2').to(device)
    
    # Pre-compute document embeddings
    doc_embeddings = dense_model.encode(corpus, convert_to_tensor=True, device=device)
    
    def get_hybrid_scores(query_text):
        try:
            # 1. Get BM25 scores
            query_tokens = query_text.split()
            bm25_scores = np.array(bm25.get_scores(query_tokens))
                    
            # 2. Get dense retriever scores
            query_embedding = dense_model.encode(query_text, convert_to_tensor=True, device=device)
            
            # Calculate cosine similarity on GPU
            dense_scores = torch.nn.functional.cosine_similarity(
                query_embedding.unsqueeze(0),
                doc_embeddings
            ).cpu().numpy()
              
            # Combine scores
            final_scores = alpha * bm25_scores + beta * dense_scores
            
            return final_scores
            
        except Exception as e:
            print(f"Error in hybrid scoring: {e}")
            return np.zeros(len(corpus))
    
    return get_hybrid_scores


In [11]:
def create_mpnet_domain_model(corpus, batch_size=32):

    model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to(device)
    
    print("Computing document embeddings...")
    doc_embeddings = []
    
    for i in tqdm(range(0, len(corpus), batch_size), desc="Processing documents"):
        batch_texts = corpus[i:i + batch_size]
        
        # Encode with special handling for reviews
        batch_embeddings = model.encode(
            batch_texts,
            convert_to_tensor=True,
            device=device,
            normalize_embeddings=True,
            batch_size=batch_size,
            show_progress_bar=False
        )
        doc_embeddings.append(batch_embeddings.cpu())
    
    doc_embeddings = torch.cat(doc_embeddings, dim=0)
    
    def get_scores(query_text):
        try:
            query_embedding = model.encode(
                query_text,
                convert_to_tensor=True,
                device=device,
                normalize_embeddings=True
            )
            
            query_embedding = query_embedding.cpu()
            
            similarities = torch.matmul(doc_embeddings, query_embedding)
            
            return similarities.numpy()
            
        except Exception as e:
            print(f"Error in scoring: {e}")
            return np.zeros(len(corpus))
    
    return get_scores

In [12]:
def create_simcse_model(corpus, batch_size=32):

    model = SentenceTransformer('princeton-nlp/unsup-simcse-bert-base-uncased').to(device)
    
    print("Computing document embeddings...")
    doc_embeddings = []
    
    for i in tqdm(range(0, len(corpus), batch_size), desc="Processing documents"):
        batch_texts = corpus[i:i + batch_size]
        
        # Get embeddings with temperature scaling
        batch_embeddings = model.encode(
            batch_texts,
            convert_to_tensor=True,
            device=device,
            normalize_embeddings=True,
            batch_size=batch_size,
            show_progress_bar=False
        )
        doc_embeddings.append(batch_embeddings.cpu())
    
    doc_embeddings = torch.cat(doc_embeddings, dim=0)
    
    def get_scores(query_text):
        try:
            # Get query embedding with same temperature scaling
            query_embedding = model.encode(
                query_text,
                convert_to_tensor=True,
                device=device,
                normalize_embeddings=True
            )
            
            query_embedding = query_embedding.cpu()
            similarities = torch.matmul(doc_embeddings, query_embedding)
            scores = similarities.numpy()
            
            return scores
            
        except Exception as e:
            print(f"Error in scoring: {e}")
            return np.zeros(len(corpus))
    
    return get_scores

In [13]:
def create_mxbai_embed_model(corpus, batch_size=32):

    model = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1').to(device)
    
    print("Computing document embeddings...")
    doc_embeddings = []
    
    for i in tqdm(range(0, len(corpus), batch_size), desc="Processing documents"):
        batch_texts = corpus[i:i + batch_size]
        
        # Add instruction prefix for better retrieval performance
        batch_texts = [f"Represent this sentence for searching relevant passages: {text}" for text in batch_texts]
        
        # Get embeddings
        batch_embeddings = model.encode(
            batch_texts,
            convert_to_tensor=True,
            device=device,
            normalize_embeddings=True,
            batch_size=batch_size,
            show_progress_bar=False
        )
        doc_embeddings.append(batch_embeddings.cpu())
    
    doc_embeddings = torch.cat(doc_embeddings, dim=0)
    
    def get_scores(query_text):
        try:
            # Add instruction prefix for query
            query_text = f"Represent this sentence for searching relevant passages: {query_text}"
            
            query_embedding = model.encode(
                query_text,
                convert_to_tensor=True,
                device=device,
                normalize_embeddings=True
            )
            
            query_embedding = query_embedding.cpu()
            similarities = torch.matmul(doc_embeddings, query_embedding)
            scores = similarities.numpy()
            
            return scores
            
        except Exception as e:
            print(f"Error in mxbai-embed scoring: {e}")
            return np.zeros(len(corpus))
    
    return get_scores

In [14]:
def create_mxbai_colbert_model(corpus, batch_size=16):

    model_name = 'mixedbread-ai/mxbai-colbert-large-v1'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    
    print("Computing document embeddings...")
    doc_embeddings = []
    
    for i in tqdm(range(0, len(corpus), batch_size), desc="Processing documents"):
        batch_texts = corpus[i:i + batch_size]
        
        # Add instruction prefix for better retrieval
        batch_texts = [f"Represent this document for retrieval: {text}" for text in batch_texts]
        
        # Tokenize
        encoded = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        ).to(device)
        
        # Get embeddings
        with torch.no_grad():
            outputs = model(**encoded)
            attention_mask = encoded['attention_mask'].unsqueeze(-1)
            token_embeddings = outputs.last_hidden_state
            sentence_embeddings = (token_embeddings * attention_mask).sum(1) / attention_mask.sum(1)
            sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
            doc_embeddings.append(sentence_embeddings.cpu())
    
    doc_embeddings = torch.cat(doc_embeddings, dim=0)
    
    def get_scores(query_text):
        try:
            # Add instruction prefix
            query_text = f"Represent this query for retrieval: {query_text}"
            
            # Encode query
            encoded = tokenizer(
                query_text,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors='pt'
            ).to(device)
            
            # Get query embedding
            with torch.no_grad():
                outputs = model(**encoded)
                attention_mask = encoded['attention_mask'].unsqueeze(-1)
                token_embeddings = outputs.last_hidden_state
                query_embedding = (token_embeddings * attention_mask).sum(1) / attention_mask.sum(1)
                query_embedding = torch.nn.functional.normalize(query_embedding, p=2, dim=1)
            
            # Move to CPU and compute similarity
            query_embedding = query_embedding.cpu()
            similarities = torch.matmul(doc_embeddings, query_embedding.T)
            scores = similarities.squeeze().numpy()
            
            # Apply temperature scaling
            temperature = 0.05
            scores = np.exp(scores / temperature)
            scores = scores / np.sum(scores)
            
            return scores
            
        except Exception as e:
            print(f"Error in mxbai-colbert scoring: {e}")
            return np.zeros(len(corpus))
    
    return get_scores

## Model Evaluation Section

Tests each model implementation with:
- 100 random sample queries
- Both preprocessed and raw text versions for bm25
- Comprehensive performance metrics:
  * MSE (accuracy)
  * Processing time
  * Query success rate
Models evaluated:
1. BM25 (with and without preprocessing)
2. Hybrid BM25 + Dense Retriever
3. MPNet
4. SimCSE
5. MXBAI Embed
6. MXBAI ColBERT

In [15]:
# Select a subset of data for querying (e.g., 100 random samples)
query_data = data.sample(n=100, random_state=42)

In [16]:
print("Evaluating BM25 model without preprocessing...")
bm25_model = create_bm25_model(data['reviews'].tolist())
bm25_scores = evaluate_model_bm25(bm25_model, query_data, data, 'reviews')



Evaluating BM25 model without preprocessing...


Evaluating queries: 100%|██████████| 100/100 [17:35<00:00, 10.56s/it]


Average scoring time per query: 10.5526 seconds
Total scoring time: 1055.2592 seconds
Successfully evaluated queries: 100/100
MSE Average: 0.5476





In [17]:
print("Evaluating BM25 model with preprocessing...")
bm25_model = create_bm25_model(data['processed_reviews'].tolist())
bm25_scores = evaluate_model_bm25(bm25_model, query_data, data, 'processed_reviews')


Evaluating BM25 model with preprocessing...


Evaluating queries: 100%|██████████| 100/100 [06:13<00:00,  3.73s/it]


Average scoring time per query: 3.7323 seconds
Total scoring time: 373.2272 seconds
Successfully evaluated queries: 100/100
MSE Average: 0.4910





In [18]:

print("Evaluating Hybrid model...")
hybrid_model = create_hybrid_model_with_dense_retriever(data['processed_reviews'].tolist())
hybrid_scores = evaluate_model(hybrid_model, query_data, data)



Evaluating Hybrid model...


Evaluating queries: 100%|██████████| 100/100 [06:41<00:00,  4.01s/it]


Average scoring time per query: 4.0119 seconds
Total scoring time: 401.1897 seconds
Successfully evaluated queries: 100/100
MSE Average: 0.4910





In [19]:

print("Evaluating MPNet model...")
mpnet_model = create_mpnet_domain_model(data['processed_reviews'].tolist())
mpnet_scores = evaluate_model(mpnet_model, query_data, data)



Evaluating MPNet model...
Computing document embeddings...


Processing documents: 100%|██████████| 118/118 [02:02<00:00,  1.04s/it]
Evaluating queries: 100%|██████████| 100/100 [00:05<00:00, 19.21it/s]


Average scoring time per query: 0.0509 seconds
Total scoring time: 5.0927 seconds
Successfully evaluated queries: 100/100
MSE Average: 0.4637





In [20]:

print("Evaluating SimCSE model...")
simcse_model = create_simcse_model(data['processed_reviews'].tolist())
simcse_scores = evaluate_model(simcse_model, query_data, data)


Evaluating SimCSE model...


No sentence-transformers model found with name princeton-nlp/unsup-simcse-bert-base-uncased. Creating a new one with mean pooling.


pytorch_model.bin:  50%|#####     | 220M/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Computing document embeddings...


Processing documents: 100%|██████████| 118/118 [02:55<00:00,  1.49s/it]
Evaluating queries: 100%|██████████| 100/100 [00:06<00:00, 15.99it/s]


Average scoring time per query: 0.0616 seconds
Total scoring time: 6.1583 seconds
Successfully evaluated queries: 100/100
MSE Average: 0.4162





In [21]:

print("Evaluating MXBAI Embed model...")
mxbai_embed_model = create_mxbai_embed_model(data['processed_reviews'].tolist())
mxbai_embed_scores = evaluate_model(mxbai_embed_model, query_data, data)


Evaluating MXBAI Embed model...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/114k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

Computing document embeddings...


Processing documents: 100%|██████████| 118/118 [08:35<00:00,  4.37s/it]
Evaluating queries: 100%|██████████| 100/100 [00:13<00:00,  7.32it/s]


Average scoring time per query: 0.1343 seconds
Total scoring time: 13.4292 seconds
Successfully evaluated queries: 100/100
MSE Average: 0.5167





In [22]:

print("Evaluating MXBAI ColBERT model...")
mxbai_colbert_model = create_mxbai_colbert_model(data['processed_reviews'].tolist())
mxbai_colbert_scores = evaluate_model(mxbai_colbert_model, query_data, data)

Evaluating MXBAI ColBERT model...


tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/671M [00:00<?, ?B/s]

Computing document embeddings...


Processing documents: 100%|██████████| 235/235 [08:12<00:00,  2.09s/it]
Evaluating queries: 100%|██████████| 100/100 [00:13<00:00,  7.57it/s]


Average scoring time per query: 0.1303 seconds
Total scoring time: 13.0313 seconds
Successfully evaluated queries: 100/100
MSE Average: 0.4954



