# TripAdvisor Recommendation System


In [1]:
## 1. Import Libraries
import pandas as pd
import numpy as np
import ast
from rank_bm25 import BM25Okapi
from sklearn.metrics import mean_squared_error
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer
from transformers import AutoModel
import torch
from tqdm import tqdm
import time
import re
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import CrossEncoder


In [2]:

# Fetch the NLTK data packages used further down: the Punkt tokenizer data
# (both packagings), the English stop-word list and WordNet.
for nltk_package in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_package)
# Kept as the cell's final expression so its return value is still displayed.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Choose the torch device: MPS takes precedence when available, then CUDA,
# otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Read the raw review dump.
df = pd.read_csv('reviews.csv')
print(f"Original DataFrame shape: {df.shape}")

# 'ratings' is stored as the string form of a Python literal; parse it back.
df['ratings'] = df['ratings'].apply(ast.literal_eval)

# A review is only kept when it rates every one of these aspects.
required_aspects = ["service", "cleanliness", "overall", "value", "location", "sleep_quality", "rooms"]


def _rates_every_aspect(rating):
    """Return True when `rating` contains every entry of `required_aspects`."""
    for aspect_name in required_aspects:
        if aspect_name not in rating:
            return False
    return True


df_filtered = df[df['ratings'].apply(_rates_every_aspect)]
print(f"DataFrame shape after filtering: {df_filtered.shape}")

if not df_filtered.empty:
    # Collapse to one row per offering_id: review texts joined with single
    # spaces, rating dicts collected into a list.
    data = (
        df_filtered
        .groupby('offering_id')
        .agg({'text': ' '.join, 'ratings': list})
        .reset_index()
    )

    # Mean of each aspect over all rating dicts of the same offering.
    for aspect in required_aspects:
        data[aspect] = data['ratings'].apply(
            lambda x: np.mean([review.get(aspect, np.nan) for review in x])
        )

    # Rename the text column and keep only the columns used downstream.
    final_columns = ['offering_id'] + required_aspects + ['reviews']
    data = data.rename(columns={'text': 'reviews'})[final_columns]
else:
    # Nothing survived the filter: list which aspects exist to help diagnose,
    # and continue with an empty frame that has the expected columns.
    print("No reviews found with all required aspects. Printing unique aspects found in the dataset:")
    all_aspects = {key for rating in df['ratings'] for key in rating.keys()}
    print(sorted(all_aspects))
    data = pd.DataFrame(columns=['offering_id'] + required_aspects + ['reviews'])

# Quick look at the result: first rows, shape and column names.
print(data.head())
print("\nDataFrame shape:", data.shape)
print("\nColumn names:", data.columns.tolist())

Original DataFrame shape: (878561, 10)
DataFrame shape after filtering: (436391, 10)
   offering_id   service  cleanliness   overall     value  location  \
0        72572  4.601010     4.636364  4.388889  4.323232  4.570707   
1        72579  4.232000     4.240000  3.888000  4.152000  4.192000   
2        72586  4.250000     4.287879  4.045455  4.053030  4.537879   
3        72598  3.243243     3.243243  2.918919  3.054054  3.027027   
4        73236  4.277778     3.111111  3.388889  3.777778  4.111111   

   sleep_quality     rooms                                            reviews  
0       4.333333  4.282828  I had to make fast visit to seattle and I foun...  
1       3.768000  3.856000  Great service, rooms were clean, could use som...  
2       4.113636  3.992424  Beautiful views of the space needle - especial...  
3       3.270270  3.189189  This hotel is in need of some serious updates....  
4       3.722222  3.222222  My experience at this days inn was perfect. th...  

DataFra

In [5]:
## 3. Text Preprocessing

# Domain words removed after lemmatization because they occur in most hotel
# reviews. Entries of three letters or fewer can never match (short tokens are
# dropped earlier) but are kept so the list stays as originally specified.
_HOTEL_STOPWORDS = frozenset({
    'hotel', 'room', 'stay', 'stayed', 'night', 'day',
    'would', 'could', 'really', 'get', 'got', 'one',
    'also', 'us', 'back', 'even', 'well'
})

# Filled on first use so the NLTK stop-word list and the lemmatizer are built
# once instead of on every call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (english stop-word set, WordNetLemmatizer) pair."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """
    Clean a review text and return it as a space-separated string of tokens.

    Steps, in order: lowercase, strip URLs and e-mail addresses, drop every
    character that is not an ASCII letter or whitespace, tokenize, remove
    English stop words, remove tokens of three characters or fewer, lemmatize,
    and finally remove the hotel-specific stop words.

    Args:
        text: The raw review text. Anything that is not a `str` (for example
            a NaN from a missing value) is treated as an empty review.
    Returns:
        The cleaned text; an empty string when nothing survives the filters
        or when `text` is not a string.
    """
    # Missing values would otherwise fail on `.lower()` below.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase and handle basic cleaning
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove numbers and special characters, keeping only letters and spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    stop_words, lemmatizer = _get_preprocess_resources()

    # Drop stop words and short words (length < 4), then lemmatize what is left
    tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words and len(token) > 3
    ]

    # Hotel stop words are checked against the lemmatized form
    tokens = [token for token in tokens if token not in _HOTEL_STOPWORDS]

    # Join tokens back into text and collapse any stray whitespace
    return ' '.join(' '.join(tokens).split())

# Derive the cleaned text column by running each concatenated review text
# through preprocess_text.
data['processed_reviews'] = data['reviews'].map(preprocess_text)

In [7]:
# Whitespace-delimited word counts per row, before and after preprocessing.
reviews_word_counts = data['reviews'].str.split().str.len()
processed_word_counts = data['processed_reviews'].str.split().str.len()


def _print_word_count_report(title, counts):
    """Print mean, median, min, max and the describe() summary of `counts`."""
    print(title)
    print(f"Average words per hotel: {counts.mean():.2f}")
    print(f"Median words per hotel: {counts.median():.2f}")
    print(f"Min words: {counts.min()}")
    print(f"Max words: {counts.max()}")
    print(f"\nPercentiles:")
    print(counts.describe())


_print_word_count_report("Reviews Statistics:", reviews_word_counts)

# Visual separator between the two reports.
print("\n" + "="*50 + "\n")

_print_word_count_report("Processed Reviews Statistics:", processed_word_counts)



Reviews Statistics:
Average words per hotel: 16916.49
Median words per hotel: 5509.50
Min words: 1
Max words: 294452

Percentiles:
count      3754.000000
mean      16916.485882
std       28293.752907
min           1.000000
25%        1682.750000
50%        5509.500000
75%       20139.500000
max      294452.000000
Name: reviews, dtype: float64


Processed Reviews Statistics:
Average words per hotel: 6976.72
Median words per hotel: 2232.00
Min words: 0
Max words: 124229

Percentiles:
count      3754.000000
mean       6976.722429
std       11747.961900
min           0.000000
25%         674.250000
50%        2232.000000
75%        8372.500000
max      124229.000000
Name: processed_reviews, dtype: float64


#### Evaluation function

In [28]:
def evaluate_model(model, query_data, full_data):
    """
    Evaluate a scoring function by how well its top match agrees on ratings.

    For every row of `query_data` the model scores all documents, the
    best-scoring document that does not belong to the query's own
    `offering_id` is selected, and the mean squared error between the aspect
    ratings of the query and of that document is recorded.

    Args:
        model: Callable that takes the query's 'processed_reviews' text and
            returns one score per row of `full_data`, in row order
            (higher means more similar).
        query_data: DataFrame with 'offering_id', 'processed_reviews' and the
            seven aspect columns.
        full_data: DataFrame the scores are aligned with positionally; needs
            'offering_id' and the aspect columns.
    Returns:
        The average MSE over all successfully evaluated queries, or
        float('inf') when no query could be evaluated.
    """
    mse_scores = []
    aspects = ["service", "cleanliness", "overall", "value", "location", "sleep_quality", "rooms"]
    total_scoring_time = 0

    # Positional view of the ids, computed once instead of once per query.
    offering_ids = full_data['offering_id'].to_numpy()

    for idx, query in tqdm(query_data.iterrows(), total=len(query_data), desc="Evaluating queries"):
        try:
            # Time the scoring
            start_time = time.time()
            scores = model(query['processed_reviews'])
            scoring_time = time.time() - start_time
            total_scoring_time += scoring_time

            # Work on a float copy so the model's own array is not mutated and
            # -inf can be stored whatever dtype the model returned.
            scores = np.array(scores, dtype=float)

            # Exclude the query document. -inf (not 0) is required because
            # similarity scores may be negative, in which case a 0 would still
            # win the argmax and the query would be matched with itself.
            scores[offering_ids == query['offering_id']] = -np.inf
            best_index = int(np.argmax(scores))

            # Calculate MSE between the two rating vectors
            query_ratings = query[aspects].to_numpy(dtype=float)
            best_doc_ratings = full_data.iloc[best_index][aspects].to_numpy(dtype=float)
            mse = mean_squared_error(query_ratings, best_doc_ratings)
            mse_scores.append(mse)

        except Exception as e:
            # Report and move on to the next query. `.get` keeps the handler
            # itself from raising when the text column is missing.
            print(f"Error processing query {idx}")
            print(f"Error details: {str(e)}")
            print(f"Query text sample: {str(query.get('processed_reviews', ''))[:100]}")
            continue

    # Print summary statistics
    avg_scoring_time = total_scoring_time / len(query_data) if query_data.shape[0] > 0 else 0
    print(f"\nAverage scoring time per query: {avg_scoring_time:.4f} seconds")
    print(f"Total scoring time: {total_scoring_time:.4f} seconds")
    print(f"Successfully evaluated queries: {len(mse_scores)}/{len(query_data)}")

    if not mse_scores:
        print("Warning: No valid evaluations were performed")
        return float('inf')

    avg_mse = np.mean(mse_scores)
    print(f"MSE Average: {avg_mse:.4f}")

    return avg_mse


In [9]:
def evaluate_model_bm25(model, query_data, full_data, column_name='processed_reviews'):
    """
    Evaluate a BM25 index by how well its top match agrees on ratings.

    For every row of `query_data` the text in `column_name` is split on single
    spaces, scored against the index, and the best-scoring document that does
    not belong to the query's own `offering_id` is selected. The mean squared
    error between the aspect ratings of the query and of that document is
    recorded.

    Args:
        model: Object exposing `get_scores(tokens)` that returns one score per
            row of `full_data`, in row order.
        query_data: DataFrame with 'offering_id', the text column and the
            seven aspect columns.
        full_data: DataFrame the scores are aligned with positionally; needs
            'offering_id' and the aspect columns.
        column_name: Name of the column holding the query text.
    Returns:
        The average MSE over all successfully evaluated queries, or
        float('inf') when no query could be evaluated.
    """
    mse_scores = []
    aspects = ["service", "cleanliness", "overall", "value", "location", "sleep_quality", "rooms"]
    total_scoring_time = 0

    # Positional view of the ids, computed once instead of once per query.
    offering_ids = full_data['offering_id'].to_numpy()

    for idx, query in tqdm(query_data.iterrows(), total=len(query_data), desc="Evaluating queries"):
        try:
            query_text = query[column_name]

            # Split on single spaces, matching how create_bm25_model
            # tokenizes the corpus.
            query_tokens = query_text.split(' ')

            # Time the scoring
            start_time = time.time()

            # Get scores
            try:
                scores = model.get_scores(query_tokens)
            except Exception as e:
                print(f"Error during scoring for query {idx}: {str(e)}")
                print(f"Query tokens: {query_tokens[:10]}...")  # Print first 10 tokens
                continue

            scoring_time = time.time() - start_time
            total_scoring_time += scoring_time

            # Float copy: leaves the model's array untouched and guarantees
            # that -inf can be stored.
            scores = np.array(scores, dtype=float)

            # Find best matching document, excluding the query document
            scores[offering_ids == query['offering_id']] = -np.inf
            best_index = int(np.argmax(scores))

            # Calculate MSE between the two rating vectors
            query_ratings = query[aspects].to_numpy(dtype=float)
            best_doc_ratings = full_data.iloc[best_index][aspects].to_numpy(dtype=float)
            mse = mean_squared_error(query_ratings, best_doc_ratings)
            mse_scores.append(mse)

        except Exception as e:
            # Report and move on. The sample is read from the configured
            # column via `.get`, so the handler itself cannot raise.
            print(f"Error processing query {idx}")
            print(f"Error details: {str(e)}")
            print(f"Query text sample: {str(query.get(column_name, ''))[:100]}")
            continue

    # Print summary statistics
    avg_scoring_time = total_scoring_time / len(query_data) if query_data.shape[0] > 0 else 0
    print(f"\nAverage scoring time per query: {avg_scoring_time:.4f} seconds")
    print(f"Total scoring time: {total_scoring_time:.4f} seconds")
    print(f"Successfully evaluated queries: {len(mse_scores)}/{len(query_data)}")

    if not mse_scores:
        print("Warning: No valid evaluations were performed")
        return float('inf')

    avg_mse = np.mean(mse_scores)
    print(f"MSE Average: {avg_mse:.4f}")

    return avg_mse

#### Model Definition and eval function



In [10]:

def create_bm25_model(corpus):
    """
    Build a BM25Okapi index over `corpus`, tokenizing on single spaces.

    Entries that are not strings (for example NaN) are indexed as empty
    documents rather than dropped, so document i of the index always
    corresponds to position i of `corpus`. The evaluation code looks the best
    match up by position, which only works when the two stay aligned.

    Args:
        corpus: Iterable of document texts.
    Returns:
        The BM25Okapi index, or None if building it raised an exception
        (the error is printed).
    """
    try:
        tokenized_corpus = [
            doc.split(' ') if isinstance(doc, str) else []
            for doc in corpus
        ]
        return BM25Okapi(tokenized_corpus)
    except Exception as e:
        print(f"Error creating BM25 model: {e}")
        return None

In [11]:
def create_hybrid_model_with_dense_retriever(corpus, alpha=0.6, beta=0.4):
    """
    Build a scorer that blends BM25 with dense dot-product similarity, both
    min-max scaled to [0, 1] per query.

    NOTE(review): a later cell defines a function with this same name
    (defaults alpha=0.85, beta=0.15). Once that cell has run, it replaces
    this definition.

    Args:
        corpus: Document texts to index.
        alpha: Weight of the scaled BM25 score.
        beta: Weight of the scaled dense score.
    Returns:
        A function mapping a query text to an array of blended scores, one
        per document. On any scoring error it prints the error and returns
        zeros.
    """
    def _min_max(values):
        # Rescale to [0, 1]; a constant array is returned unchanged.
        low, high = values.min(), values.max()
        if high != low:
            return (values - low) / (high - low)
        return values

    # Lexical side: whitespace-tokenized BM25 index.
    lexical_index = BM25Okapi([text.split() for text in corpus if isinstance(text, str)])

    # Dense side: sentence encoder moved to the selected device.
    encoder = SentenceTransformer('multi-qa-mpnet-base-dot-v1').to(device)

    # Unit-length document vectors, computed once up front.
    corpus_vectors = encoder.encode(
        corpus,
        convert_to_tensor=True,
        device=device,
        normalize_embeddings=True,
        batch_size=32,
        show_progress_bar=True
    )

    def get_hybrid_scores(query_text):
        try:
            lexical_part = _min_max(np.array(lexical_index.get_scores(query_text.split())))

            query_vector = encoder.encode(
                query_text,
                convert_to_tensor=True,
                device=device,
                normalize_embeddings=True
            )
            # Dot product of unit vectors.
            semantic_part = _min_max(torch.matmul(corpus_vectors, query_vector).cpu().numpy())

            return alpha * lexical_part + beta * semantic_part

        except Exception as e:
            print(f"Error in hybrid scoring: {e}")
            return np.zeros(len(corpus))

    return get_hybrid_scores

In [12]:
def create_colbert_model(corpus, max_length=128):
    """
    Build a ColBERT-style late-interaction scorer on all-MiniLM-L6-v2.

    Every text is encoded into one L2-normalised vector per token. A query is
    scored against a document with MaxSim: for each query token take the
    highest cosine similarity to any document token, then average those
    maxima over the query tokens.

    Texts are tokenized without padding, so only real tokens (including the
    special tokens the tokenizer adds) take part in MaxSim. Padding every
    text to `max_length` would add all-zero vectors: on the query side they
    dilute the mean, and on the document side they floor every per-token
    maximum at 0.

    Args:
        corpus: Iterable of document texts.
        max_length: Maximum number of tokens kept per text (longer texts are
            truncated).
    Returns:
        A function mapping a query text to an array with one MaxSim score per
        document. On any scoring error it prints the error and returns zeros.
    """
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)

    def _encode_tokens(text):
        # Returns an (n_tokens, hidden) CPU tensor of unit-length token vectors.
        encoded = tokenizer(
            text,
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )
        encoded = {k: v.to(device) for k, v in encoded.items()}

        with torch.no_grad():
            # Single, unpadded sequence: index 0 drops the batch dimension.
            token_states = model(**encoded).last_hidden_state[0]

        return torch.nn.functional.normalize(token_states, p=2, dim=-1).cpu()

    # Pre-compute the token embeddings of every document.
    doc_embeddings = [
        _encode_tokens(doc)
        for doc in tqdm(corpus, desc="Computing document embeddings")
    ]

    def get_scores(query_text):
        try:
            query_embeddings = _encode_tokens(query_text)

            scores = []
            for doc_emb in doc_embeddings:
                # (query_tokens, doc_tokens) cosine similarities
                sim_matrix = torch.matmul(query_embeddings, doc_emb.transpose(0, 1))
                # MaxSim: best document token per query token, averaged
                scores.append(sim_matrix.max(dim=1).values.mean().item())

            return np.array(scores)

        except Exception as e:
            print(f"Error in ColBERT scoring: {e}")
            return np.zeros(len(corpus))

    return get_scores

In [13]:
def create_bert_biencoder_model(corpus, batch_size=32):
    """
    Build a bi-encoder scorer from multi-qa-MiniLM-L6-cos-v1 using
    attention-masked mean pooling and L2 normalisation.

    Args:
        corpus: Document texts; sliced in steps of `batch_size` and passed to
            the tokenizer.
        batch_size: Number of documents encoded per forward pass.
    Returns:
        A function mapping a query text to the dot products between the
        query vector and every document vector. On any scoring error it
        prints the error and returns zeros.
    """
    model_name = 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)

    def _mean_pooled(texts):
        # Encode `texts` and return unit-length mean-pooled vectors (on device).
        inputs = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors='pt'
        ).to(device)

        with torch.no_grad():
            hidden_states = model(**inputs).last_hidden_state
            # Zero out padded positions, then average over the real tokens.
            mask = inputs['attention_mask'].unsqueeze(-1)
            pooled = (hidden_states * mask).sum(1) / mask.sum(1)
            return torch.nn.functional.normalize(pooled, p=2, dim=1)

    # Encode the corpus batch by batch, keeping the results on the CPU.
    doc_embeddings_list = []
    for start in tqdm(range(0, len(corpus), batch_size), desc="Computing document embeddings"):
        doc_embeddings_list.append(_mean_pooled(corpus[start:start + batch_size]).cpu())

    doc_embeddings = torch.cat(doc_embeddings_list, dim=0)

    def get_scores(query_text):
        try:
            query_embedding = _mean_pooled(query_text)

            # (n_docs, dim) x (dim, 1)
            similarities = torch.matmul(doc_embeddings, query_embedding.cpu().transpose(0, 1))

            return similarities.squeeze().numpy()

        except Exception as e:
            print(f"Error in bi-encoder scoring: {e}")
            return np.zeros(len(corpus))

    return get_scores

In [14]:
def create_dual_encoder_model(corpus, batch_size=32):
    """
    Build a dual-encoder scorer from the 'multi-qa-mpnet-base-dot-v1'
    sentence transformer, returning min-max scaled similarities.

    NOTE(review): a later cell defines a function with this same name; once
    that cell has run, it replaces this definition.

    Args:
        corpus: Document texts; sliced in steps of `batch_size`.
        batch_size: Number of documents handed to `encode` at a time.
    Returns:
        A function mapping a query text to an array of scores in [0, 1], one
        per document. On any scoring error it prints the error and returns
        zeros.
    """
    model = SentenceTransformer('multi-qa-mpnet-base-dot-v1').to(device)

    print("Computing document embeddings...")
    encoded_chunks = []
    for start in tqdm(range(0, len(corpus), batch_size), desc="Processing documents"):
        chunk_vectors = model.encode(
            corpus[start:start + batch_size],
            convert_to_tensor=True,
            device=device,
            normalize_embeddings=True,
            show_progress_bar=False
        )
        # Keep the document matrix on the CPU.
        encoded_chunks.append(chunk_vectors.cpu())

    doc_embeddings = torch.cat(encoded_chunks, dim=0)

    def get_scores(query_text):
        try:
            query_vector = model.encode(
                query_text,
                convert_to_tensor=True,
                device=device,
                normalize_embeddings=True
            ).cpu()

            # Dot product of unit vectors.
            raw = torch.matmul(doc_embeddings, query_vector).numpy()

            # Min-max scale; the epsilon guards against a zero range.
            lowest = raw.min()
            return (raw - lowest) / (raw.max() - lowest + 1e-8)

        except Exception as e:
            print(f"Error in dual encoder scoring: {e}")
            return np.zeros(len(corpus))

    return get_scores
    


In [15]:

def create_hybrid_model_with_dense_retriever(corpus, alpha=0.85, beta=0.15):
    """
    Build a hybrid scorer: alpha * raw BM25 score + beta * cosine similarity
    from the 'all-mpnet-base-v2' sentence transformer.

    This definition shares its name with one in an earlier cell and replaces
    it once this cell has run.

    NOTE(review): the BM25 term is mixed in unscaled while the cosine term is
    bounded by 1 in magnitude, so alpha and beta do not directly express the
    relative contribution of the two parts. Confirm this is intended.

    Args:
        corpus: Document texts to index.
        alpha: Weight of the BM25 score.
        beta: Weight of the cosine similarity.
    Returns:
        A function mapping a query text to an array of blended scores, one
        per document. On any scoring error it prints the error and returns
        zeros.
    """
    # Lexical index over whitespace tokens, and the dense encoder.
    lexical_index = BM25Okapi([text.split() for text in corpus if isinstance(text, str)])
    encoder = SentenceTransformer('all-mpnet-base-v2').to(device)

    # Document vectors are computed once and stay on the selected device.
    corpus_vectors = encoder.encode(corpus, convert_to_tensor=True, device=device)

    def get_hybrid_scores(query_text):
        try:
            lexical_part = np.array(lexical_index.get_scores(query_text.split()))

            query_vector = encoder.encode(query_text, convert_to_tensor=True, device=device)
            semantic_part = torch.nn.functional.cosine_similarity(
                query_vector.unsqueeze(0),
                corpus_vectors
            ).cpu().numpy()

            return alpha * lexical_part + beta * semantic_part

        except Exception as e:
            print(f"Error in hybrid scoring: {e}")
            return np.zeros(len(corpus))

    return get_hybrid_scores


In [16]:

def create_dual_encoder_model(corpus, batch_size=32):
    """
    Build a dual-encoder scorer from the 'multi-qa-MiniLM-L6-cos-v1' sentence
    transformer; document vectors stay on the selected device.

    This definition shares its name with one in an earlier cell and replaces
    it once this cell has run.

    Args:
        corpus: Document texts to embed.
        batch_size: Batch size passed to `encode` for the corpus.
    Returns:
        A function mapping a query text to the dot products between the
        unit-length query vector and every unit-length document vector. On
        any scoring error it prints the error and returns zeros.
    """
    l2_normalize = torch.nn.functional.normalize
    model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1').to(device)

    print("Computing document embeddings...")
    raw_doc_vectors = model.encode(
        corpus,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_tensor=True,
        device=device
    )
    # Row-wise unit length.
    doc_embeddings = l2_normalize(raw_doc_vectors, p=2, dim=1)

    def get_scores(query_text):
        try:
            raw_query_vector = model.encode(
                query_text,
                convert_to_tensor=True,
                show_progress_bar=False,
                device=device
            )
            unit_query = l2_normalize(raw_query_vector, p=2, dim=0)

            # Matrix-vector product on the device, then back to NumPy.
            return torch.matmul(doc_embeddings, unit_query).cpu().numpy()

        except Exception as e:
            print(f"Error in dual encoder scoring: {e}")
            return np.zeros(len(corpus))

    return get_scores

In [17]:
def create_topic_aware_semantic_model(corpus, n_topics=50, batch_size=32):
    """
    Build a scorer from 'all-mpnet-base-v2' sentence embeddings whose vectors
    are scaled by an LDA-based "topic clarity" weight and then re-normalised.

    The clarity weight is 1 + (1 - H / log2(n_topics)), where H is the
    base-2 entropy of the text's LDA topic distribution.

    NOTE(review): each embedding is multiplied by a single positive scalar
    and then L2-normalised, which cancels the scalar. As written, the topic
    model therefore does not change the scores. Confirm whether the
    weighting was meant to survive the normalisation.

    Args:
        corpus: Document texts.
        n_topics: Number of LDA topics.
        batch_size: Number of documents embedded at a time.
    Returns:
        A function mapping a query text to one similarity score per document.
        On any scoring error it prints the error and returns zeros.
    """
    # Bag-of-words + LDA for topics, sentence transformer for semantics.
    count_vec = CountVectorizer(max_features=5000, stop_words='english')
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    semantic_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to(device)

    def _clarity_weight(topic_dist):
        # Lower entropy (a more peaked distribution) gives a larger weight.
        entropy = -np.sum(topic_dist * np.log2(topic_dist + 1e-10))
        return 1 + (1 - entropy/np.log2(n_topics))

    print("Computing topic distributions...")
    topic_distributions = lda.fit_transform(count_vec.fit_transform(corpus))

    print("Computing semantic embeddings...")
    weighted_batches = []
    for start in tqdm(range(0, len(corpus), batch_size), desc="Processing documents"):
        texts = corpus[start:start + batch_size]

        vectors = semantic_model.encode(
            texts,
            convert_to_tensor=True,
            device=device,
            normalize_embeddings=True,
            batch_size=batch_size,
            show_progress_bar=False
        )

        # Topic rows that belong to this slice of the corpus.
        topics = topic_distributions[start:start + len(texts)]

        # Scale each vector by its document's clarity weight, then renormalise.
        weighted = torch.stack([
            vec * _clarity_weight(topics[row])
            for row, vec in enumerate(vectors)
        ])
        weighted_batches.append(torch.nn.functional.normalize(weighted, p=2, dim=1).cpu())

    doc_embeddings = torch.cat(weighted_batches, dim=0)

    def get_scores(query_text):
        try:
            query_vector = semantic_model.encode(
                query_text,
                convert_to_tensor=True,
                device=device,
                normalize_embeddings=True
            )

            # Topic distribution of the query under the fitted LDA model.
            query_topics = lda.transform(count_vec.transform([query_text]))[0]

            # Same weighting and renormalisation as on the document side.
            query_vector = query_vector * _clarity_weight(query_topics)
            query_vector = torch.nn.functional.normalize(query_vector, p=2, dim=0)

            return torch.matmul(doc_embeddings, query_vector.cpu()).numpy()

        except Exception as e:
            print(f"Error in topic-aware scoring: {e}")
            return np.zeros(len(corpus))

    return get_scores

In [18]:
def create_mpnet_domain_model(corpus, batch_size=32):
    """
    Build a scorer from the 'sentence-transformers/all-mpnet-base-v2' model.

    Documents are embedded in slices of `batch_size` with unit-length
    normalisation and kept on the CPU.

    Args:
        corpus: Document texts.
        batch_size: Slice size and `encode` batch size.
    Returns:
        A function mapping a query text to the dot products between the
        unit-length query vector and every document vector. On any scoring
        error it prints the error and returns zeros.
    """
    model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to(device)

    def _encode_slice(texts):
        # Unit-length embeddings for one slice of the corpus, moved to CPU.
        return model.encode(
            texts,
            convert_to_tensor=True,
            device=device,
            normalize_embeddings=True,
            batch_size=batch_size,
            show_progress_bar=False
        ).cpu()

    print("Computing document embeddings...")
    doc_embeddings = []
    for offset in tqdm(range(0, len(corpus), batch_size), desc="Processing documents"):
        doc_embeddings.append(_encode_slice(corpus[offset:offset + batch_size]))

    # One (n_docs, dim) matrix.
    doc_embeddings = torch.cat(doc_embeddings, dim=0)

    def get_scores(query_text):
        try:
            query_vector = model.encode(
                query_text,
                convert_to_tensor=True,
                device=device,
                normalize_embeddings=True
            ).cpu()

            # Dot product of unit vectors.
            return torch.matmul(doc_embeddings, query_vector).numpy()

        except Exception as e:
            print(f"Error in scoring: {e}")
            return np.zeros(len(corpus))

    return get_scores

In [19]:
def create_multiqa_mpnet_model(corpus, batch_size=32):
    """
    Build a scorer from 'sentence-transformers/multi-qa-mpnet-base-dot-v1'
    that returns min-max scaled similarities.

    Args:
        corpus: Document texts.
        batch_size: Slice size and `encode` batch size.
    Returns:
        A function mapping a query text to an array of scores scaled to
        [0, 1], one per document. On any scoring error it prints the error
        and returns zeros.
    """
    model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1').to(device)

    # Options shared by every corpus slice.
    encode_options = dict(
        convert_to_tensor=True,
        device=device,
        normalize_embeddings=True,
        batch_size=batch_size,
        show_progress_bar=False
    )

    print("Computing document embeddings...")
    slices = []
    for first in tqdm(range(0, len(corpus), batch_size), desc="Processing documents"):
        slice_vectors = model.encode(corpus[first:first + batch_size], **encode_options)
        slices.append(slice_vectors.cpu())

    doc_embeddings = torch.cat(slices, dim=0)

    def get_scores(query_text):
        try:
            query_vector = model.encode(
                query_text,
                convert_to_tensor=True,
                device=device,
                normalize_embeddings=True
            ).cpu()

            raw = torch.matmul(doc_embeddings, query_vector).numpy()

            # Scale to [0, 1]; the epsilon guards against a zero range.
            floor = raw.min()
            return (raw - floor) / (raw.max() - floor + 1e-8)

        except Exception as e:
            print(f"Error in scoring: {e}")
            return np.zeros(len(corpus))

    return get_scores

In [20]:
def create_simcse_model(corpus, batch_size=32):
    """
    Build a dense scorer backed by the unsupervised SimCSE BERT checkpoint.

    Documents are embedded once, ``batch_size`` at a time, as L2-normalised
    vectors stored on the CPU.  No rescaling is applied to the scores.

    Args:
        corpus: sliceable collection of document strings.
        batch_size: number of documents passed to each ``encode`` call.

    Returns:
        ``get_scores(query_text)``: NumPy array holding the dot product of
        the normalised query embedding with each document embedding.  If
        scoring raises, the error is printed and zeros of length
        ``len(corpus)`` are returned.
    """
    simcse = SentenceTransformer('princeton-nlp/unsup-simcse-bert-base-uncased').to(device)

    def _embed(texts, **encode_options):
        # Same encode settings for documents and queries; result is moved to the CPU.
        return simcse.encode(
            texts,
            convert_to_tensor=True,
            device=device,
            normalize_embeddings=True,
            **encode_options,
        ).cpu()

    print("Computing document embeddings...")
    pieces = []
    for offset in tqdm(range(0, len(corpus), batch_size), desc="Processing documents"):
        window = corpus[offset:offset + batch_size]
        pieces.append(_embed(window, batch_size=batch_size, show_progress_bar=False))
    doc_matrix = torch.cat(pieces, dim=0)

    def get_scores(query_text):
        try:
            query_vec = _embed(query_text)
            return torch.matmul(doc_matrix, query_vec).numpy()
        except Exception as e:
            print(f"Error in scoring: {e}")
            return np.zeros(len(corpus))

    return get_scores

In [21]:
def create_bge_v15_model(corpus, batch_size=32):
    """
    Build a dense-retrieval scorer backed by BAAI/bge-large-en-v1.5.

    Every document is prefixed with a fixed instruction string, embedded in
    slices of ``batch_size`` texts with L2-normalised embeddings, and stored
    on the CPU.

    Args:
        corpus: sliceable collection of document strings.
        batch_size: number of documents passed to each ``encode`` call.

    Returns:
        ``get_scores(query_text)``: prefixes the query with the same
        instruction, embeds it, and returns a NumPy array with the dot
        product of the query embedding and each document embedding.  If
        scoring raises, the error is printed (naming the model that is
        actually loaded) and zeros of length ``len(corpus)`` are returned.
    """
    model_id = 'BAAI/bge-large-en-v1.5'
    # Single source of truth for the prefix so documents and queries cannot drift apart.
    # NOTE(review): this is a custom, domain-specific instruction applied to both
    # sides; check it against the model card's recommended usage.
    instruction = "Represent this hotel review for retrieval: "

    model = SentenceTransformer(model_id).to(device)

    print("Computing document embeddings...")
    doc_embeddings = []

    for i in tqdm(range(0, len(corpus), batch_size), desc="Processing documents"):
        batch_texts = [f"{instruction}{text}" for text in corpus[i:i + batch_size]]

        batch_embeddings = model.encode(
            batch_texts,
            convert_to_tensor=True,
            device=device,
            normalize_embeddings=True,
            batch_size=batch_size,
            show_progress_bar=False
        )
        # Keep the index on the CPU so it does not occupy accelerator memory.
        doc_embeddings.append(batch_embeddings.cpu())

    doc_embeddings = torch.cat(doc_embeddings, dim=0)

    def get_scores(query_text):
        try:
            query_embedding = model.encode(
                f"{instruction}{query_text}",
                convert_to_tensor=True,
                device=device,
                normalize_embeddings=True
            )

            query_embedding = query_embedding.cpu()
            # Both sides are normalised, so the dot product is a cosine similarity.
            similarities = torch.matmul(doc_embeddings, query_embedding)
            return similarities.numpy()

        except Exception as e:
            # Report the model that is really in use (previously mislabelled as BGE-M3).
            print(f"Error in {model_id} scoring: {e}")
            return np.zeros(len(corpus))

    return get_scores

In [22]:
def create_uae_model(corpus, batch_size=16):
    """
    Build a dense scorer backed by WhereIsAI/UAE-Large-V1.

    Documents are embedded once, ``batch_size`` at a time, as L2-normalised
    vectors kept on the CPU.

    Args:
        corpus: sliceable collection of document strings.
        batch_size: number of documents passed to each ``encode`` call.

    Returns:
        ``get_scores(query_text)``: computes the dot product between the
        normalised query embedding and every document embedding, then turns
        those similarities into a softmax distribution with temperature
        0.05 (the returned values sum to 1).  If scoring raises, the error
        is printed and zeros of length ``len(corpus)`` are returned.
    """
    uae = SentenceTransformer('WhereIsAI/UAE-Large-V1').to(device)

    print("Computing document embeddings...")
    embedded_batches = []
    batch_offsets = range(0, len(corpus), batch_size)
    for offset in tqdm(batch_offsets, desc="Processing documents"):
        encoded = uae.encode(
            corpus[offset:offset + batch_size],
            convert_to_tensor=True,
            device=device,
            normalize_embeddings=True,
            batch_size=batch_size,
            show_progress_bar=False,
        )
        embedded_batches.append(encoded.cpu())

    doc_matrix = torch.cat(embedded_batches, dim=0)

    def get_scores(query_text):
        try:
            query_vec = uae.encode(
                query_text,
                convert_to_tensor=True,
                device=device,
                normalize_embeddings=True,
            ).cpu()

            # Normalised vectors: the dot product is the cosine similarity.
            cosine = torch.matmul(doc_matrix, query_vec).numpy()

            # Softmax with a low temperature to sharpen differences between documents.
            temperature = 0.05
            weights = np.exp(cosine / temperature)
            return weights / np.sum(weights)

        except Exception as e:
            print(f"Error in UAE scoring: {e}")
            return np.zeros(len(corpus))

    return get_scores

In [23]:
def create_mxbai_embed_model(corpus, batch_size=32):
    """
    Build a dense scorer backed by mixedbread-ai/mxbai-embed-large-v1.

    Each document is prefixed with a fixed prompt, embedded in slices of
    ``batch_size`` texts with L2-normalised embeddings, and stored on the CPU.

    Args:
        corpus: sliceable collection of document strings.
        batch_size: number of documents passed to each ``encode`` call.

    Returns:
        ``get_scores(query_text)``: prefixes the query with the same prompt,
        embeds it, and returns a NumPy array with the dot product of the
        query embedding and each document embedding.  If scoring raises, the
        error is printed and zeros of length ``len(corpus)`` are returned.
    """
    # NOTE(review): the prompt is applied to documents as well as queries;
    # confirm against the model card whether it is meant for queries only.
    prompt = "Represent this sentence for searching relevant passages: "

    embedder = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1').to(device)

    print("Computing document embeddings...")
    collected = []
    for begin in tqdm(range(0, len(corpus), batch_size), desc="Processing documents"):
        prompted = [f"{prompt}{text}" for text in corpus[begin:begin + batch_size]]
        vectors = embedder.encode(
            prompted,
            convert_to_tensor=True,
            device=device,
            normalize_embeddings=True,
            batch_size=batch_size,
            show_progress_bar=False,
        )
        collected.append(vectors.cpu())

    doc_matrix = torch.cat(collected, dim=0)

    def get_scores(query_text):
        try:
            query_vec = embedder.encode(
                f"{prompt}{query_text}",
                convert_to_tensor=True,
                device=device,
                normalize_embeddings=True,
            ).cpu()
            return torch.matmul(doc_matrix, query_vec).numpy()

        except Exception as e:
            print(f"Error in mxbai-embed scoring: {e}")
            return np.zeros(len(corpus))

    return get_scores

In [24]:
def create_mxbai_colbert_model(corpus, batch_size=16):
    """
    Build a dense scorer from mixedbread-ai/mxbai-colbert-large-v1 using the
    Hugging Face ``AutoTokenizer``/``AutoModel`` classes with mean pooling.

    Each text is tokenized (truncated to 512 tokens), run through the model
    without gradients, mean-pooled over its attention mask and L2-normalised.
    Document embeddings are computed once and kept on the CPU.

    NOTE(review): the checkpoint name suggests a ColBERT-style per-token
    model, while this function collapses every text into a single pooled
    vector; confirm that is the intended use.

    Args:
        corpus: sliceable collection of document strings.
        batch_size: number of documents tokenized and encoded per step.

    Returns:
        ``get_scores(query_text)``: returns a 1-D NumPy array with one entry
        per document, holding the softmax (temperature 0.05) of the dot
        products between the query and document embeddings.  The array is
        1-D even when the corpus holds a single document.  If scoring
        raises, the error is printed and zeros of length ``len(corpus)`` are
        returned.
    """
    model_name = 'mixedbread-ai/mxbai-colbert-large-v1'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)

    def _embed(texts):
        # Tokenize, encode, mean-pool over real tokens, L2-normalise; result on the CPU.
        encoded = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        ).to(device)

        with torch.no_grad():
            outputs = model(**encoded)
            # The mask zeroes padding positions so they do not contribute to the mean.
            attention_mask = encoded['attention_mask'].unsqueeze(-1)
            token_embeddings = outputs.last_hidden_state
            pooled = (token_embeddings * attention_mask).sum(1) / attention_mask.sum(1)
            pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
        return pooled.cpu()

    print("Computing document embeddings...")
    doc_embeddings = []

    for i in tqdm(range(0, len(corpus), batch_size), desc="Processing documents"):
        batch_texts = [
            f"Represent this document for retrieval: {text}"
            for text in corpus[i:i + batch_size]
        ]
        doc_embeddings.append(_embed(batch_texts))

    doc_embeddings = torch.cat(doc_embeddings, dim=0)

    def get_scores(query_text):
        try:
            query_embedding = _embed(f"Represent this query for retrieval: {query_text}")

            similarities = torch.matmul(doc_embeddings, query_embedding.T)
            # Drop only the query axis: a bare squeeze() would also collapse the
            # document axis when the corpus holds exactly one document.
            scores = similarities.squeeze(1).numpy()

            # Softmax with a low temperature to sharpen differences between documents.
            temperature = 0.05
            scores = np.exp(scores / temperature)
            scores = scores / np.sum(scores)

            return scores

        except Exception as e:
            print(f"Error in mxbai-colbert scoring: {e}")
            return np.zeros(len(corpus))

    return get_scores

#### Evaluate Models


In [25]:
# Draw 100 rows of `data` to serve as evaluation queries.
# random_state=42 pins the sample so a rerun selects the same rows.
query_data = data.sample(n=100, random_state=42)

In [26]:
# Build the hybrid model over the full list of processed reviews.
# The factory is defined outside this section; the progress bars in this
# cell's output are model files being downloaded.
hybrid_model = create_hybrid_model_with_dense_retriever(data['processed_reviews'].tolist())


.gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/436M [00:00<?, ?B/s]

model_O1.onnx:   0%|          | 0.00/436M [00:00<?, ?B/s]

model_O2.onnx:   0%|          | 0.00/436M [00:00<?, ?B/s]

model_O3.onnx:   0%|          | 0.00/436M [00:00<?, ?B/s]

model_O4.onnx:   0%|          | 0.00/218M [00:00<?, ?B/s]

model_qint8_arm64.onnx:   0%|          | 0.00/110M [00:00<?, ?B/s]

model_qint8_avx512.onnx:   0%|          | 0.00/110M [00:00<?, ?B/s]

model_qint8_avx512_vnni.onnx:   0%|          | 0.00/110M [00:00<?, ?B/s]

model_quint8_avx2.onnx:   0%|          | 0.00/110M [00:00<?, ?B/s]

openvino_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

openvino/openvino_model.xml:   0%|          | 0.00/433k [00:00<?, ?B/s]

openvino_model_qint8_quantized.bin:   0%|          | 0.00/110M [00:00<?, ?B/s]

(…)nvino/openvino_model_qint8_quantized.xml:   0%|          | 0.00/742k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [29]:
# Evaluate the hybrid model on the sampled queries against the full dataset.
# Slow step: the recorded run below reports roughly 12.8 s per query.
hybrid_scores = evaluate_model(hybrid_model, query_data, data)


Evaluating queries: 100%|██████████| 100/100 [21:24<00:00, 12.85s/it]


Average scoring time per query: 12.8443 seconds
Total scoring time: 1284.4333 seconds
Successfully evaluated queries: 100/100
MSE Average: 0.4910



