# TripAdvisor Recommendation System


In [4]:
## 1. Import Libraries
import pandas as pd
import numpy as np
import ast
from rank_bm25 import BM25Okapi
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import AutoModel
from torch.nn.functional import softmax
import torch
from sentence_transformers import SentenceTransformer



In [5]:

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Load the CSV file
df = pd.read_csv('reviews.csv')

print(f"Original DataFrame shape: {df.shape}")

# Convert the 'ratings' column from string to dictionary
df['ratings'] = df['ratings'].apply(ast.literal_eval)

# Define required aspects
required_aspects = ["service", "cleanliness", "overall", "value", "location", "sleep_quality", "rooms"]

# Filter rows with at least the required aspects
df_filtered = df[df['ratings'].apply(lambda x: all(aspect in x for aspect in required_aspects))]

print(f"DataFrame shape after filtering: {df_filtered.shape}")

if df_filtered.empty:
    print("No reviews found with all required aspects. Printing unique aspects found in the dataset:")
    all_aspects = set()
    for rating in df['ratings']:
        all_aspects.update(rating.keys())
    print(sorted(all_aspects))
    data = pd.DataFrame(columns=['offering_id'] + required_aspects + ['reviews'])
else:
    # Group by offering_id
    data = df_filtered.groupby('offering_id').agg({
        'text': ' '.join,  # Concatenate all reviews
        'ratings': list  # Keep all ratings
    }).reset_index()

    # Calculate average ratings for each aspect
    for aspect in required_aspects:
        data[aspect] = data['ratings'].apply(lambda x: np.mean([review.get(aspect, np.nan) for review in x]))

    # Rename 'text' column to 'reviews'
    data = data.rename(columns={'text': 'reviews'})

    # Select and order the final columns
    final_columns = ['offering_id'] + required_aspects + ['reviews']
    data = data[final_columns]

# Print the first few rows and shape of the processed data
print(data.head())
print("\nDataFrame shape:", data.shape)

# Print column names to verify
print("\nColumn names:", data.columns.tolist())

Original DataFrame shape: (878561, 10)
DataFrame shape after filtering: (436391, 10)
   offering_id   service  cleanliness   overall     value  location  \
0        72572  4.601010     4.636364  4.388889  4.323232  4.570707   
1        72579  4.232000     4.240000  3.888000  4.152000  4.192000   
2        72586  4.250000     4.287879  4.045455  4.053030  4.537879   
3        72598  3.243243     3.243243  2.918919  3.054054  3.027027   
4        73236  4.277778     3.111111  3.388889  3.777778  4.111111   

   sleep_quality     rooms                                            reviews  
0       4.333333  4.282828  I had to make fast visit to seattle and I foun...  
1       3.768000  3.856000  Great service, rooms were clean, could use som...  
2       4.113636  3.992424  Beautiful views of the space needle - especial...  
3       3.270270  3.189189  This hotel is in need of some serious updates....  
4       3.722222  3.222222  My experience at this days inn was perfect. th...  

DataFra

In [7]:
## 3. Text Preprocessing

def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]
    
    return ' '.join(tokens)

# Apply text preprocessing to reviews
data['processed_review'] = data['reviews'].apply(preprocess_text)



#### Evaluation function

In [8]:
def evaluate_model(model, query_data, full_data, model_type='hybrid', k=5):
    """
    Improved evaluation function for recommendation models
    Args:
        model: The model to evaluate (BM25 or hybrid)
        query_data: DataFrame containing query samples
        full_data: Complete DataFrame with all hotels
        model_type: 'bm25' or 'hybrid'
        k: Number of similar hotels to consider
    Returns:
        float: Average MSE across all queries
    """
    mse_scores = []
    aspects = ["service", "cleanliness", "overall", "value", "location", "sleep_quality", "rooms"]
    
    for idx, query in query_data.iterrows():
        try:
            # Get similarity scores based on model type
            if model_type == 'bm25':
                tokenized_query = query['processed_review'].split()
                scores = model.get_scores(tokenized_query)
            else:  # hybrid model
                scores = model(query['processed_review'])
            
            # Create mask to exclude the query hotel itself
            mask = np.ones(len(scores), dtype=bool)
            query_hotel_id = query['offering_id']
            mask[full_data['offering_id'] == query_hotel_id] = False
            
            # Apply mask and get top-k indices
            masked_scores = scores[mask]
            masked_indices = np.argsort(masked_scores)[-k:]
            
            # Map masked indices back to original indices
            top_k_idx = np.where(mask)[0][masked_indices]
            
            # Calculate ratings
            similar_ratings = []
            query_ratings = []
            
            for aspect in aspects:
                # Get average rating for similar hotels
                avg_rating = full_data.iloc[top_k_idx][aspect].mean()
                similar_ratings.append(avg_rating)
                query_ratings.append(query[aspect])
            
            # Calculate MSE
            mse = mean_squared_error(query_ratings, similar_ratings)
            mse_scores.append(mse)
            
        except Exception as e:
            print(f"Error processing query {idx}: {e}")
            continue
    
    # Return average MSE
    if not mse_scores:
        print("Warning: No valid evaluations were performed")
        return float('inf')
    
    avg_mse = np.mean(mse_scores)
    std_mse = np.std(mse_scores)
    print(f"Number of evaluated queries: {len(mse_scores)}")
    print(f"MSE Average for {model_type} model: {avg_mse:.4f}")
    print(f"MSE Standard Deviation for {model_type} model: {std_mse:.4f}")
    
    return avg_mse

#### Model Definition and eval function



In [9]:

def create_bm25_model(corpus):
    try:
        tokenized_corpus = [doc.split() for doc in corpus if isinstance(doc, str)]
        bm25 = BM25Okapi(tokenized_corpus)
        return bm25
    except Exception as e:
        print(f"Error creating BM25 model: {e}")
        return None

In [10]:
def create_hybrid_model_with_dense_retriever(corpus, alpha=0.85, beta=0.15):
    """
    Creates a hybrid model combining BM25 and SentenceTransformer (all-mpnet-base-v2)
    with proper tensor handling for Apple Silicon
    """
    # Initialize models
    tokenized_corpus = [doc.split() for doc in corpus if isinstance(doc, str)]
    bm25 = BM25Okapi(tokenized_corpus)
    dense_model = SentenceTransformer('all-mpnet-base-v2')
    
    # Pre-compute document embeddings and move to CPU
    doc_embeddings = dense_model.encode(corpus, convert_to_tensor=True)
    doc_embeddings = doc_embeddings.cpu()  # Move to CPU
    
    def get_hybrid_scores(query_text):
        try:
            # 1. Get BM25 scores
            query_tokens = query_text.split()
            bm25_scores = np.array(bm25.get_scores(query_tokens))
                    
            # 2. Get dense retriever scores
            query_embedding = dense_model.encode(query_text, convert_to_tensor=True)
            query_embedding = query_embedding.cpu()  # Move to CPU
            
            # Reshape embeddings for cosine similarity
            query_embedding_reshaped = query_embedding.reshape(1, -1)
            doc_embeddings_reshaped = doc_embeddings.reshape(len(corpus), -1)
            
            # Calculate cosine similarity
            dense_scores = cosine_similarity(query_embedding_reshaped, doc_embeddings_reshaped)[0]
              
            # Combine scores
            final_scores = alpha * bm25_scores + beta * dense_scores
            
            return final_scores
            
        except Exception as e:
            print(f"Error in hybrid scoring: {e}")
            return np.zeros(len(corpus))
    
    return get_hybrid_scores

In [11]:


def create_cross_encoder_hybrid_model(corpus, top_k=100):
    """
    Creates a hybrid model using BM25 for initial retrieval and Cross-Encoder for re-ranking
    """
    # Initialize BM25
    tokenized_corpus = [doc.split() for doc in corpus if isinstance(doc, str)]
    bm25 = BM25Okapi(tokenized_corpus)
    
    # Initialize Cross-Encoder
    model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
    cross_encoder = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    def get_hybrid_scores(query_text):
        try:
            # 1. Get initial candidates using BM25
            query_tokens = query_text.split()
            bm25_scores = np.array(bm25.get_scores(query_tokens))
            top_k_idx = np.argsort(bm25_scores)[-top_k:]
            
            # 2. Prepare pairs for cross-encoder
            pairs = []
            for idx in top_k_idx:
                pairs.append([query_text, corpus[idx]])
            
            # 3. Cross-encoder scoring
            features = tokenizer(
                pairs,
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=512
            )
            
            with torch.no_grad():
                scores = cross_encoder(**features)
                scores = torch.sigmoid(scores.logits).squeeze().numpy()
            
            # 4. Create final scores array
            final_scores = np.zeros(len(corpus))
            final_scores[top_k_idx] = scores
            
            return final_scores
            
        except Exception as e:
            print(f"Error in hybrid scoring: {e}")
            return np.zeros(len(corpus))
    
    return get_hybrid_scores


In [12]:

def create_colbert_model(corpus, max_length=128):
    """
    Creates a ColBERT-style model that performs better than standard dense retrievers
    by using contextual late interaction
    """
    # Initialize BERT model and tokenizer
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    # Pre-compute document embeddings
    doc_embeddings = []
    
    for doc in corpus:
        # Tokenize document
        tokens = tokenizer(
            doc,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )
        
        # Get BERT embeddings
        with torch.no_grad():
            outputs = model(**tokens)
            embeddings = outputs.last_hidden_state.squeeze()  # [seq_len, hidden_dim]
            doc_embeddings.append(embeddings)
    
    def get_scores(query_text):
        try:
            # Tokenize query
            query_tokens = tokenizer(
                query_text,
                padding='max_length',
                truncation=True,
                max_length=max_length,
                return_tensors='pt'
            )
            
            # Get query embeddings
            with torch.no_grad():
                query_outputs = model(**query_tokens)
                query_embeddings = query_outputs.last_hidden_state.squeeze()  # [seq_len, hidden_dim]
            
            # Calculate MaxSim scores for each document
            scores = []
            for doc_emb in doc_embeddings:
                # Calculate similarity matrix between query and document tokens
                sim_matrix = torch.matmul(query_embeddings, doc_emb.T)  # [query_len, doc_len]
                
                # Max-pool over document dimension
                max_sim = torch.max(sim_matrix, dim=1)[0]  # [query_len]
                
                # Sum over query tokens (with optional masking of padding)
                score = torch.mean(max_sim).item()
                scores.append(score)
            
            return np.array(scores)
            
        except Exception as e:
            print(f"Error in ColBERT scoring: {e}")
            return np.zeros(len(corpus))
    
    return get_scores

In [13]:
def create_bert_biencoder_model(corpus, batch_size=32):
    """
    Creates a bi-encoder model using BERT embeddings for efficient semantic search
    
    Args:
        corpus: List of text documents
        batch_size: Batch size for processing documents
    Returns:
        scoring function that computes similarity between query and documents
    """
    # Initialize BERT model and tokenizer
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'  # Lightweight but effective model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    # Pre-compute document embeddings in batches
    doc_embeddings = []
    
    for i in range(0, len(corpus), batch_size):
        batch = corpus[i:i + batch_size]
        
        # Tokenize and encode batch
        encoded = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors='pt'
        )
        
        # Get BERT embeddings
        with torch.no_grad():
            outputs = model(**encoded)
            # Use [CLS] token embedding as document representation
            batch_embeddings = outputs.last_hidden_state[:, 0, :]
            doc_embeddings.append(batch_embeddings)
    
    # Concatenate all batches
    doc_embeddings = torch.cat(doc_embeddings, dim=0)
    
    def get_scores(query_text):
        try:
            # Encode query
            query_encoded = tokenizer(
                query_text,
                padding=True,
                truncation=True,
                max_length=256,
                return_tensors='pt'
            )
            
            # Get query embedding
            with torch.no_grad():
                query_outputs = model(**query_encoded)
                query_embedding = query_outputs.last_hidden_state[:, 0, :]  # [1, hidden_dim]
            
            # Calculate cosine similarity with all documents
            similarities = torch.nn.functional.cosine_similarity(
                query_embedding.unsqueeze(0),  # [1, 1, hidden_dim]
                doc_embeddings.unsqueeze(1),   # [num_docs, 1, hidden_dim]
                dim=2
            )
            
            return similarities.squeeze().numpy()
            
        except Exception as e:
            print(f"Error in bi-encoder scoring: {e}")
            return np.zeros(len(corpus))
    
    return get_scores

In [14]:
def create_cross_encoder_hybrid_model(corpus, top_k=50):
    """
    Creates a hybrid model using BM25 for initial retrieval and Cross-Encoder for re-ranking
    with memory optimization
    """
    # Initialize BM25
    tokenized_corpus = [doc.split() for doc in corpus if isinstance(doc, str)]
    bm25 = BM25Okapi(tokenized_corpus)
    
    # Initialize Cross-Encoder
    model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
    cross_encoder = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    def get_hybrid_scores(query_text):
        try:
            # 1. Get initial candidates using BM25
            query_tokens = query_text.split()
            bm25_scores = np.array(bm25.get_scores(query_tokens))
            top_k_idx = np.argsort(bm25_scores)[-top_k:]
            
            # 2. Process in smaller batches
            batch_size = 10
            final_scores = np.zeros(len(corpus))
            
            for i in range(0, len(top_k_idx), batch_size):
                batch_idx = top_k_idx[i:i + batch_size]
                pairs = [(query_text, corpus[idx]) for idx in batch_idx]
                
                features = tokenizer.batch_encode_plus(
                    pairs,
                    max_length=256,
                    padding=True,
                    truncation=True,
                    return_tensors="pt"
                )
                
                with torch.no_grad():
                    outputs = cross_encoder(**features)
                    # Handle both single-label and multi-label cases
                    if outputs.logits.shape[1] == 1:
                        # For single-label case
                        scores = torch.sigmoid(outputs.logits).squeeze(-1).numpy()
                    else:
                        # For multi-label case
                        scores = softmax(outputs.logits, dim=1)[:, 1].numpy()
                
                final_scores[batch_idx] = scores
                
                # Clear CUDA cache if using GPU
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            
            return final_scores
            
        except Exception as e:
            print(f"Error in hybrid scoring: {e}")
            print(f"Shape of logits: {outputs.logits.shape if 'outputs' in locals() else 'unknown'}")
            return np.zeros(len(corpus))
    
    return get_hybrid_scores



In [15]:
def create_dual_encoder_model(corpus, batch_size=32):
    """
    Creates a dual encoder model using pre-trained sentence transformers
    with mean pooling and attention mechanism for better semantic matching
    
    Args:
        corpus: List of text documents
        batch_size: Batch size for processing documents
    Returns:
        scoring function that computes similarity between query and documents
    """
    # Initialize model - using a different pre-trained model specialized for semantic search
    model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
    
    # Pre-compute document embeddings in batches with progress bar
    print("Computing document embeddings...")
    doc_embeddings = model.encode(
        corpus,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_tensor=True
    )
    
    # Move embeddings to CPU and normalize
    doc_embeddings = doc_embeddings.cpu()
    doc_embeddings = torch.nn.functional.normalize(doc_embeddings, p=2, dim=1)
    
    def get_scores(query_text):
        try:
            # Encode query
            query_embedding = model.encode(
                query_text,
                convert_to_tensor=True,
                show_progress_bar=False
            )
            
            # Move to CPU and normalize
            query_embedding = query_embedding.cpu()
            query_embedding = torch.nn.functional.normalize(query_embedding, p=2, dim=0)
            
            # Calculate cosine similarity efficiently
            similarities = torch.matmul(
                doc_embeddings, 
                query_embedding
            )
            
            return similarities.numpy()
            
        except Exception as e:
            print(f"Error in dual encoder scoring: {e}")
            return np.zeros(len(corpus))
    
    return get_scores

#### Evaluate Models


In [16]:
# Select a subset of data for querying (e.g., 100 random samples)
query_data = data.sample(n=100, random_state=42)

In [17]:
dual_encoder = create_dual_encoder_model(data['processed_review'].tolist())
dual_encoder_score = evaluate_model(dual_encoder, query_data, data, 'dual_encoder')


.gitattributes:   0%|          | 0.00/791 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

model_O1.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

model_O2.onnx:   0%|          | 0.00/90.3M [00:00<?, ?B/s]

model_O3.onnx:   0%|          | 0.00/90.3M [00:00<?, ?B/s]

model_O4.onnx:   0%|          | 0.00/45.2M [00:00<?, ?B/s]

model_qint8_arm64.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

model_qint8_avx512.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

model_qint8_avx512_vnni.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

model_quint8_avx2.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

openvino_model.bin:   0%|          | 0.00/90.3M [00:00<?, ?B/s]

openvino/openvino_model.xml:   0%|          | 0.00/212k [00:00<?, ?B/s]

openvino_model_qint8_quantized.bin:   0%|          | 0.00/22.9M [00:00<?, ?B/s]

(…)nvino/openvino_model_qint8_quantized.xml:   0%|          | 0.00/368k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Computing document embeddings...


Batches:   0%|          | 0/118 [00:00<?, ?it/s]

Number of evaluated queries: 100
MSE Average for dual_encoder model: 0.3694
MSE Standard Deviation for dual_encoder model: 0.5199
