# TripAdvisor Recommendation System


In [71]:
## 1. Import Libraries
import pandas as pd
import numpy as np
import ast
from rank_bm25 import BM25Okapi
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string


In [43]:

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /Users/alexs/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/alexs/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [44]:
# Load the CSV file
df = pd.read_csv('reviews.csv')

print(f"Original DataFrame shape: {df.shape}")

# Convert the 'ratings' column from string to dictionary
df['ratings'] = df['ratings'].apply(ast.literal_eval)

# Define required aspects
required_aspects = ["service", "cleanliness", "overall", "value", "location", "sleep_quality", "rooms"]

# Filter rows with at least the required aspects
df_filtered = df[df['ratings'].apply(lambda x: all(aspect in x for aspect in required_aspects))]

print(f"DataFrame shape after filtering: {df_filtered.shape}")

if df_filtered.empty:
    print("No reviews found with all required aspects. Printing unique aspects found in the dataset:")
    all_aspects = set()
    for rating in df['ratings']:
        all_aspects.update(rating.keys())
    print(sorted(all_aspects))
    data = pd.DataFrame(columns=['offering_id'] + required_aspects + ['reviews'])
else:
    # Group by offering_id
    data = df_filtered.groupby('offering_id').agg({
        'text': ' '.join,  # Concatenate all reviews
        'ratings': list  # Keep all ratings
    }).reset_index()

    # Calculate average ratings for each aspect
    for aspect in required_aspects:
        data[aspect] = data['ratings'].apply(lambda x: np.mean([review.get(aspect, np.nan) for review in x]))

    # Rename 'text' column to 'reviews'
    data = data.rename(columns={'text': 'reviews'})

    # Select and order the final columns
    final_columns = ['offering_id'] + required_aspects + ['reviews']
    data = data[final_columns]

# Print the first few rows and shape of the processed data
print(data.head())
print("\nDataFrame shape:", data.shape)

# Print column names to verify
print("\nColumn names:", data.columns.tolist())

Original DataFrame shape: (878561, 10)
DataFrame shape after filtering: (436391, 10)
   offering_id   service  cleanliness   overall     value  location  \
0        72572  4.601010     4.636364  4.388889  4.323232  4.570707   
1        72579  4.232000     4.240000  3.888000  4.152000  4.192000   
2        72586  4.250000     4.287879  4.045455  4.053030  4.537879   
3        72598  3.243243     3.243243  2.918919  3.054054  3.027027   
4        73236  4.277778     3.111111  3.388889  3.777778  4.111111   

   sleep_quality     rooms                                            reviews  
0       4.333333  4.282828  I had to make fast visit to seattle and I foun...  
1       3.768000  3.856000  Great service, rooms were clean, could use som...  
2       4.113636  3.992424  Beautiful views of the space needle - especial...  
3       3.270270  3.189189  This hotel is in need of some serious updates....  
4       3.722222  3.222222  My experience at this days inn was perfect. th...  

DataFra

In [45]:
## 3. Text Preprocessing

def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]
    
    return ' '.join(tokens)

# Apply text preprocessing to reviews
data['processed_review'] = data['reviews'].apply(preprocess_text)



#### Model Definition and eval function



In [72]:

def create_bm25_model(corpus):
    try:
        tokenized_corpus = [doc.split() for doc in corpus if isinstance(doc, str)]
        bm25 = BM25Okapi(tokenized_corpus)
        return bm25
    except Exception as e:
        print(f"Error creating BM25 model: {e}")
        return None

In [73]:
def create_hybrid_model_with_dense_retriever(corpus, alpha=0.85, beta=0.15):
    """
    Creates a hybrid model combining BM25 and SentenceTransformer (all-mpnet-base-v2)
    with proper tensor handling for Apple Silicon
    """
    # Initialize models
    tokenized_corpus = [doc.split() for doc in corpus if isinstance(doc, str)]
    bm25 = BM25Okapi(tokenized_corpus)
    dense_model = SentenceTransformer('all-mpnet-base-v2')
    
    # Pre-compute document embeddings and move to CPU
    doc_embeddings = dense_model.encode(corpus, convert_to_tensor=True)
    doc_embeddings = doc_embeddings.cpu()  # Move to CPU
    
    def get_hybrid_scores(query_text):
        try:
            # 1. Get BM25 scores
            query_tokens = query_text.split()
            bm25_scores = np.array(bm25.get_scores(query_tokens))
                    
            # 2. Get dense retriever scores
            query_embedding = dense_model.encode(query_text, convert_to_tensor=True)
            query_embedding = query_embedding.cpu()  # Move to CPU
            
            # Reshape embeddings for cosine similarity
            query_embedding_reshaped = query_embedding.reshape(1, -1)
            doc_embeddings_reshaped = doc_embeddings.reshape(len(corpus), -1)
            
            # Calculate cosine similarity
            dense_scores = cosine_similarity(query_embedding_reshaped, doc_embeddings_reshaped)[0]
              
            # Combine scores
            final_scores = alpha * bm25_scores + beta * dense_scores
            
            return final_scores
            
        except Exception as e:
            print(f"Error in hybrid scoring: {e}")
            return np.zeros(len(corpus))
    
    return get_hybrid_scores

In [74]:
def evaluate_model(model, query_data, full_data, model_type='hybrid', k=5):
    """
    Improved evaluation function for recommendation models
    Args:
        model: The model to evaluate (BM25 or hybrid)
        query_data: DataFrame containing query samples
        full_data: Complete DataFrame with all hotels
        model_type: 'bm25' or 'hybrid'
        k: Number of similar hotels to consider
    Returns:
        float: Average MSE across all queries
    """
    mse_scores = []
    aspects = ["service", "cleanliness", "overall", "value", "location", "sleep_quality", "rooms"]
    
    for idx, query in query_data.iterrows():
        try:
            # Get similarity scores based on model type
            if model_type == 'bm25':
                tokenized_query = query['processed_review'].split()
                scores = model.get_scores(tokenized_query)
            else:  # hybrid model
                scores = model(query['processed_review'])
            
            # Create mask to exclude the query hotel itself
            mask = np.ones(len(scores), dtype=bool)
            query_hotel_id = query['offering_id']
            mask[full_data['offering_id'] == query_hotel_id] = False
            
            # Apply mask and get top-k indices
            masked_scores = scores[mask]
            masked_indices = np.argsort(masked_scores)[-k:]
            
            # Map masked indices back to original indices
            top_k_idx = np.where(mask)[0][masked_indices]
            
            # Calculate ratings
            similar_ratings = []
            query_ratings = []
            
            for aspect in aspects:
                # Get average rating for similar hotels
                avg_rating = full_data.iloc[top_k_idx][aspect].mean()
                similar_ratings.append(avg_rating)
                query_ratings.append(query[aspect])
            
            # Calculate MSE
            mse = mean_squared_error(query_ratings, similar_ratings)
            mse_scores.append(mse)
            
        except Exception as e:
            print(f"Error processing query {idx}: {e}")
            continue
    
    # Return average MSE
    if not mse_scores:
        print("Warning: No valid evaluations were performed")
        return float('inf')
    
    avg_mse = np.mean(mse_scores)
    std_mse = np.std(mse_scores)
    print(f"Number of evaluated queries: {len(mse_scores)}")
    print(f"MSE Average for {model_type} model: {avg_mse:.4f}")
    print(f"MSE Standard Deviation for {model_type} model: {std_mse:.4f}")
    
    return avg_mse

#### Evaluate Models


In [75]:
# Select a subset of data for querying (e.g., 100 random samples)
query_data = data.sample(n=100, random_state=42)

In [76]:

# Create models using the full dataset
bm25_model_full = create_bm25_model(data['processed_review'])

bm25_score = evaluate_model(bm25_model_full, query_data, data, 'bm25')


Number of evaluated queries: 100
MSE Average for bm25 model: 0.4885
MSE Standard Deviation for bm25 model: 0.8819


In [77]:
from sentence_transformers import util

# Create and evaluate the hybrid model
hybrid_model = create_hybrid_model_with_dense_retriever(
    corpus=data['processed_review'],
    alpha=0.75,  # Weight for BM25
    beta=0.25    # Weight for dense retriever
)

# Evaluate the model
hybrid_score = evaluate_model(hybrid_model, query_data, data, 'hybrid')



Number of evaluated queries: 100
MSE Average for hybrid model: 0.4891
MSE Standard Deviation for hybrid model: 0.8818
