### 1. Set up Weaviate

In [2]:
pip show protobuf

Name: protobuf
Version: 3.20.3
Summary: Protocol Buffers
Home-page: https://developers.google.com/protocol-buffers/
Author: 
Author-email: 
License: BSD-3-Clause
Location: c:\Users\ong_z\AppData\Local\Programs\Python\Python311\Lib\site-packages
Requires: 
Required-by: google-api-core, googleapis-common-protos, grpcio-health-checking, grpcio-status, grpcio-tools, mediapipe, mlflow-skinny, onnxruntime, opentelemetry-proto, proto-plus, pyabsa, ray, streamlit, tb-nightly, tensorboard, tensorboardX, tensorflow-datasets, tensorflow-intel, tensorflow-metadata, tf_nightly_intel
Note: you may need to restart the kernel to use updated packages.




In [14]:
import pandas as pd
import json
import re
import ast
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from weaviate import Client
from weaviate.classes.init import Auth
from weaviate.classes.config import DataType, Property, ReferenceProperty, VectorDistances
from weaviate.classes.query import QueryReference
import weaviate.classes as wvc
from weaviate.classes.config import Configure
from weaviate.collections.classes.filters import Filter

from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [15]:
property_df_flat = pd.read_csv('property_df_flat.csv')
review_chunks = pd.read_csv('review_chunks_with_sentiment_embeddings.csv')
hotel_topic_summary = pd.read_csv('topic_sentiment_hotel_summary.csv')

In [16]:
import weaviate

client = weaviate.connect_to_custom(
    http_host="34.87.173.122",
    http_port=8080,
    http_secure=False,
    grpc_host="34.87.173.122",
    grpc_port=50051,
    grpc_secure=False,
    skip_init_checks=True
)

print("✅ Client ready:", client.collections.list_all())


✅ Client ready: {'HotelTopicSummary': _CollectionConfigSimple(name='HotelTopicSummary', description=None, generative_config=None, properties=[_Property(name='hotel_name', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer='none', vectorizer_configs=None), _Property(name='main_topic_category', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer='none', vectorizer_configs=None), _Property(name='sub_topic_category', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer='none

### 2. Create Schema

In [None]:
# Clean up old collections (optional)
# if client.collections.exists("Hotel"):
#     client.collections.delete("Hotel")

# if client.collections.exists("ReviewChunk"):
#     client.collections.delete("ReviewChunk")

# if client.collections.exists("HotelTopicSummary"):
#     client.collections.delete("HotelTopicSummary")

In [32]:
# Create collections
# ✅ Hotel Collection
client.collections.create(
    name="Hotel",
    properties=[
        Property(name="hotel_name", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="hotel_url", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="original_price", data_type=DataType.NUMBER, skip_vectorization=True),
        Property(name="current_price", data_type=DataType.NUMBER, skip_vectorization=True),
        Property(name="num_reviews", data_type=DataType.INT, skip_vectorization=True),
        Property(name="review_score", data_type=DataType.NUMBER, skip_vectorization=True),
        Property(name="review_label", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="star_rating", data_type=DataType.NUMBER, skip_vectorization=True),
        Property(name="preferred_partner", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="sustainability_certified", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="last_updated", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="hotel_id", data_type=DataType.NUMBER, skip_vectorization=True),
        Property(name="best_review_score_label", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="best_review_score_rating", data_type=DataType.NUMBER, skip_vectorization=True),
        Property(name="address", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="latitude", data_type=DataType.NUMBER, skip_vectorization=True),
        Property(name="longitude", data_type=DataType.NUMBER, skip_vectorization=True),
        Property(name="description", data_type=DataType.TEXT),
        Property(name="check_in_time", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="check_out_time", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="children_policies", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="age_restriction", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="smoking_policy", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="pets_policy", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="facilities_text", data_type=DataType.TEXT),
        Property(name="review_scores_text", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="cot_extra_bed_policies_text", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="payment_methods_text", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="room_details_text", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="surroundings_text", data_type=DataType.TEXT),
    ],
    vector_index_config=Configure.VectorIndex.hnsw(distance_metric=VectorDistances.COSINE),
    inverted_index_config=Configure.inverted_index(index_property_length=True),
)

# ✅ ReviewChunk Collection
client.collections.create(
    name="ReviewChunk",
    properties=[
        Property(name="review_id", data_type=DataType.NUMBER, skip_vectorization=True),
        Property(name="chunk_id", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="hotel_name", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="review_score", data_type=DataType.NUMBER, skip_vectorization=True),
        Property(name="sentiment", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="chunk_text", data_type=DataType.TEXT),  # ✅ vectorized
        Property(name="reviewer_name", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="reviewer_country", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="review_room_name", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="review_num_nights", data_type=DataType.INT, skip_vectorization=True),
        Property(name="review_stay_date", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="review_traveler_type", data_type=DataType.TEXT, skip_vectorization=True)
    ],
    references=[
        ReferenceProperty(name="hotel", target_collection="Hotel")
    ],
    vector_index_config=Configure.VectorIndex.hnsw(distance_metric=VectorDistances.COSINE),
    inverted_index_config=Configure.inverted_index(index_property_length=True),
)


<weaviate.collections.collection.sync.Collection at 0x211b8e3a080>

In [19]:
hotel_topic_summary.head()

Unnamed: 0,hotel_name,main_topic_category,sub_topic_category,negative_count,positive_count,total_count,positive_sentiment_ratio,bayesian_score
0,30 Bencoolen,amenities,accessibility,1,3,4,0.75,0.495523
1,30 Bencoolen,amenities,in_room,7,0,7,0.0,0.140977
2,30 Bencoolen,amenities,leisure,35,46,81,0.567901,0.628465
3,30 Bencoolen,amenities,tech,6,3,9,0.333333,0.143926
4,30 Bencoolen,bathroom,bathroom_cleanliness,9,1,10,0.1,0.083781


In [20]:
# ✅ HotelTopicSummary Collection
client.collections.create(
    name="HotelTopicSummary",
    properties=[
        Property(name="hotel_name", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="main_topic_category", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="sub_topic_category", data_type=DataType.TEXT, skip_vectorization=True),
        Property(name="positive_count", data_type=DataType.INT, skip_vectorization=True),
        Property(name="negative_count", data_type=DataType.INT, skip_vectorization=True),
        Property(name="total_count", data_type=DataType.INT, skip_vectorization=True),
        Property(name="positive_sentiment_ratio", data_type=DataType.NUMBER, skip_vectorization=True),
        Property(name="bayesian_score", data_type=DataType.NUMBER, skip_vectorization=True),
    ],
    references=[
        ReferenceProperty(name="hotel", target_collection="Hotel")
    ],
    vector_index_config=Configure.VectorIndex.hnsw(distance_metric=VectorDistances.COSINE),
    inverted_index_config=Configure.inverted_index(index_property_length=True),
)

<weaviate.collections.collection.sync.Collection at 0x232960d0f70>

### 3. Upload Data

In [33]:
# Upload Hotel Objects
hotel_collection = client.collections.get("Hotel")
hotel_name_to_uuid = {}

for _, row in tqdm(property_df_flat.iterrows(), total=len(property_df_flat)):
    try:
        hotel_obj = row.dropna().to_dict()
        # Insert the hotel and store the returned UUID
        uuid = hotel_collection.data.insert(hotel_obj)
        hotel_name_to_uuid[row["hotel_name"]] = uuid
    except Exception as e:
        print(f"❌ Failed to upload hotel: {row.get('hotel_name', '[unknown]')}, error: {e}")


  0%|          | 0/444 [00:00<?, ?it/s]

100%|██████████| 444/444 [00:25<00:00, 17.12it/s]


In [34]:
# Save the mapping of hotel names to UUIDs

hotel_name_to_uuid_str = {k: str(v) for k, v in hotel_name_to_uuid.items()}

with open("hotel_name_to_uuid.json", "w") as f:
    json.dump(hotel_name_to_uuid_str, f)


In [None]:
# Prepare ReviewChunk Data
review_chunks = pd.read_csv('review_chunks_with_sentiment_embeddings.csv')

def parse_numpy_string(s):
    return np.fromstring(s.strip("[]"), sep=" ").tolist()

review_chunks["embedding"] = review_chunks["embedding"].apply(parse_numpy_string)

2933

In [36]:
# Upload ReviewChunk Objects

review_collection = client.collections.get("ReviewChunk")

for _, row in tqdm(review_chunks.iterrows(), total=len(review_chunks)):
    try:
        hotel_name = row["hotel_name"]
        hotel_uuid = hotel_name_to_uuid.get(hotel_name)

        if not hotel_uuid:
            print(f"⚠️ Skipping chunk {row['chunk_id']} — hotel not found: {hotel_name}")
            continue

        embedding = row["embedding"]  

        review_obj = row.drop(labels=["embedding"]).dropna().to_dict()

        review_uuid = review_collection.data.insert(
            properties=review_obj,
            vector=embedding
        )

        review_collection.data.reference_add(
            from_uuid=review_uuid,
            from_property="hotel",
            to=hotel_uuid
        )

    except Exception as e:
        print(f"❌ Failed to upload review chunk {row.get('chunk_id', '[unknown]')} — {e}")


 30%|██▉       | 87666/293302 [1:21:21<951:55:34, 16.67s/it]

❌ Failed to upload review chunk 55026_Negative_0 — Server disconnected without sending a response.


 30%|██▉       | 87668/293302 [1:21:56<909:48:20, 15.93s/it] 

❌ Failed to upload review chunk 55027_Positive_0 — Server disconnected without sending a response.


 58%|█████▊    | 169941/293302 [2:36:38<544:08:47, 15.88s/it]

❌ Failed to upload review chunk 106960_Negative_0 — 


 86%|████████▌ | 252241/293302 [3:56:40<192:45:10, 16.90s/it]

❌ Failed to upload review chunk 159376_Negative_0 — 


 86%|████████▌ | 252242/293302 [3:57:17<240:00:24, 21.04s/it]

❌ Failed to upload review chunk 159377_Positive_0 — 


100%|██████████| 293302/293302 [4:35:59<00:00, 17.71it/s]    


In [21]:
# Upload HotelTopicSummary Objects
topic_summary_collection = client.collections.get("HotelTopicSummary")

# Open and read the file
with open("hotel_name_to_uuid.json", "r") as f:
    hotel_name_to_uuid = json.load(f)

# Loop over each row in your pivoted DataFrame
for _, row in tqdm(hotel_topic_summary.iterrows(), total=len(hotel_topic_summary)):
    try:
        hotel_name = row["hotel_name"]
        hotel_uuid = hotel_name_to_uuid.get(hotel_name)

        if not hotel_uuid:
            print(f"⚠️ Skipping topic summary — hotel not found: {hotel_name}")
            continue

        topic_obj = row.to_dict()

        # Insert into Weaviate
        topic_uuid = topic_summary_collection.data.insert(
            properties=topic_obj)

        # Add reference to the parent Hotel object
        topic_summary_collection.data.reference_add(
            from_uuid=topic_uuid,
            from_property="hotel",
            to=hotel_uuid
        )

    except Exception as e:
        print(f"❌ Failed to upload topic summary for hotel '{hotel_name}' — {e}")


100%|██████████| 8362/8362 [05:22<00:00, 25.94it/s]


### 4. Test Queries

In [41]:
# Test example 1 - Use chunk_id (Review collection) to retrieve details of Hotel collection

review_collection = client.collections.get("ReviewChunk")

results = review_collection.query.fetch_objects(
    filters=Filter.by_property("chunk_id").equal("177671_Negative_0"),
    return_references=[
        wvc.query.QueryReference(
            link_on="Hotel",
            return_properties=["hotel_name", "current_price", "star_rating", "description"]
        )
    ],
    include_vector=False,
    limit=1
)

hotel_ref = results.objects[0].references.get("hotel")
hotel_ref.objects

[Object(uuid=_WeaviateUUIDInt('3863b424-4c96-43d8-a182-9bc433aec2d9'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'hotel_name': 'Rest Chinatown Hotel', 'description': "Rest Chinatown Hotel is located a 5-minute walk from Chinatown MRT Station. Its modern rooms offer free WiFi, large projector screen and safety deposit boxes.\n\nLocated near to Temple and Smith Street, Rest Chinatown Hotel sits along Trengganu Street, a 5-minute drive from nightlife options in Clarke Quay. A train ride from Orchard Road shopping belt is a 10-minute journey away.\n\nThe modern rooms at Rest Chinatown Hotel come with in-room tea/coffee making facilities. En suite bathrooms offer toiletries and hot shower facilities.\n\nIn the hotel's shared area there is a coffee machine.", 'current_price': 166.0, 'star_rating': 3.0}, references=None, vector={}, collection='Hotel')]

In [42]:
# Test example 2 - Retrieve info from only Hotel collection
hotel_collection = client.collections.get("Hotel")

# Example: fetch all hotels with a specific name
filter_ = Filter.by_property("hotel_name").equal("Rest Chinatown Hotel")

results = hotel_collection.query.fetch_objects(filters=filter_, limit=1)
results.objects

[Object(uuid=_WeaviateUUIDInt('3863b424-4c96-43d8-a182-9bc433aec2d9'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'review_scores_text': 'Facilities: 7.6. Cleanliness: 8.0. Comfort: 8.0. Value for money: 7.6. Location: 9.2. Total: 7.6. Free WiFi: 7.8', 'sustainability_certified': 'No', 'check_out_time': 'Until 11:00', 'longitude': 103.84396434, 'last_updated': '18/3/2025 0:56', 'pets_policy': 'Pets are not allowed.', 'best_review_score_rating': 9.2, 'hotel_url': 'https://www.booking.com/hotel/sg/jinshan-hotel.en-gb.html?aid=304142&label=gen173nr-1FCAQoggJCEHNlYXJjaF9zaW5nYXBvcmVICVgEaMkBiAEBmAEJuAEXyAEM2AEB6AEB-AEDiAIBqAIDuAKRp-G-BsACAdICJDc3OGZmNDdhLWUzYmItNDljNi04ZjQ1LTYxZTRkMzA4OWE5Y9gCBeACAQ&ucfs=1&arphpl=1&checkin=2025-04-01&checkout=2025-04-02&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=14&hapos=86&sr_order=popul

In [48]:
# Test example 3 - Retrieve info from only Review collection
review_collection = client.collections.get("ReviewChunk")

# Example: fetch reviews with score lower or equal to 8
filter_ = Filter.by_property("review_score").equal(3)

results = review_collection.query.fetch_objects(filters=filter_, limit=50)

for obj in results.objects:
    print(f"Chunk ID: {obj.properties.get('chunk_id')}")
    print(f"Score: {obj.properties.get('review_score')}")
    print(f"Text: {obj.properties.get('chunk_text')}")
    print("-" * 50)


Chunk ID: 424_Positive_0
Score: 3.0
Text: Location is the only positive The location is great. Close to MRT, lots of food options and walking distance to many shopping centres
--------------------------------------------------
Chunk ID: 424_Negative_0
Score: 3.0
Text: It was very expensive for a very uncomfortable room. The beds were terrible, they were rock hard which made it impossible to have a good night sleep. The walls were very thin, we could hear everything, at times it was like people were in our room, it was that loud. The bathroom is a wet room, which means when you shower water gets everywhere, plus this meant the bathroom was not clean, mould on the ceiling could be seen. The entire room felt damp the whole stay.
--------------------------------------------------
Chunk ID: 439_Negative_0
Score: 3.0
Text: Uncomfortable and claustrophobic. The first room had no window 
Water was also on all surfaces including the ceiling and walls and a lot on the floorcondensation?? Changed

In [13]:
# Test example 4 - Retrieve info from only topic_summary_collection
topic_summary_collection = client.collections.get("HotelTopicSummary")

# Example: fetch reviews with score lower or equal to 8
filter_ = Filter.by_property("sub_topic_category").equal("accessibility")

results = topic_summary_collection.query.fetch_objects(filters=filter_, limit=50)

for obj in results.objects:
    print(f"Hotel Name: {obj.properties.get('hotel_name')}")
    print(f"Positive Sentiment Ratio: {obj.properties.get('positive_sentiment_ratio')}")
    print("-" * 50)


Hotel Name: 30 Bencoolen
Positive Sentiment Ratio: 0.75
--------------------------------------------------
Hotel Name: 7 Wonders Boutique Capsule
Positive Sentiment Ratio: 0.75
--------------------------------------------------
Hotel Name: 7 Wonders Hostel @ Boat Quay
Positive Sentiment Ratio: 0.5714285714285714
--------------------------------------------------
Hotel Name: 7 Wonders Hostel at Upper Dickson
Positive Sentiment Ratio: 0.375
--------------------------------------------------
Hotel Name: A Hotel Joo Chiat
Positive Sentiment Ratio: 1.0
--------------------------------------------------
Hotel Name: AMOY by Far East Hospitality
Positive Sentiment Ratio: 0.0
--------------------------------------------------
Hotel Name: Aerotel Singapore - Transit Hotel in Terminal 1
Positive Sentiment Ratio: 0.7777777777777778
--------------------------------------------------
Hotel Name: Ambassador Transit Hotel - Terminal 3
Positive Sentiment Ratio: 1.0
-------------------------------------

### 5. Hybrid RAG Setup

In [7]:
from sentence_transformers import SentenceTransformer

# Load your model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Create vector for the query
query_text = "vegan"
query_vector = model.encode(query_text).tolist()

review_collection = client.collections.get("ReviewChunk")

# Perform hybrid search with BYO vector
results = review_collection.query.hybrid(
    vector=query_vector,
    query=query_text,        # still needed for BM25 part
    alpha=0.5,               # 0 = pure BM25, 1 = pure vector
    return_metadata=["score"],
    return_properties=["hotel_name", "chunk_id", "sentiment", "chunk_text", "review_score"],
    return_references=[
        wvc.query.QueryReference(
            link_on="Hotel",
            return_properties=["hotel_name", "current_price", "star_rating", "description"]
        )
    ],
    limit=10
)

results.objects

[Object(uuid=_WeaviateUUIDInt('11e27e86-3fd3-4045-9f7f-11413539acf1'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=0.8570672869682312, explain_score=None, is_consistent=None, rerank_score=None), properties={'hotel_name': 'Marina Bay Sands', 'chunk_id': '70172_Negative_0', 'review_score': 9.0, 'sentiment': 'Negative', 'chunk_text': 'very few vegetarian or vegan options'}, references={'hotel': <weaviate.collections.classes.internal._CrossReference object at 0x000002180B726770>}, vector={}, collection='ReviewChunk'),
 Object(uuid=_WeaviateUUIDInt('e70f499b-75dc-4390-b653-1d5138632a37'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=0.8546596169471741, explain_score=None, is_consistent=None, rerank_score=None), properties={'hotel_name': 'Hotel Faber Park Singapore - Handwritten Collection', 'sentiment': 'Negative', 'review_score': 10.0, 'chunk_id': '151521_Negative_0', 'chu

In [8]:
# To access the objects attribute of the reference

for obj in results.objects:
    print(f"Review chunk: {obj.properties.get('chunk_text')}")
    print(f"Sentiment: {obj.properties.get('sentiment')}")
    print(f"Score: {obj.properties.get('review_score')}")

    hotel_ref = obj.references.get("hotel")  
    
    if hotel_ref and hotel_ref.objects:
        hotel_name = hotel_ref.objects[0].properties.get("hotel_name")
        hotel_star_rating = hotel_ref.objects[0].properties.get("star_rating")

        print("Hotel name:", hotel_name)
        print("Hotel star rating:", hotel_star_rating)
    else:
        print("No hotel reference found.")
    
    print("-" * 50)



Review chunk: very few vegetarian or vegan options
Sentiment: Negative
Score: 9.0
Hotel name: Marina Bay Sands
Hotel star rating: 5.0
--------------------------------------------------
Review chunk: More vegan options in breakfast
Sentiment: Negative
Score: 10.0
Hotel name: Hotel Faber Park Singapore - Handwritten Collection
Hotel star rating: 4.0
--------------------------------------------------
Review chunk: Vegan and vegetarian food choices very little
Sentiment: Negative
Score: 10.0
Hotel name: Marina Bay Sands
Hotel star rating: 5.0
--------------------------------------------------
Review chunk: Very few vegan options in restaurant.
Sentiment: Negative
Score: 8.0
Hotel name: Pan Pacific Singapore
Hotel star rating: 5.0
--------------------------------------------------
Review chunk: Very few vegan options in restaurant.
Sentiment: Negative
Score: 8.0
Hotel name: Pan Pacific Singapore
Hotel star rating: 5.0
--------------------------------------------------
Review chunk: Not enou