In [None]:
from langchain_together import TogetherEmbeddings
import pandas as pd
import numpy as np

# Load the review table; its "Review" column is the text that gets embedded below.
df = pd.read_excel('reviews_data.xlsx')

import os
import getpass

# Never hardcode the key in the notebook: reuse TOGETHER_API_KEY when it is
# already set in the environment, otherwise prompt for it without echoing.
if not os.getenv("TOGETHER_API_KEY"):
    os.environ["TOGETHER_API_KEY"] = getpass.getpass("Enter your Together API key: ")

# Initialize the TogetherEmbeddings model
embeddings = TogetherEmbeddings(
    model="togethercomputer/m2-bert-80M-8k-retrieval"  # Example model; check docs for Llama-based alternatives
)

reviews = df["Review"].tolist()

# Number of reviews sent to the embedding endpoint per request.
EMBED_BATCH_SIZE = 128

# Embed the reviews batch by batch; embedding_list keeps the same order as
# `reviews`, so position k in embedding_list corresponds to row k of df.
embedding_list = []
for start in range(0, len(reviews), EMBED_BATCH_SIZE):
    batch = reviews[start : start + EMBED_BATCH_SIZE]
    batch_embeddings = embeddings.embed_documents(batch)
    embedding_list.extend(batch_embeddings)
    print(f"Processed {start + len(batch)} / {len(reviews)} reviews")




Processed 128 / 8382 reviews
Processed 256 / 8382 reviews
Processed 384 / 8382 reviews
Processed 512 / 8382 reviews
Processed 640 / 8382 reviews
Processed 768 / 8382 reviews
Processed 896 / 8382 reviews
Processed 1024 / 8382 reviews
Processed 1152 / 8382 reviews
Processed 1280 / 8382 reviews
Processed 1408 / 8382 reviews
Processed 1536 / 8382 reviews
Processed 1664 / 8382 reviews
Processed 1792 / 8382 reviews
Processed 1920 / 8382 reviews
Processed 2048 / 8382 reviews
Processed 2176 / 8382 reviews
Processed 2304 / 8382 reviews
Processed 2432 / 8382 reviews
Processed 2560 / 8382 reviews
Processed 2688 / 8382 reviews
Processed 2816 / 8382 reviews
Processed 2944 / 8382 reviews
Processed 3072 / 8382 reviews
Processed 3200 / 8382 reviews
Processed 3328 / 8382 reviews
Processed 3456 / 8382 reviews
Processed 3584 / 8382 reviews
Processed 3712 / 8382 reviews
Processed 3840 / 8382 reviews
Processed 3968 / 8382 reviews
Processed 4096 / 8382 reviews
Processed 4224 / 8382 reviews
Processed 4352 / 

In [2]:
# Build one metadata dict per review, in the same row order as `df`.
# Every field is cast to a built-in int so the values do not depend on how
# pandas boxes scalars (numpy integer vs. Python int) and all four fields are
# treated consistently. Iterating the columns directly also avoids the
# row-wise `df.apply(axis=1)` and yields an empty list for an empty frame.
metadata_list = [
    {
        "customer_id": int(customer_id),
        "review_date": int(review_date),
        "Rating": int(rating),
        "review_id": int(review_id),
    }
    for customer_id, review_date, rating, review_id in zip(
        df["customer_id"],
        df["review_date_numeric"],
        df["Rating"],
        df["review_id"],
    )
]


metadata_list[0]



{'customer_id': 5380, 'review_date': 20241104, 'Rating': 6, 'review_id': 11722}

In [None]:
import getpass
import os

from pinecone import Pinecone, ServerlessSpec

# Never hardcode the key in the notebook: reuse PINECONE_API_KEY when it is
# already set in the environment, otherwise prompt for it without echoing.
if not os.getenv("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass.getpass("Enter your Pinecone API key: ")

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

INDEX_NAME = 'hotel-reviews'

# Only create the index when it is not there yet, so re-running this cell
# (Restart & Run All) does not fail on an already-existing index.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=768,  # must equal the length of the vectors upserted later
        metric='cosine',
        deletion_protection='enabled',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# Last expression of the cell: show the index description (name, host, status).
pc.describe_index(INDEX_NAME)

{
    "name": "hotel-reviews",
    "metric": "cosine",
    "host": "hotel-reviews-nn96vhk.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 768,
    "deletion_protection": "enabled",
    "tags": null
}

In [None]:
# Look the host up from the control plane instead of pasting it in by hand.
index = pc.Index(host=pc.describe_index("hotel-reviews").host)

# Embeddings and metadata are paired by position, so a length mismatch would
# attach metadata to the wrong review (or silently skip rows). Fail early.
if len(embedding_list) != len(metadata_list):
    raise ValueError(
        f"embedding_list has {len(embedding_list)} items but "
        f"metadata_list has {len(metadata_list)}"
    )

# One (id, values, metadata) tuple per review; the id is the row position as a string.
vectors = [
    (str(i), embedding, metadata)
    for i, (embedding, metadata) in enumerate(zip(embedding_list, metadata_list))
]

batch_size = 100  # Adjust this number as needed.

# Upsert in slices of `vectors` rather than rebuilding the same tuples per batch.
for i in range(0, len(vectors), batch_size):
    batch_vectors = vectors[i : i + batch_size]
    index.upsert(vectors=batch_vectors)
    print(f"Upserted batch from {i} to {i + len(batch_vectors)}")


# Embed the question with the same model that produced the stored vectors.
query_embedding = embeddings.embed_query("What are some of the reviews that mention restaurant, food, lunch, breakfast, dinner")


# Top-5 nearest reviews, restricted by metadata: Rating <= 9 and
# review_date (YYYYMMDD stored as a number) between 20240101 and 20240108 inclusive.
results = index.query(
    vector=query_embedding,
    top_k=5,
    namespace="",
    include_metadata=True,
    filter={
        "Rating": {"$lte": 9},
        "review_date": {"$gte": 20240101, "$lte": 20240108}
    }
)


matches = results["matches"]

Upserted batch from 0 to 100
Upserted batch from 100 to 200
Upserted batch from 200 to 300
Upserted batch from 300 to 400
Upserted batch from 400 to 500
Upserted batch from 500 to 600
Upserted batch from 600 to 700
Upserted batch from 700 to 800
Upserted batch from 800 to 900
Upserted batch from 900 to 1000
Upserted batch from 1000 to 1100
Upserted batch from 1100 to 1200
Upserted batch from 1200 to 1300
Upserted batch from 1300 to 1400
Upserted batch from 1400 to 1500
Upserted batch from 1500 to 1600
Upserted batch from 1600 to 1700
Upserted batch from 1700 to 1800
Upserted batch from 1800 to 1900
Upserted batch from 1900 to 2000
Upserted batch from 2000 to 2100
Upserted batch from 2100 to 2200
Upserted batch from 2200 to 2300
Upserted batch from 2300 to 2400
Upserted batch from 2400 to 2500
Upserted batch from 2500 to 2600
Upserted batch from 2600 to 2700
Upserted batch from 2700 to 2800
Upserted batch from 2800 to 2900
Upserted batch from 2900 to 3000
Upserted batch from 3000 to 310

In [5]:
# Pull the review_id out of each match's metadata, coercing it to int.
matched_ids = []
for match in matches:
    matched_ids.append(int(match["metadata"]["review_id"]))


# Keep only the rows of df whose review_id is among the matched ids.
req_df = df.loc[df["review_id"].isin(matched_ids)]

# Not the last expression of the cell, so this is evaluated but not displayed.
req_df['Review']

# Join the matched review texts into one space-separated string.
review_texts = req_df["Review"].tolist()
concatenated_reviews = " ".join(review_texts)

concatenated_reviews

' Shower taps need descaling horrid smell from drains the new booth seating is so unfriendly the older decor made more sense New menu card room dining not advisable limited choice of average food Tea cakes were cold from fridge so cream in the cakes was There were no more small tasty affordable snacks like the crab cakes of tandoori salmon or tatty etc Pity You lost 15 customers who went to eat elsewhere daily for 3 days   location  The people working on the terrace bar are pretty slow preparing beverages so you need to be patient awaiting your order especially when you can see quite a lot of people up there We were forced to cancel our order after 40 minutes waiting as we had another appointment planned after   The location of the hotel is just perfect 150m to Diagonal Metro station Rambla is just 300 m away So for our family tour with some sightseeing and enjoying good restaurants this was a perfect choice The hotel is very comfortable and quiet You can have rest on the terrace bar w

In [6]:
from together import Together
import os
# Client is built with no arguments; presumably it picks up TOGETHER_API_KEY
# from the environment (confirm against the `together` SDK docs).
client = Together()

# Single user message asking for a short sentiment summary of the matched reviews.
summary_prompt = f"""Briefly Summarize the overall sentiment of customers about food and restaurant based on these reviews - {concatenated_reviews}. Dont mention the name of the hotel"""
user_message = {"role": "user", "content": summary_prompt}

response = client.chat.completions.create(
    model="meta-llama/Llama-Vision-Free",
    messages=[user_message],
)

# Print the text of the first returned choice.
print(response.choices[0].message.content)

Here's a summary of the overall sentiment of customers about food and restaurants based on the reviews:

**Negative Sentiment:**

* Food quality was average, with some dishes (like tea cakes) being cold or mediocre.
* Limited menu options and snacks.
* High prices, especially for breakfast and certain items like mini-bar items.
* Long wait times for orders, especially at the Chico Latino bar.
* Unfriendly and slow service at some restaurants (e.g. the terrace bar).
* Some customers felt ripped off, like the hotel charging them for tourist tax in advance.

**Positive Sentiment:**

* Great location, with many restaurants and attractions nearby.
* Excellent room facilities and amenities.
* Friendly and helpful staff, including concierge staff.
* Good atmosphere and ambiance in some restaurants, like the Chico Latino bar.
* One restaurant (STK) was mentioned as being better suited for lunch rather than dinner.

Overall, the sentiment is more negative towards the food and restaurants, with 