In [1]:
import os
import time
from typing import List, Literal, Optional, Tuple

import nltk
import pandas as pd
from dotenv import load_dotenv
from nltk.tokenize import sent_tokenize
from openai import OpenAI
from pydantic import BaseModel, Field

In [2]:
# Load variables from a local .env file (python-dotenv) so that the API key
# looked up with os.getenv further down can be found without hardcoding it.
load_dotenv()

True

In [3]:
# The cleaned TripAdvisor export lives in <parent of the working directory>/data/cleaned.
parent_dir = os.path.dirname(os.getcwd())
data_path = os.path.join(
    parent_dir,
    'data',
    'cleaned',
    'cleaned_dataset_tripadvisor-reviews_2025-11-01_14-21-09-431.json',
)

# The file is a JSON array of review records.
df = pd.read_json(data_path, orient='records')

# Column overview first, then a peek at the first rows.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19493 entries, 0 to 19492
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             19493 non-null  int64 
 1   lang           19493 non-null  object
 2   rating         19493 non-null  int64 
 3   travelDate     19466 non-null  object
 4   publishedDate  19493 non-null  object
 5   tripType       17379 non-null  object
 6   userLocation   11260 non-null  object
 7   review_text    19493 non-null  object
dtypes: int64(2), object(6)
memory usage: 1.2+ MB


Unnamed: 0,id,lang,rating,travelDate,publishedDate,tripType,userLocation,review_text
0,1016346537,el,1,2025-07-01,2025-07-03,BUSINESS,,Απαράδεκτο grecotel.. Απαράδεκτο! Έφτασα μετά ...
1,1015574543,en,5,2025-06-01,2025-06-28,FAMILY,,Family vacation. I was afraid of what the acco...
2,987362026,el,3,2024-12-01,2025-01-02,COUPLES,,Αδιαφορο. Το ξενοδοχείο χρήζει ανακαίνισης. Τα...
3,979445372,tr,1,2024-11-01,2024-11-10,FAMILY,,Kahvaltı kuyruğu ve personelin kabalığı. Çocuk...
4,960799391,en,5,2023-08-01,2024-07-23,COUPLES,,Place to stay in Alexandropoli. Wonderful stay...


In [4]:
# Keep only English reviews and the three columns needed for labelling,
# then renumber the rows from zero.
is_english = df['lang'] == 'en'
df = df.loc[is_english, ['id', 'lang', 'review_text']].reset_index(drop=True)

In [5]:
# (rows, columns) remaining after restricting the data to English reviews.
df.shape

(11989, 3)

In [6]:
# Shared OpenAI client; the key comes from the environment rather than the notebook.
client = OpenAI(
    api_key=os.environ.get("grecotel_data_labelling_key"),
)


In [7]:
# Make sure the NLTK sentence-splitting resources are installed locally.
# Older NLTK releases load 'punkt', while newer releases load 'punkt_tab'
# inside sent_tokenize; checking both keeps a fresh environment from failing
# with LookupError on the first tokenisation call.
for resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{resource}")
    except LookupError:
        print(f"Downloading NLTK '{resource}' tokenizer...")
        nltk.download(resource)

In [8]:
# Closed label vocabularies: the aspect categories and sentiment polarities
# that are interpolated into the LLM system prompt.
VALID_ASPECTS = [
    "ROOM",
    "FOOD",
    "SERVICE",
    "FACILITIES",
    "LOCATION",
    "VALUE_FOR_MONEY",
    "CLEANLINESS",
    "COMFORT",
]
VALID_SENTIMENTS = [
    "positive",
    "negative",
]

# --- Pydantic models used to validate the LLM's structured output ---

# Schema for one (aspect, sentiment) label attached to a sentence.
# The Literal values below repeat VALID_ASPECTS / VALID_SENTIMENTS by hand,
# so the lists and this model must be kept in sync manually.
# NOTE(review): pydantic places the class docstring in the generated JSON
# schema as its description, so it presumably reaches the model through
# response_format — treat the docstring wording as part of the prompt.
class AspectSentimentPair(BaseModel):
    """Represents a single aspect and its sentiment found in a sentence."""
    # Aspect category; restricted to the eight labels listed here.
    aspect: Literal[
        "ROOM", "FOOD", "SERVICE", "FACILITIES", 
        "LOCATION", "VALUE_FOR_MONEY", "CLEANLINESS", "COMFORT"
    ]
    # Polarity of the opinion; there is no neutral option.
    sentiment: Literal["positive", "negative"]

# Top-level response schema: this is the model passed as response_format to
# the chat completion call, wrapping zero or more AspectSentimentPair items.
# NOTE(review): the docstring and the Field description are presumably
# exported into the JSON schema sent to the API — edit them deliberately.
class SentenceAnalysis(BaseModel):
    """Represents the complete analysis of a single sentence."""
    # Required field (no default); an empty list means no relevant aspect.
    results: List[AspectSentimentPair] = Field(
        description="A list of aspect-sentiment pairs identified in the sentence. Empty if no relevant aspects are found."
    )

In [9]:

# Define LLM interaction function
def analyze_sentence_with_llm(sentence: str,
                              model: str = "gpt-4o-mini",
                              delay_seconds: float = 3.0) -> List[dict]:
    """
    Extract (aspect, sentiment) pairs from one review sentence with an OpenAI chat model.

    The sentence is sent together with a few-shot system prompt, and the reply
    is parsed by the client into a `SentenceAnalysis` object.

    Args:
        sentence: A single sentence taken from a hotel review.
        model: Name of the chat model to call (defaults to "gpt-4o-mini").
        delay_seconds: Pause before each request, used as a crude throttle.
            A value <= 0 disables the pause.

    Returns:
        A list of {"aspect": ..., "sentiment": ...} dictionaries. The list is
        empty when the sentence is blank, when the model reports no relevant
        aspect, or when the reply carries no parsed payload (e.g. a refusal).

    Raises:
        Errors raised by the OpenAI client (network, authentication, API
        failures) are not caught here and propagate to the caller.
    """
    # A blank sentence cannot carry an aspect: skip both the pause and the paid request.
    if not sentence or not sentence.strip():
        return []

    if delay_seconds > 0:
        time.sleep(delay_seconds)

    # Doubled braces ({{ }}) are literal braces in the few-shot JSON examples;
    # single braces interpolate the allowed label lists.
    system_prompt = f"""

    You are an expert in Aspect-Based Sentiment Analysis (ABSA) for hotel reviews.
    Your task is to analyze a given sentence from a hotel review and identify:
    1. The Aspects mentioned (explicitly or implicitly).
    2. The Sentiment associated with each aspect.

    Allowed Aspects: {", ".join(VALID_ASPECTS)}
    Allowed Sentiments: {", ".join(VALID_SENTIMENTS)}

    Rules:
    - If a sentence mentions multiple aspects, extract all of them.
    - If a sentence has no relevant aspects from the list, return an empty list.
    - Be precise. Implicit aspects are CRITICAL. You must infer the aspect from context.

    Examples:

    Input: 'The staff was incredibly friendly and helpful.'
    Output: {{ "results": [ {{ "aspect": "SERVICE", "sentiment": "positive" }} ] }}

    Input: 'The walls were paper thin and I could hear everything next door.'
    Output: {{ "results": [ {{ "aspect": "COMFORT", "sentiment": "negative" }}, {{ "aspect": "ROOM", "sentiment": "negative" }} ] }}

    Input: 'Great location near the beach, but the breakfast was cold and tasteless.'
    Output: {{ "results": [ {{ "aspect": "LOCATION", "sentiment": "positive" }}, {{ "aspect": "FOOD", "sentiment": "negative" }} ] }}

    Input: 'It was way too expensive for what we got.'
    Output: {{ "results": [ {{ "aspect": "VALUE_FOR_MONEY", "sentiment": "negative" }} ] }}

    Input: 'There was mold in the shower.'
    Output: {{ "results": [ {{ "aspect": "CLEANLINESS", "sentiment": "negative" }}, {{ "aspect": "ROOM", "sentiment": "negative" }} ] }}
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Analyze: '{sentence}'"}
    ]

    completion = client.chat.completions.parse(
        model=model,
        messages=messages,
        response_format=SentenceAnalysis
    )

    message = completion.choices[0].message
    analysis = message.parsed
    if analysis is None:
        # No structured payload came back (for example the model refused).
        # Report it and carry on so one sentence cannot abort a long labelling run.
        refusal = getattr(message, "refusal", None)
        print(f"Warning: no parsed analysis for sentence {sentence!r} (refusal: {refusal!r})")
        return []

    # Convert Pydantic objects to simple dictionaries for the DataFrame
    return [item.model_dump() for item in analysis.results]

In [10]:
def preprocess_reviews_dataframe(df: pd.DataFrame, 
                                 text_column: str = "review_text", 
                                 id_column: str = "id",
                                 sample_size: int = 5,
                                 random_state: Optional[int] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Sample reviews, split them into sentences and label every sentence with the LLM.

    Each sampled review is tokenised with `sent_tokenize`, and
    `analyze_sentence_with_llm` is called once per sentence, so run time and
    API cost grow with the number of sentences.

    Args:
        df: Reviews, one row per review.
        text_column: Column holding the review text.
        id_column: Column holding the review identifier.
        sample_size: Number of reviews to draw at random. Capped at the number
            of rows in `df`.
        random_state: Optional seed forwarded to `DataFrame.sample` so the same
            reviews can be drawn again. `None` keeps the draw unseeded.

    Returns:
        A tuple `(labeled_df, sampled_df)`:
        - `labeled_df` is in long format with the columns `review_id`,
          `sentence`, `aspect` and `sentiment`, one row per detected pair.
          The columns are present even when nothing was labelled.
        - `sampled_df` is the subset of `df` that was processed.
    """
    # DataFrame.sample raises when asked for more rows than exist (without replacement).
    n_reviews = min(sample_size, len(df))
    sampled_df = df.sample(n=n_reviews, random_state=random_state)
    labeled_records = []
    total_reviews = len(sampled_df)

    print(f"Starting processing of {total_reviews} reviews...")

    for review_id, review_text in zip(sampled_df[id_column], sampled_df[text_column]):
        # Missing or blank text has nothing to tokenise; skip it instead of crashing.
        if not isinstance(review_text, str) or not review_text.strip():
            continue

        for sentence in sent_tokenize(review_text):
            # One output record per (aspect, sentiment) pair; none if the list is empty.
            labeled_records.extend(
                {
                    "review_id": review_id,
                    "sentence": sentence,
                    "aspect": item['aspect'],
                    "sentiment": item['sentiment']
                }
                for item in analyze_sentence_with_llm(sentence)
            )

    # Fix the column set so an empty result still has the expected schema.
    labeled_df = pd.DataFrame(
        labeled_records,
        columns=["review_id", "sentence", "aspect", "sentiment"],
    )
    return labeled_df, sampled_df

In [11]:
# Trial run on the default small sample, to gauge cost and label quality
# before committing to a larger batch.
final_labeled_df, sampled_df = preprocess_reviews_dataframe(df)

# Show which reviews were drawn ...
print("\n--- Sampled Input Data (Review Texts) ---")
print(sampled_df.loc[:, ['id', 'review_text']])

# ... and the labels produced for them.
print("\n--- Labeled Data Preview (Results) ---")
print(final_labeled_df)

Starting processing of 5 reviews...

--- Sampled Input Data (Review Texts) ---
              id                                        review_text
8565   258803788  Excellent hotel!!. The staff were very welcomi...
8970   596911841  nice but expensive. What it make me feel unhap...
8245   816481271  First review was removed. So I wrote my first ...
5655   686140126  The best Hotel in Corfu: a feast for the eye!....
11303  211026160  Breathtaking views. All inclusive is a must or...

--- Labeled Data Preview (Results) ---
    review_id                                           sentence  \
0   258803788  The staff were very welcoming, the room was gr...   
1   258803788  The staff were very welcoming, the room was gr...   
2   258803788                          The place was immaculate.   
3   258803788  My favorite part was the breakfast, one of the...   
4   258803788         Free high-speed wifi throughout the hotel.   
5   258803788  Perfect location in the heart of Athens, close... 

In [12]:
# Write both artefacts as record-oriented JSON files in the working directory.
output_sample_reviews_filename = "sampled_reviews.json"
output_filename = "llm_labeled_dataset.json"

# The sampled input reviews.
sampled_df.to_json(output_sample_reviews_filename, orient='records', indent=2)
print(f"\nSaved sampled reviews to {output_sample_reviews_filename}")

# The LLM-produced labels.
final_labeled_df.to_json(output_filename, orient='records', indent=2)
print(f"\nSaved labeled data to {output_filename}")



Saved sampled reviews to sampled_reviews.json

Saved labeled data to llm_labeled_dataset.json
