In [None]:
import os 
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize

In [17]:
parent_dir = os.path.dirname(os.getcwd())
json_path = os.path.join(parent_dir, 'data','cleaned','cleaned_dataset_tripadvisor-reviews_2025-11-01_14-21-09-431.json')

df = pd.read_json(json_path, orient='records')

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19493 entries, 0 to 19492
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             19493 non-null  int64 
 1   lang           19493 non-null  object
 2   rating         19493 non-null  int64 
 3   travelDate     19466 non-null  object
 4   publishedDate  19493 non-null  object
 5   tripType       17379 non-null  object
 6   userLocation   11260 non-null  object
 7   review_text    19493 non-null  object
dtypes: int64(2), object(6)
memory usage: 1.2+ MB


Unnamed: 0,id,lang,rating,travelDate,publishedDate,tripType,userLocation,review_text
0,1016346537,el,1,2025-07-01,2025-07-03,BUSINESS,,Απαράδεκτο grecotel.. Απαράδεκτο! Έφτασα μετά ...
1,1015574543,en,5,2025-06-01,2025-06-28,FAMILY,,Family vacation. I was afraid of what the acco...
2,987362026,el,3,2024-12-01,2025-01-02,COUPLES,,Αδιαφορο. Το ξενοδοχείο χρήζει ανακαίνισης. Τα...
3,979445372,tr,1,2024-11-01,2024-11-10,FAMILY,,Kahvaltı kuyruğu ve personelin kabalığı. Çocuk...
4,960799391,en,5,2023-08-01,2024-07-23,COUPLES,,Place to stay in Alexandropoli. Wonderful stay...


In [18]:
# Convert data types
df['travelDate'] = pd.to_datetime(df['travelDate'], errors='coerce')
df['publishedDate'] = pd.to_datetime(df['publishedDate'], errors='coerce')

In [19]:
# Keep only relevant data
df_filtered = df[df['travelDate'] >= '2023-06-01'].copy()

# Keep only english reviews for now
df_filtered = df_filtered[df_filtered['lang'] == 'en']

# Keep only relevant columns
df_filtered = df_filtered[['id', 'review_text']]
df_filtered.head()

Unnamed: 0,id,review_text
1,1015574543,Family vacation. I was afraid of what the acco...
4,960799391,Place to stay in Alexandropoli. Wonderful stay...
5,960211568,The worst room i’ve ever stayed in Greece.. Th...
6,952565999,"Large, modern hotel on the coast, with reasona..."
8,917123208,Excellent located. Nice beach and excellent lo...


In [20]:
# Define Aspect Keywords
ASPECT_KEYWORDS = {
    "ROOM": ["room", "suite", "ac", "air conditioning", "view", "balcony", "cleaned daily", "comfortable"],
    "STAFF": ["staff", "reception", "waiter", "host" ,"hostess", "housekeeping", "concierge", "manager", "friendly", "helpful", "rude"],
    "FOOD": ["food", "restaurant", "breakfast", "buffet" ,"dinner", "bar", "ice cream", "drink", "variety", "choice", "cold"],
    "SERVICE": ["service", "room service", "customer service"],
    "FACILITIES": ["pool", "beach", "water park", "sunbed", "sun bed", "location", "view", "sea view"]
}

In [21]:
vader = SentimentIntensityAnalyzer()

def get_vader_sentiment(sentence):
    """
    Returns 'positive', 'negative', or 'neutral'
    based on the VADER compound score.
    """
    compound_score = vader.polarity_scores(sentence)['compound']
    if compound_score >= 0.05:
        return 'positive'
    elif compound_score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Main Labeling Function 
def create_absa_dataset(df: pd.DataFrame, aspect_map: dict):
    """
    Iterates through reviews and sentences to build
    a programmatically labeled ABSA dataset.
    """
    labeled_data = []
    
    for index, row in df.iterrows():
        review_id = row['id']
        full_review_text = row['review_text']
        
        # Skip null reviews
        if pd.isna(full_review_text):
            continue
            
        # Split the review into individual sentences
        sentences = sent_tokenize(full_review_text)
    
        for sentence in sentences:
            sentence_lower = sentence.lower()
            found_aspects = set()
            
            # Check this sentence for each of our aspect keywords
            for aspect, keywords in aspect_map.items():
                for keyword in keywords:
                    if keyword in sentence_lower:
                        found_aspects.add(aspect)
            
            if found_aspects:
                # Get the sentiment of the sentence
                sentiment = get_vader_sentiment(sentence)
                
                # Add a new row for each aspect found
                for aspect in found_aspects:
                    labeled_data.append({
                        'review_id': review_id,
                        'aspect': aspect,
                        'sentence': sentence,
                        'sentiment': sentiment
                    })
                    
    # Convert the final list into a new DataFrame
    absa_df = pd.DataFrame(labeled_data)
    
    # Remove duplicates
    absa_df = absa_df.drop_duplicates().reset_index(drop=True)
    
    return absa_df

In [22]:
df_absa = create_absa_dataset(df_filtered, ASPECT_KEYWORDS)
df_absa.info()
df_absa.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56803 entries, 0 to 56802
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review_id  56803 non-null  int64 
 1   aspect     56803 non-null  object
 2   sentence   56803 non-null  object
 3   sentiment  56803 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.7+ MB


Unnamed: 0,review_id,aspect,sentence,sentiment
0,1015574543,ROOM,Family vacation.,neutral
1,1015574543,FOOD,I was afraid of what the accomodation would be...,positive
2,1015574543,FACILITIES,I was afraid of what the accomodation would be...,positive
3,1015574543,ROOM,I was afraid of what the accomodation would be...,positive
4,960799391,ROOM,Place to stay in Alexandropoli.,neutral
