In [1]:
import json
import re

def clean_and_structure(data):
    """
    Cleans and structures the input data.
    Removes noisy, too-short, and duplicate entries while keeping the
    remaining entries in their original order.

    Parameters:
        data (dict): A dictionary where keys are URLs and values are lists of text content.
    
    Returns:
        dict: A dictionary with URLs as keys and lists of stripped, unique
              entries as values. URLs whose content is entirely filtered
              out are omitted.
    """
    # Compiled once instead of on every entry; matches anywhere in the text,
    # case-insensitively (so e.g. "terms" inside a longer word also matches).
    noise_pattern = re.compile(
        r'(Navigation|Terms|Policy|©|Error|Thank you)', re.IGNORECASE
    )
    structured_data = {}

    for url, content in data.items():
        cleaned_content = []
        # Tracks the stripped texts already kept for this URL, so that
        # entries differing only in surrounding whitespace are treated as
        # duplicates and membership checks stay O(1).
        seen = set()
        for entry in content:
            # Skip entries with navigation menus, placeholders, or policy terms
            if noise_pattern.search(entry):
                continue
            text = entry.strip()
            # Skip empty or meaningless strings (fewer than 3 characters)
            if len(text) < 3:
                continue
            # Keep only the first occurrence of each stripped entry
            if text in seen:
                continue
            seen.add(text)
            cleaned_content.append(text)

        # Add the cleaned content to the structured data dictionary
        if cleaned_content:
            structured_data[url] = cleaned_content

    return structured_data

def categorize_data(data):
    """
    Categorizes the data into predefined categories like Attractions, Dining, etc.

    Each entry is assigned to the first category whose keyword pattern
    matches anywhere in the entry (case-insensitive); entries matching no
    pattern go to "Other". Duplicates within a category are removed and
    entries keep the order in which they were first seen, so the result is
    the same on every run.
    
    Parameters:
        data (dict): A cleaned and structured dictionary with URLs as keys and content as values.
    
    Returns:
        dict: A categorized dictionary where keys are categories (e.g., "Attractions") 
              and values are lists of relevant content entries.
    """
    # Ordered (category, pattern) rules; the first matching rule wins, so the
    # order here defines precedence between overlapping keywords.
    rules = (
        ("Attractions", re.compile(r'(Attraction|Park|Activity|Tour)', re.IGNORECASE)),
        ("Dining", re.compile(r'(Dining|Restaurant|Cafe|Food)', re.IGNORECASE)),
        ("Shopping", re.compile(r'(Shop|Store|Retail)', re.IGNORECASE)),
        ("Transportation", re.compile(r'(Transport|Airport|Flight|Taxi)', re.IGNORECASE)),
    )

    # Dicts are used as insertion-ordered sets: keys are the entries, values
    # are unused. This deduplicates without losing first-seen order.
    buckets = {category: {} for category, _ in rules}
    buckets["Other"] = {}

    for content in data.values():
        for entry in content:
            for category, pattern in rules:
                if pattern.search(entry):
                    break
            else:
                # Assign entries that don't match specific categories to "Other"
                category = "Other"
            buckets[category][entry] = None

    return {category: list(entries) for category, entries in buckets.items()}

def process_files(input_file, output_file):
    """
    Runs the full pipeline on one JSON file: load, clean, categorize, save.

    The input file is parsed as JSON, passed through clean_and_structure and
    then categorize_data, and the result is written to the output file as
    indented, non-ASCII-escaped JSON. A confirmation line is printed at the end.
    
    Parameters:
        input_file (str): Path to the input JSON file.
        output_file (str): Path to the output JSON file where results will be saved.
    """
    # Read the raw scraped content
    with open(input_file, 'r', encoding='utf-8') as source:
        raw = json.load(source)

    # Clean first, then bucket the surviving entries by category
    result = categorize_data(clean_and_structure(raw))

    # Persist the categorized result
    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(result, sink, ensure_ascii=False, indent=4)

    print(f"Processed and categorized data saved to {output_file}")

# Changi dataset: silver-layer input and gold-layer output JSON paths
changi_input = r"D:\Portfolio Github\Airport_Chatbot\data\silver\processed_changi_data.json"
changi_output = r"D:\Portfolio Github\Airport_Chatbot\data\gold\final_changi_data.json"

# Jewel dataset: silver-layer input and gold-layer output JSON paths
jewel_input = r"D:\Portfolio Github\Airport_Chatbot\data\silver\processed_jewel_data.json"
jewel_output = r"D:\Portfolio Github\Airport_Chatbot\data\gold\final_jewel_data.json"

# Run the pipeline on each dataset in turn: Changi first, then Jewel
for source_path, target_path in (
    (changi_input, changi_output),
    (jewel_input, jewel_output),
):
    process_files(source_path, target_path)


Processed and categorized data saved to D:\Portfolio Github\Airport_Chatbot\data\gold\final_changi_data.json
Processed and categorized data saved to D:\Portfolio Github\Airport_Chatbot\data\gold\final_jewel_data.json
