In [3]:
import json
import re

def clean_text(text_list):
    """
    Cleans scraped text by dropping boilerplate, blank or very short entries, and duplicates.

    Parameters:
        text_list (list): A list of strings containing text extracted from web pages.

    Returns:
        list: The whitespace-stripped strings that survived filtering, in their
        original order, with each distinct stripped string appearing only once.
    """
    # Compile once, outside the loop, instead of handing the raw pattern to
    # re.search for every entry.
    # NOTE: this is a case-insensitive *substring* search, so "Back" also rejects
    # text containing e.g. "background" or "feedback".
    boilerplate = re.compile(
        r'(Conditions of Use|Privacy Policy|©|Back|Download App|Sign Up)',
        re.IGNORECASE,
    )

    cleaned = []
    seen = set()  # stripped strings already emitted; O(1) membership checks
    for text in text_list:
        # Skip text that contains navigation phrases, copyright notices, or irrelevant content
        if boilerplate.search(text):
            continue
        stripped = text.strip()
        # Skip empty strings or strings that are too short to be meaningful
        if len(stripped) < 3:
            continue
        # Compare the *stripped* value: the output stores stripped strings, so
        # comparing the raw text would let "  foo " and "foo" both through.
        if stripped in seen:
            continue
        seen.add(stripped)
        cleaned.append(stripped)
    return cleaned

def process_json(input_file, output_file):
    """
    Loads scraped data from a JSON file, cleans every entry, and saves the result.

    The input is expected to be a JSON object; each of its values is passed
    through ``clean_text``. Keys whose cleaned value is empty are left out of
    the output. A confirmation message is printed once the file is written.

    Parameters:
        input_file (str): Path to the input JSON file containing scraped data.
        output_file (str): Path to the output JSON file where cleaned data will be saved.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        raw_pages = json.load(source)

    # Clean each entry (presumably keyed by page URL), keeping the input's key order
    cleaned_pages = {key: clean_text(texts) for key, texts in raw_pages.items()}

    # Drop keys for which nothing survived the cleaning step
    non_empty_pages = {key: texts for key, texts in cleaned_pages.items() if texts}

    # ensure_ascii=False keeps non-ASCII characters readable in the output file
    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(non_empty_pages, sink, ensure_ascii=False, indent=4)

    print(f"Processed data saved to {output_file}")

# Changi dataset: source file (bronze folder) and destination file (silver folder)
changi_input = r"D:\Portfolio Github\Airport_Chatbot\data\bronze\changi_data.json"
changi_output = r"D:\Portfolio Github\Airport_Chatbot\data\silver\processed_changi_data.json"

# Jewel dataset: source file (bronze folder) and destination file (silver folder)
jewel_input = r"D:\Portfolio Github\Airport_Chatbot\data\bronze\jewel_data.json"
jewel_output = r"D:\Portfolio Github\Airport_Chatbot\data\silver\processed_jewel_data.json"

# Run the same cleaning step over each dataset: Changi first, then Jewel
for dataset_input, dataset_output in (
    (changi_input, changi_output),
    (jewel_input, jewel_output),
):
    process_json(dataset_input, dataset_output)


Processed data saved to D:\Portfolio Github\Airport_Chatbot\data\silver\processed_changi_data.json
Processed data saved to D:\Portfolio Github\Airport_Chatbot\data\silver\processed_jewel_data.json
