In [1]:
import json
import os
from urllib.parse import urldefrag, urljoin, urlsplit

import requests
from bs4 import BeautifulSoup

# Fetch HTML content from a given URL
def fetch_html(url, timeout=10):
    """
    Fetch the HTML content of a webpage.

    Parameters:
        url (str): The URL of the webpage to scrape.
        timeout (float or tuple): Seconds to wait for the server before giving up.
            Passed straight to ``requests.get``; without it a single unresponsive
            server would block the crawl indefinitely.

    Returns:
        str: The HTML content of the page if the request is successful, otherwise None.
    """
    try:
        # Send a GET request with a User-Agent header to mimic a browser.
        # A timeout is always supplied because requests never times out on its own.
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=timeout,
        )
        # Raise an exception for HTTP errors (4xx / 5xx)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses too, so they are reported
        # here and signalled to the caller as None like any other failure.
        print(f"Error fetching {url}: {e}")
        return None

# Parse the HTML content using BeautifulSoup
def parse_html(html):
    """
    Turn raw HTML markup into a navigable BeautifulSoup tree.

    Parameters:
        html (str): The HTML content to parse.

    Returns:
        BeautifulSoup: The parsed document, built with Python's bundled
        'html.parser' backend.
    """
    parser_name = 'html.parser'
    document = BeautifulSoup(html, parser_name)
    return document

# Extract relevant text content from specific HTML tags
def extract_all_text(soup):
    """
    Extract all relevant text from the website by parsing <h1>, <h2>, <h3>, <p>, and <li> tags.

    Headings come first (all h1, then h2, then h3), each prefixed with its tag
    name in upper case (e.g. "H1: Title"), followed by paragraph texts and then
    list-item texts. Elements whose text is empty or whitespace-only are skipped
    so the result does not contain blank strings or bare "H1: " prefixes.

    Parameters:
        soup (BeautifulSoup): A BeautifulSoup object containing the parsed HTML.

    Returns:
        list: A list of non-empty strings containing the extracted text content.
    """
    content = []

    # Headings are labelled with their tag so the hierarchy survives flattening.
    for tag in ('h1', 'h2', 'h3'):
        label = tag.upper()
        for heading in soup.find_all(tag):
            text = heading.get_text().strip()
            if text:
                content.append(f"{label}: {text}")

    # Paragraphs and list items are stored as plain text, paragraphs first.
    for tag in ('p', 'li'):
        for element in soup.find_all(tag):
            text = element.get_text().strip()
            if text:
                content.append(text)

    return content

# Extract all internal links for recursive scraping
def fetch_links(soup, base_url):
    """
    Extract all internal links from a webpage for further scraping.

    A link is internal when, after being resolved against ``base_url``, it has
    the same scheme and host as ``base_url``. Only root-relative ("/path"),
    protocol-relative ("//host/path") and absolute http(s) hrefs are examined;
    document-relative hrefs ("page.html"), in-page anchors ("#top") and other
    schemes (mailto:, javascript:, tel:) are ignored because they cannot be
    resolved without the URL of the page they came from or are not pages.
    URL fragments are stripped so "/a" and "/a#section" count as one page.

    Parameters:
        soup (BeautifulSoup): A BeautifulSoup object containing the parsed HTML.
        base_url (str): The base URL of the website for resolving relative links.

    Returns:
        list: A list of unique internal links, in order of first appearance.
    """
    base_parts = urlsplit(base_url)
    base_origin = (base_parts.scheme.lower(), base_parts.netloc.lower())

    links = []
    seen = set()
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href'].strip()
        if not (href.startswith('/') or href.lower().startswith(('http://', 'https://'))):
            continue

        try:
            # urljoin handles "/path", "//host/path" and absolute URLs uniformly,
            # and avoids the double slash that plain concatenation produces when
            # base_url ends with "/".
            resolved, _fragment = urldefrag(urljoin(base_url, href))
            parts = urlsplit(resolved)
        except ValueError:
            # Malformed hrefs (e.g. a broken IPv6 literal) are skipped, not fatal.
            continue

        # Compare parsed scheme/host instead of substring matching, so external
        # URLs that merely contain base_url (redirect parameters, look-alike
        # hosts such as "site.com.evil.com") are rejected.
        if (parts.scheme.lower(), parts.netloc.lower()) != base_origin:
            continue

        if resolved not in seen:
            seen.add(resolved)
            links.append(resolved)

    return links

# Perform recursive scraping on a URL and its subpages
def scrape_recursive(url, base_url, visited=None):
    """
    Scrape the main URL and every internal page reachable from it.

    The traversal is depth-first in the same order a recursive implementation
    would visit pages, but it uses an explicit stack instead of Python
    recursion. One stack frame per followed link would otherwise let a site
    with a long chain of pages exceed the interpreter's recursion limit and
    abort the crawl with RecursionError.

    Parameters:
        url (str): The URL to start scraping from.
        base_url (str): The base URL of the website for resolving relative links.
        visited (set): A set of already visited URLs to avoid re-scraping.
            It is updated in place; pass an existing set to skip known pages.

    Returns:
        dict: A dictionary where keys are URLs and values are lists of extracted
        text content. URLs that could not be fetched are omitted.
    """
    if visited is None:
        visited = set()

    results = {}
    pending = [url]  # LIFO stack of URLs still to be examined

    while pending:
        current = pending.pop()

        # A URL may be queued several times before its turn comes; only the
        # first occurrence is scraped.
        if current in visited:
            continue

        visited.add(current)  # Mark before fetching so failures are not retried
        print(f"Scraping: {current}")

        html = fetch_html(current)
        if not html:
            continue  # Nothing to parse or follow for this page

        soup = parse_html(html)
        results[current] = extract_all_text(soup)

        # Push in reverse so the first link on the page is popped (visited) next,
        # matching the pre-order of a recursive depth-first walk.
        pending.extend(reversed(fetch_links(soup, base_url)))

    return results

# Save the scraped data to a JSON file
def save_to_json(data, file_name):
    """
    Save scraped data to a JSON file.

    Missing parent directories of ``file_name`` are created first, so a long
    scrape is not lost to a FileNotFoundError at the very last step.

    Parameters:
        data (dict): The scraped data to save.
        file_name (str): The name (path) of the JSON file to create. An existing
            file at that path is overwritten.
    """
    parent_dir = os.path.dirname(file_name)
    if parent_dir:  # Empty when file_name has no directory component
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_name, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable in the UTF-8 file
        json.dump(data, f, ensure_ascii=False, indent=4)


In [3]:
# Crawl the Changi Airport site. The same value is passed as both the start URL
# and the base URL that decides which links count as internal.
base_url = "https://www.changiairport.com"
# NOTE(review): machine-specific absolute Windows path; consider deriving it
# from a configurable data directory so the notebook runs on other machines.
output_file = r"D:\Portfolio Github\Airport_Chatbot\data\bronze\changi_data.json"
# Start scraping; `data` maps each scraped URL to its list of extracted text
data = scrape_recursive(base_url, base_url)

# Save the scraped data
save_to_json(data, output_file)
print(f"Data saved to {output_file}")

Scraping: https://www.changiairport.com
Scraping: https://www.changiairport.com/in/en/at-changi/facilities-and-services-directory/yotelair-singapore-changi-airport.html
Scraping: https://www.changiairport.com/in/en/at-changi/special-assistance.html
Scraping: https://www.changiairport.com/in/en/at-changi/terminal-guides/terminal-3.html
Scraping: https://www.changiairport.com/in/en/at-changi/facilities-and-services-directory.html?category=hotels
Scraping: https://www.changiairport.com/in/en/at-changi/facilities-and-services-directory.html?category=lounges
Scraping: https://www.changiairport.com/in/en/fly/flight-information/freighter/arrivals.html
Scraping: https://www.changiairport.com/in/en/dine-and-shop/shop-directory.html
Scraping: https://www.changiairport.com/in/en/experience/free-tours.html
Scraping: https://www.changiairport.com/content/changiairport/language-masters/en/airport/rewards/monarch.html?linkedListSection=faq
Error fetching https://www.changiairport.com/content/changiai

In [4]:
# Crawl the Jewel Changi Airport site. The same value is passed as both the
# start URL and the base URL that decides which links count as internal.
# NOTE(review): this cell repeats the Changi cell with a different site and
# output path; a small helper taking (base_url, output_file) would remove the
# duplication.
base_url = 'https://www.jewelchangiairport.com'
# NOTE(review): machine-specific absolute Windows path; consider deriving it
# from a configurable data directory so the notebook runs on other machines.
output_file = r"D:\Portfolio Github\Airport_Chatbot\data\bronze\jewel_data.json"
# Start scraping; `base_url`, `output_file` and `data` are rebound here, so the
# values from the earlier Changi cell are no longer available afterwards.
data = scrape_recursive(base_url, base_url)

# Save the scraped data
save_to_json(data, output_file)
print(f"Data saved to {output_file}")

Scraping: https://www.jewelchangiairport.com
Scraping: https://www.jewelchangiairport.com/en/privacypolicy.html
Scraping: https://www.jewelchangiairport.com/en/attractions/hedge-maze.html
Scraping: https://www.jewelchangiairport.com/en/travellers-information.html
Scraping: https://www.jewelchangiairport.com/en/Jewel-Infinity.html
Scraping: https://www.jewelchangiairport.com/en/our-story.html
Scraping: https://www.jewelchangiairport.com/en/JewelGuidedTours.html
Scraping: https://www.jewelchangiairport.com/en/attractions/discovery-slides.html
Scraping: https://www.jewelchangiairport.com/en/itinerary-planner.html
Scraping: https://www.jewelchangiairport.com/en/promotion.html
Scraping: https://www.jewelchangiairport.com/en/media-center.html
Scraping: https://www.jewelchangiairport.com/en/playatjewel.html
Scraping: https://www.jewelchangiairport.com/en/attractions/forest-valley.html
Scraping: https://www.jewelchangiairport.com/en/attractions/walking-net.html
Scraping: https://www.jewelchang