This notebook contains the summarized code for getting the data from all the articles that are there in the trainline help website.

We are collecting all the related urls from the website and creating content.txt having all possible questions and answers for those queries

# Get all the Links

In [34]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

In [35]:
def get_all_links(url):
    # Setup Chrome options (Visible browser)
    chrome_options = Options()
    # chrome_options.add_argument("--headless")  # Commented to keep browser visible

    # Initialize WebDriver
    service = Service()  # Ensure ChromeDriver is set up correctly
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        driver.get(url)  # Open the URL
        driver.implicitly_wait(10)  # Wait for elements to load

        # Extract all anchor (<a>) tags
        links = driver.find_elements(By.TAG_NAME, "a")

        # Get the href attribute of each link (only valid links)
        urls = [link.get_attribute("href") for link in links if link.get_attribute("href")]

        return urls
    finally:
        input("Press Enter to close the browser...")  # Keep browser open
        driver.quit()  # Close the browser session


In [36]:
# get the URls
url = "https://support.thetrainline.com/en/support/solutions/folders/78000000023"  # Replace with your target URL
all_links = get_all_links(url)


In [37]:
# Print the extracted URLs
print("\nExtracted Links:")
for link in all_links:
    print(link)

"""# Save links to a file
with open("../data/urls.txt", "w") as file:
    for link in all_links:
        href = link.get_attribute("href")
        if href:
            file.write(href + "\n")
"""


Extracted Links:
https://support.thetrainline.com/en/support/home
https://www.thetrainline.com/
javascript:;
https://support.thetrainline.com/en/support/solutions/folders/78000000023
https://support.thetrainline.com/fr/support/solutions/folders/78000000023
https://support.thetrainline.com/de/support/solutions/folders/78000000023
https://support.thetrainline.com/it/support/solutions/folders/78000000023
https://support.thetrainline.com/es/support/solutions/folders/78000000023
javascript:;
javascript:;
https://www.thetrainline.com/
https://support.thetrainline.com/en/support/solutions/78000000017
https://support.thetrainline.com/en/support/solutions/78000000019
https://support.thetrainline.com/en/support/solutions/78000000021
https://support.thetrainline.com/en/support/solutions/78000000020
https://support.thetrainline.com/en/support/solutions/78000000023
https://support.thetrainline.com/en/support/solutions/78000000018
https://support.thetrainline.com/en/support/solutions/78000000022
ht

'# Save links to a file\nwith open("../data/urls.txt", "w") as file:\n    for link in all_links:\n        href = link.get_attribute("href")\n        if href:\n            file.write(href + "\n")\n'

# Get Content from these links

In [38]:
import time
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from tqdm import tqdm  # Import tqdm for progress bar


In [39]:
# Setup WebDriver options
options = Options()
# Uncomment for headless mode
# options.add_argument("--headless")


In [40]:
# Initialize WebDriver
service = Service()
driver = webdriver.Chrome(service=service, options=options)


In [41]:
# File paths
input_file = "../data/urls.txt"
output_file = "../data/content.txt"

# Ensure data directory exists
os.makedirs("../data", exist_ok=True)


In [42]:

# Read URLs from list.txt
try:
    with open(input_file, "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]
except FileNotFoundError:
    print(f"⚠️ File {input_file} not found! Please create it and add URLs.")
    driver.quit()
    exit()


In [43]:
"""
# Open output file in write mode to clear previous content
with open(output_file, "w", encoding="utf-8") as file:
    file.write("Extracted Content from Trainline Support Pages\n")
    file.write("=" * 80 + "\n\n")
"""

'\n# Open output file in write mode to clear previous content\nwith open(output_file, "w", encoding="utf-8") as file:\n    file.write("Extracted Content from Trainline Support Pages\n")\n    file.write("=" * 80 + "\n\n")\n'

In [44]:


try:
    for url in tqdm(urls, desc="Processing URLs", unit="page"):  # Progress bar with tqdm
        # Open the webpage
        driver.get(url)

        # Allow the page to load
        time.sleep(3)

        # Step 1: Click "Accept All Cookies" before anything else
        try:
            cookie_button = driver.find_element(By.ID, "onetrust-accept-btn-handler")  # OneTrust "Accept All" button
            cookie_button.click()
            time.sleep(2)  # Wait for cookies to be accepted
        except NoSuchElementException:
            pass  # Ignore if cookie button is not found

        # Step 2: Remove any remaining cookie banners
        script_remove_cookie_banner = """
        let cookieBanner = document.getElementById("onetrust-banner-sdk");
        if (cookieBanner) {
            cookieBanner.remove();
        }
        """
        driver.execute_script(script_remove_cookie_banner)

        # Step 3: Open all dropdowns
        script_open_dropdowns = """
        document.querySelectorAll('.accound-content').forEach(el => {
            el.style.display = 'block';  // Keep open
            el.style.overflow = 'visible';
        });

        document.querySelectorAll('.account-heading').forEach(el => {
            el.setAttribute('aria-expanded', 'true');  // Mark as expanded
        });
        """
        driver.execute_script(script_open_dropdowns)

        # Wait before extracting content
        time.sleep(5)

        # Step 4: Extract text from the specific div with class="tab-content-layer current"
        try:
            content_element = driver.find_element(By.CLASS_NAME, "tab-content-layer.current")
            content_text = content_element.text
        except NoSuchElementException:
            content_text = "⚠️ No content found in 'tab-content-layer current'."
        
        
        print(f"URL: {url}\n")
        print("Extracted Content:\n")
        print(content_text)

        """ # Step 5: Append extracted text to the file
        with open(output_file, "a", encoding="utf-8") as file:
            file.write(f"URL: {url}\n")
            file.write("Extracted Content:\n")
            file.write(content_text + "\n")
            file.write("=" * 80 + "\n\n")"""

        # Short wait before processing the next URL
        time.sleep(3)

finally:
    # Close the browser
    driver.quit()


Processing URLs:   0%|          | 0/100 [00:00<?, ?page/s]

URL: https://support.thetrainline.com/en/support/solutions/articles/78000000553-refunding-a-uk-train-ticket

Extracted Content:

Refunding a UK train ticket
Quick refunds – for direct access to refund and changes for a particular booking, find your confirmation email and click ‘Manage booking’.  Can't find the email? Search your inbox for an email from auto-confirm@info.thetrainline.com.
If you have an account, login to view all your bookings in one place. 

The following information relates to standard refund requests if you want to cancel the ticket
Which tickets can I refund?
Flexible tickets (Off-Peak, Super Off-Peak and Anytime tickets)
These types of tickets can be refunded online for a fee. 
You have 28 days from expiry of the ticket to request a refund and return your ticket. If you have a mobile ticket and it has already been activated, you can no longer claim a refund on that journey.

Advance Single ticket
Advance Single tickets cannot be refunded but they can be changed.
Ch

Processing URLs:   0%|          | 0/100 [00:13<?, ?page/s]


KeyboardInterrupt: 