In [20]:
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from fpdf import FPDF

In [21]:
# Text file listing the pages to scrape, one URL per line (blank lines are skipped when it is read).
LINKS_FILE = "apple_site_links.txt"
# Directory that receives one generated PDF per successfully processed URL.
OUTPUT_DIR = "apple_product_texts"
# Seconds to sleep after each page load before the rendered HTML is read.
WAIT_TIME = 5
# Create the output directory up front; exist_ok=True makes re-running this cell safe.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [22]:
# Configure Chrome to run without a visible window and start one shared
# browser session that the scraping loop reuses for every URL.
chrome_options = Options()
for chrome_switch in ("--headless", "--disable-gpu", "--no-sandbox"):
    chrome_options.add_argument(chrome_switch)

driver = webdriver.Chrome(options=chrome_options)

In [23]:
# Read the list of target URLs: one per line, surrounding whitespace trimmed,
# blank lines ignored.
with open(LINKS_FILE, "r") as links_handle:
    urls = []
    for raw_line in links_handle:
        entry = raw_line.strip()
        if entry:
            urls.append(entry)


In [24]:
def _element_text(tag):
    """Return the text of ``tag`` with all whitespace collapsed to single spaces."""
    # get_text(strip=True) on its own joins the stripped pieces of child nodes
    # with an empty string, fusing words that sit in adjacent inline elements
    # ("Up to<b>26 hours</b>" -> "Up to26 hours"). A " " separator keeps them
    # apart, and split()/join removes newlines and runs of spaces left over
    # from the markup so each element yields exactly one output line.
    return " ".join(tag.get_text(separator=" ", strip=True).split())


def extract_clean_text(soup):
    """Extract the readable text of a product page as newline-separated lines.

    The soup is modified in place: script, style, noscript, meta and link
    elements are removed before any text is collected.

    Args:
        soup: Parsed page (a BeautifulSoup object).

    Returns:
        A string whose first line is the text of the first <h1> (when one
        exists and is non-empty), followed by the text of every <h2>, <h3>,
        <p> and <li> element that contains more than three words, one element
        per line. Returns "" when nothing qualifies.
    """
    # Remove elements that never carry user-visible text.
    for tag in soup(['script', 'style', 'noscript', 'meta', 'link']):
        tag.decompose()

    lines = []

    # Product title (often in <h1>). An empty heading is skipped so the first
    # returned line is never blank (callers use it to name the output file).
    h1 = soup.find('h1')
    if h1 is not None:
        title = _element_text(h1)
        if title:
            lines.append(title)

    # Key features and descriptions. Entries of three words or fewer are
    # dropped as navigation/label noise.
    # NOTE(review): a <p> nested inside an <li> is matched twice (once per
    # element), so its text can appear on two lines — confirm whether the
    # target pages nest these before de-duplicating.
    for tag in soup.find_all(['h2', 'h3', 'p', 'li']):
        content = _element_text(tag)
        if len(content.split()) > 3:
            lines.append(content)

    return "\n".join(lines)

In [25]:
# Common typographic characters that have no Latin-1 code point, mapped to
# close plain equivalents so they are not all flattened to "?" when the text
# is downgraded to Latin-1 in save_to_pdf.
_LATIN1_FALLBACKS = str.maketrans({
    "\u2018": "'", "\u2019": "'", "\u201a": "'",     # single curly quotes
    "\u201c": '"', "\u201d": '"', "\u201e": '"',     # double curly quotes
    "\u2013": "-", "\u2014": "-", "\u2212": "-",     # en dash, em dash, minus sign
    "\u2026": "...",                                 # ellipsis
    "\u2022": "-",                                   # bullet
    "\u2122": "(TM)",                                # trademark sign
    "\u2009": " ", "\u202f": " ",                    # thin / narrow no-break space
})


def save_to_pdf(text, filename):
    """Write ``text`` to a single-column PDF at ``filename``.

    Args:
        text: The text to render; each line becomes one wrapped paragraph.
        filename: Destination path of the PDF (overwritten if it exists).

    Note:
        The text is downgraded to Latin-1 before rendering. Common typographic
        punctuation is first replaced with a plain equivalent; any other
        character outside Latin-1 is rendered as "?".
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)

    # Lossy downgrade to Latin-1 (this is NOT UTF-8 support): swap known
    # typographic characters for plain ones, then let 'replace' turn whatever
    # still cannot be encoded into "?".
    text = text.translate(_LATIN1_FALLBACKS)
    text = text.encode('latin-1', 'replace').decode('latin-1')

    for line in text.splitlines():
        # Start every paragraph at the left margin. Newer fpdf2 releases leave
        # the cursor at the right edge after multi_cell by default, which makes
        # the next zero-width multi_cell fail; on older versions this is a no-op.
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 10, line)

    pdf.output(filename)

In [26]:
# Scrape every URL: load the page, extract its text, and save it as a PDF
# named after the first extracted line (normally the page's <h1> title).

# File stems already written during this run, so that two pages with the same
# title do not silently overwrite each other.
used_stems = set()

try:
    for i, url in enumerate(urls):
        print(f"[{i+1}/{len(urls)}] Processing: {url}")
        try:
            driver.get(url)
            # Fixed pause after the page load before the HTML is read.
            time.sleep(WAIT_TIME)

            soup = BeautifulSoup(driver.page_source, "html.parser")
            clean_text = extract_clean_text(soup)

            if not clean_text:
                print("No text found, skipping.")
                continue

            # Use first line as PDF filename: keep letters, digits, '-', '_';
            # everything else becomes '_'; cap the stem at 50 characters.
            first_line = clean_text.splitlines()[0]
            safe_name = "".join(c if c.isalnum() or c in (' ', '-', '_') else "_" for c in first_line).strip()
            safe_name = safe_name.replace(" ", "_")[:50]

            # An empty first line would otherwise produce a file called ".pdf".
            if not safe_name:
                safe_name = f"page_{i+1}"
            # Disambiguate repeated titles with the page's position in the list.
            if safe_name in used_stems:
                safe_name = f"{safe_name}_{i+1}"
            used_stems.add(safe_name)

            pdf_path = os.path.join(OUTPUT_DIR, f"{safe_name}.pdf")

            save_to_pdf(clean_text, pdf_path)
            print(f"Saved: {pdf_path}")

        except Exception as e:
            # Best effort: report the failure and continue with the next URL.
            print(f"Failed to process {url}: {e}")
finally:
    # Always shut the browser down, even if the loop is interrupted
    # (e.g. KeyboardInterrupt), so no headless Chrome process is left behind.
    driver.quit()

[1/19] Processing: https://support.apple.com/en-in/122208
Saved: apple_product_texts/iPhone_16e_-_Tech_Specs.pdf
[2/19] Processing: https://support.apple.com/en-in/121030
Saved: apple_product_texts/iPhone_16_Plus_-_Tech_Specs.pdf
[3/19] Processing: https://support.apple.com/en-in/121032
Saved: apple_product_texts/iPhone_16_Pro_Max_-_Tech_Specs.pdf
[4/19] Processing: https://support.apple.com/en-in/111829
Saved: apple_product_texts/iPhone_15_Pro_-_Technical_Specifications.pdf
[5/19] Processing: https://support.apple.com/en-in/111830
Saved: apple_product_texts/iPhone_15_Plus_-_Tech_Specs.pdf
[6/19] Processing: https://support.apple.com/en-in/111850
Saved: apple_product_texts/iPhone_14_-_Tech_Specs.pdf
[7/19] Processing: https://support.apple.com/en-in/121029
Saved: apple_product_texts/iPhone_16_-_Tech_Specs.pdf
[8/19] Processing: https://support.apple.com/en-in/121031
Saved: apple_product_texts/iPhone_16_Pro_-_Tech_Specs.pdf
[9/19] Processing: https://support.apple.com/en-in/111831
Saved