In [2]:
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin, urlparse
import xml.etree.ElementTree as ET # For parsing sitemap.xml
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
import time

### Step 1: Setting Up and Fetching Initial Content (Main /about page or Sitemap)

In [11]:
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin, urlparse
import xml.etree.ElementTree as ET # For parsing sitemap.xml

# --- Configuration ---
ABOUT_BASE_URL = "https://www.verizon.com/about/"
SITEMAP_URL = "https://www.verizon.com/about/sitemap.xml" # From robots.txt
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 (DissertationResearchBot/0.1; [Your BITS ID or Project Name])'
}
REQUEST_DELAY = 2  # Seconds between requests

# --- Helper Function to Fetch Content ---
def fetch_url_content(url):
    """Fetches content from a URL with error handling."""
    print(f"Fetching: {url}")
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()  # Raises an HTTPError for bad responses (4XX or 5XX)
        return response
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
    finally:
        time.sleep(REQUEST_DELAY) # Respectful delay after every request

# --- 1a. Parse Sitemap ---
def get_urls_from_sitemap(sitemap_url):
    """Parses an XML sitemap and extracts URLs."""
    urls = []
    response = fetch_url_content(sitemap_url)
    if response and response.content:
        try:
            root = ET.fromstring(response.content)
            # XML namespace, often present in sitemaps
            namespace = {'sitemap': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
            for url_element in root.findall('sitemap:url', namespace):
                loc_element = url_element.find('sitemap:loc', namespace)
                if loc_element is not None and loc_element.text:
                    urls.append(loc_element.text)
        except ET.ParseError as e:
            print(f"Error parsing sitemap XML: {e}")
    return urls

# Get URLs from the /about sitemap first
about_page_urls = get_urls_from_sitemap(SITEMAP_URL)
if not about_page_urls:
    print(f"Could not fetch URLs from sitemap, trying base /about/ page: {ABOUT_BASE_URL}")
    # Fallback to just the base /about page if sitemap fails or is empty
    about_page_urls = [ABOUT_BASE_URL]

print(f"Found {len(about_page_urls)} URLs from sitemap (or using base URL).")
# You might want to print a few to check:
# for i, url in enumerate(about_page_urls):
#     if i < 5: print(url)
#     else: break

Fetching: https://www.verizon.com/about/sitemap.xml
Could not fetch URLs from sitemap, trying base /about/ page: https://www.verizon.com/about/
Found 1 URLs from sitemap (or using base URL).


### Step 2: Iterating Through URLs, Parsing HTML, and Extracting Information

In [12]:
# --- Data Storage (simple example) ---
extracted_data = [] # List to store dictionaries of extracted info
visited_urls = set()
urls_to_process = list(about_page_urls) # Start with URLs from sitemap

# --- Main Loop (Simplified - you'll need to manage the queue more robustly for deep crawling) ---
# For this example, we'll just process the URLs obtained from the sitemap.
# A full recursive crawler would add new found links back to urls_to_process.

for current_url in urls_to_process:
    if current_url in visited_urls:
        continue
    
    visited_urls.add(current_url)
    response = fetch_url_content(current_url)

    if response and response.headers.get('content-type','').startswith('text/html'):
        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')

        # --- 2a. Extracting Textual Content (Example) ---
        # This is highly dependent on the specific HTML structure of Verizon's pages.
        # You'll need to inspect the pages using browser developer tools to find the right tags and classes.
        title_tag = soup.find('title')
        title = title_tag.string.strip() if title_tag else 'No Title Found'
        
        page_text_elements = []
        # Example: Try to get text from main content areas, articles, or specific divs
        # This is a GUESS - you MUST inspect the actual HTML
        for tag_name in ['article', 'main', 'div']: # Add more relevant tags
            # You might look for specific class names known to contain main content
            # e.g., soup.find_all(tag_name, class_='content-body')
            elements = soup.find_all(tag_name) 
            for element in elements:
                 # Basic filtering to avoid script/style if not handled by find_all
                if element.name not in ['script', 'style']:
                    page_text_elements.append(element.get_text(separator=' ', strip=True))
        
        page_text = "\n".join(page_text_elements)
        # Further cleaning of page_text might be necessary (remove excessive whitespace, etc.)

        print(f"--- Content from: {current_url} ---")
        print(f"Title: {title}")
        # print(f"Extracted Text (snippet): {page_text[:500]}...") # Print a snippet

        extracted_data.append({
            'url': current_url,
            'title': title,
            'text_content': page_text # Store the full text
        })

        # --- 2b. Extracting Links to Other /about Pages (for deeper crawling, if permitted and desired) ---
        # if current_url.startswith(ABOUT_BASE_URL): # Only find new links if we are in the /about section
        #     for a_tag in soup.find_all('a', href=True):
        #         link = a_tag['href']
        #         abs_link = urljoin(current_url, link) # Resolve relative URLs
        #         parsed_link = urlparse(abs_link)
                
        #         # Filter to stay within /about and avoid already visited/queued
        #         if parsed_link.netloc == urlparse(ABOUT_BASE_URL).netloc and \
        #            parsed_link.path.startswith(urlparse(ABOUT_BASE_URL).path) and \
        #            abs_link not in visited_urls and \
        #            abs_link not in urls_to_process: # A proper queue would handle this better
        #             # print(f"Found relevant new link: {abs_link}")
        #             # urls_to_process.append(abs_link) # Add to a proper queue for deep crawl
        #             pass # For now, we are only processing sitemap URLs directly

    elif response and response.headers.get('content-type','') == 'application/pdf':
        print(f"--- PDF Found: {current_url} ---")
        # Handle PDF processing (see Step 3)
        # For now, just record it
        extracted_data.append({
            'url': current_url,
            'title': current_url.split('/')[-1], # Use filename as title
            'type': 'pdf',
            'text_content': None # Placeholder for PDF text
        })
        
# --- Now extracted_data list contains dictionaries with info from HTML pages and placeholders for PDFs
# Example: Save to a JSON file
# import json
# with open('verizon_about_data.json', 'w', encoding='utf-8') as f:
#     json.dump(extracted_data, f, ensure_ascii=False, indent=4)

Fetching: https://www.verizon.com/about/
--- Content from: https://www.verizon.com/about/ ---
Title: Official Verizon Corporate Web site   About Verizon


### Step 3: Handling PDFs (Conceptual with LLMSherpa)  
The robots.txt allowed some PDFs in /about/files/ and /about/file/. LLMSherpa is good for complex PDFs, especially with tables.

In [13]:
# Conceptual LLMSherpa usage (you'll need to install it and potentially set up an API key if using their cloud API)
# from llmsherpa.readers import LayoutPDFReader # If using the local/open-source part
# from llmsherpa.clients import OpenAIClient # Or your preferred LLM client for some features

# SHERPA_API_URL = "YOUR_SHERPA_API_URL_IF_USING_CLOUD_SERVICE" # Or use local reader

def process_pdf_with_llmsherpa(pdf_url_or_path):
    """
    Processes a PDF using LLMSherpa to extract text and potentially structured data.
    This is a conceptual example. Actual usage may vary based on LLMSherpa version and setup.
    """
    print(f"Processing PDF with LLMSherpa: {pdf_url_or_path}")
    # Using LayoutPDFReader for local processing or if you download the PDF first
    # If pdf_url_or_path is a URL, you'd typically download it first:
    # pdf_response = requests.get(pdf_url_or_path, stream=True)
    # with open("temp.pdf", "wb") as f:
    #     for chunk in pdf_response.iter_content(chunk_size=8192):
    #         f.write(chunk)
    # local_pdf_path = "temp.pdf"
    #
    # layout_reader = LayoutPDFReader(SHERPA_API_URL) # For cloud API
    # try:
    #     doc = layout_reader.read_pdf(local_pdf_path if using local path else pdf_url_or_path)
    #     full_text = ""
    #     for chunk in doc.chunks():
    #         full_text += chunk.to_text() + "\n\n"
    #     # Tables can also be extracted if needed:
    #     # tables = [table.to_df() for table in doc.tables()]
    #     return full_text #, tables
    # except Exception as e:
    #     print(f"Error processing PDF with LLMSherpa {pdf_url_or_path}: {e}")
    #     return None
    
    # Placeholder for actual LLMSherpa processing logic
    # This is a simplified placeholder as direct URL processing depends on the specific LLMSherpa setup.
    # Typically, you'd download the PDF first, then pass the local path to LLMSherpa.
    try:
        pdf_response = fetch_url_content(pdf_url_or_path)
        if pdf_response:
            # For now, let's just say we would process it.
            # Actual text extraction with LLMSherpa requires more setup.
            print(f"LLMSherpa would process this PDF's content from URL: {pdf_url_or_path}")
            # In a real scenario, you'd integrate the LLMSherpa call here
            # and return the extracted text.
            # For this example, we'll return a placeholder.
            return f"Text from PDF: {pdf_url_or_path} would be extracted here by LLMSherpa."
        return "Could not download PDF for LLMSherpa processing."
    except Exception as e:
        print(f"Error in placeholder PDF processing for {pdf_url_or_path}: {e}")
        return None


# Example of how you might integrate PDF processing into your loop:
# (This would replace the simple 'application/pdf' handling in Step 2)

# for item in extracted_data:
#     if item.get('type') == 'pdf' and item.get('text_content') is None:
#         pdf_text = process_pdf_with_llmsherpa(item['url'])
#         if pdf_text:
#             item['text_content'] = pdf_text
#             print(f"Successfully processed PDF: {item['url']}")
#         else:
#             print(f"Failed to process PDF: {item['url']}")

In [14]:
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin, urlparse
import os # For path manipulation and creating directories
import re # For sanitizing filenames

# --- Configuration ---
ABOUT_BASE_URL = "https://www.verizon.com/about/"
SITEMAP_URL = "https://www.verizon.com/about/sitemap.xml"
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 (DissertationResearchBot/0.1; [Your BITS ID or Project Name])'
}
REQUEST_DELAY = 2  # Seconds between requests
PDF_DOWNLOAD_DIR = "downloaded_verizon_about_pdfs" # Local directory to save PDFs

# --- Create download directory if it doesn't exist ---
if not os.path.exists(PDF_DOWNLOAD_DIR):
    os.makedirs(PDF_DOWNLOAD_DIR)

# --- Helper Function to Fetch Content/Download ---
def fetch_url_content(url, stream=False): # Added stream parameter
    """Fetches content from a URL with error handling."""
    print(f"Fetching: {url}")
    try:
        response = requests.get(url, headers=HEADERS, timeout=20, stream=stream) # Increased timeout for potential downloads
        response.raise_for_status()
        return response
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
    finally:
        if not stream: # Only delay if not streaming a download right after
            time.sleep(REQUEST_DELAY)

# --- Helper Function to Sanitize Filename ---
def sanitize_filename(filename):
    """Sanitizes a string to be used as a filename."""
    # Remove or replace invalid characters
    filename = re.sub(r'[\\/*?:"<>|]',"", filename)
    # Truncate if too long (optional, adjust length)
    if len(filename) > 200:
        filename = filename[:200]
    return filename

# --- Helper Function to Download and Save PDF ---
def download_pdf(pdf_url):
    """Downloads a PDF from a URL and saves it locally."""
    response = fetch_url_content(pdf_url, stream=True)
    if response:
        # Try to get a reasonable filename
        content_disposition = response.headers.get('content-disposition')
        if content_disposition:
            fname_match = re.search(r'filename="?([^"]+)"?', content_disposition)
            if fname_match:
                filename = fname_match.group(1)
            else:
                filename = pdf_url.split('/')[-1]
        else:
            filename = pdf_url.split('/')[-1]
        
        if not filename.lower().endswith(".pdf"):
            filename += ".pdf" # Ensure it has a .pdf extension

        sanitized_name = sanitize_filename(filename)
        local_pdf_path = os.path.join(PDF_DOWNLOAD_DIR, sanitized_name)

        try:
            with open(local_pdf_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            print(f"Successfully downloaded: {pdf_url} to {local_pdf_path}")
            return local_pdf_path
        except IOError as e:
            print(f"Error saving PDF {local_pdf_path}: {e}")
            return None
    return None


# --- Parse Sitemap (from previous example, slightly adapted) ---
def get_urls_from_sitemap(sitemap_url):
    urls = []
    response = fetch_url_content(sitemap_url) # stream=False by default here
    if response and response.content:
        try:
            root = ET.fromstring(response.content)
            namespace = {'sitemap': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
            for url_element in root.findall('sitemap:url', namespace):
                loc_element = url_element.find('sitemap:loc', namespace)
                if loc_element is not None and loc_element.text:
                    urls.append(loc_element.text)
        except ET.ParseError as e:
            print(f"Error parsing sitemap XML: {e}")
        finally:
            time.sleep(REQUEST_DELAY) # Add delay after sitemap processing too
    return urls

# --- Main Crawling Logic (Modified) ---
extracted_data_log = [] # Log of processed URLs and outcome
visited_urls = set()

# Start with URLs from the sitemap for the /about section
# import xml.etree.ElementTree as ET # Ensure this is imported
about_page_urls = get_urls_from_sitemap(SITEMAP_URL)
if not about_page_urls:
    print(f"Could not fetch URLs from sitemap, consider checking manually or trying the base /about/ page: {ABOUT_BASE_URL}")
    # If sitemap fails, you could add ABOUT_BASE_URL to a queue and crawl from there
    # For now, this example will stop if sitemap fails to provide URLs.
    
urls_to_process_queue = list(about_page_urls) # Use a list as a simple queue

while urls_to_process_queue:
    current_url = urls_to_process_queue.pop(0) # FIFO

    if current_url in visited_urls:
        continue
    
    visited_urls.add(current_url)
    
    # Check if URL likely points to a PDF by extension first (quicker)
    is_pdf_link = current_url.lower().endswith(".pdf")
    
    response = None
    if is_pdf_link:
        # Directly attempt download
        local_path = download_pdf(current_url)
        if local_path:
            extracted_data_log.append({
                'url': current_url,
                'type': 'pdf',
                'status': 'downloaded',
                'local_path': local_path
            })
        else:
            extracted_data_log.append({
                'url': current_url,
                'type': 'pdf',
                'status': 'download_failed'
            })
    else:
        # Fetch HTML page
        response = fetch_url_content(current_url) # stream=False by default
        if response:
            content_type = response.headers.get('content-type', '').lower()
            if 'text/html' in content_type:
                html_content = response.text
                soup = BeautifulSoup(html_content, 'html.parser')

                title_tag = soup.find('title')
                title = title_tag.string.strip() if title_tag else 'No Title Found'
                
                # Simplified text extraction for this example
                body_text = soup.body.get_text(separator=' ', strip=True) if soup.body else ""

                print(f"--- Scraped HTML: {current_url} ---")
                print(f"Title: {title}")

                extracted_data_log.append({
                    'url': current_url,
                    'title': title,
                    'type': 'html',
                    'status': 'scraped',
                    # 'text_content': body_text # You might store a snippet or save full HTML to a file
                })

                # Find and add new relevant links (PDFs or other /about pages) to the queue
                # This part needs to be robust and respect the scope of your permission
                if current_url.startswith(ABOUT_BASE_URL) or urlparse(current_url).path.startswith("/about/"):
                    for a_tag in soup.find_all('a', href=True):
                        link = a_tag['href']
                        abs_link = urljoin(current_url, link)
                        parsed_link = urlparse(abs_link)
                        
                        # Filter to stay within /about, avoid already processed, and check if it's a resource you want
                        if parsed_link.netloc == urlparse(ABOUT_BASE_URL).netloc and \
                           parsed_link.path.startswith(urlparse(ABOUT_BASE_URL).path) and \
                           abs_link not in visited_urls and \
                           abs_link not in urls_to_process_queue:
                            
                            if abs_link.lower().endswith(".pdf") or "text/html" in requests.head(abs_link, headers=HEADERS, timeout=5).headers.get('content-type','').lower(): # Basic check
                                print(f"Adding to queue: {abs_link}")
                                urls_to_process_queue.append(abs_link)
                                time.sleep(0.1) # Small delay when adding many links
            
            elif 'application/pdf' in content_type: # If content-type reveals it's a PDF after fetching
                print(f"--- PDF Found (by content-type): {current_url} ---")
                local_path = download_pdf(current_url) # Re-fetch with stream for download or handle response object better
                if local_path:
                     extracted_data_log.append({
                        'url': current_url,
                        'type': 'pdf',
                        'status': 'downloaded',
                        'local_path': local_path
                    })
                else:
                    extracted_data_log.append({
                        'url': current_url,
                        'type': 'pdf',
                        'status': 'download_failed'
                    })
            else:
                print(f"Skipping non-HTML/PDF content type: {content_type} for URL: {current_url}")
                extracted_data_log.append({
                    'url': current_url,
                    'type': 'unknown',
                    'status': 'skipped_content_type'
                })

# --- Save the log ---
# import json
# with open('verizon_about_crawl_log.json', 'w', encoding='utf-8') as f:
#     json.dump(extracted_data_log, f, ensure_ascii=False, indent=4)

print("Crawling and downloading (if PDF) process complete.")
print(f"Log of actions saved in 'extracted_data_log' list (consider saving to file).")
print(f"PDFs downloaded to '{PDF_DOWNLOAD_DIR}' directory.")

Fetching: https://www.verizon.com/about/sitemap.xml
Could not fetch URLs from sitemap, consider checking manually or trying the base /about/ page: https://www.verizon.com/about/
Crawling and downloading (if PDF) process complete.
Log of actions saved in 'extracted_data_log' list (consider saving to file).
PDFs downloaded to 'downloaded_verizon_about_pdfs' directory.


## Testing Beautiful Soup

In [15]:
from bs4 import BeautifulSoup
import requests

In [16]:
url = "https://www.verizon.com/about/our-company"
page = requests.get(url)

In [17]:
soup = BeautifulSoup(page.content, 'html.parser')

In [18]:
print(soup.prettify())  # Print the HTML content in a readable format

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta content="width=device-width" name="viewport"/>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <script>
   const SYNC_GRP = '3e1db8d7e927e65ebdea683143ab1db4511e86f8dfad7916b7808217b30683af';		const URL_PARMS = (window.UDNS_PARMS = new URLSearchParams(          window.location.search,        ));		const THROTTLE_URL_PARM = URL_PARMS.get('UDNS_THROTTLE');		const TC_CONSENT_STRING = localStorage.getItem('tcmMPConsent');	    const TC_CONSENT_OBJECT = TC_CONSENT_STRING		  ? JSON.parse(TC_CONSENT_STRING)		  : undefined;		console.log('Throttle Parm:'+THROTTLE_URL_PARM);		if (THROTTLE_URL_PARM) {	      window.UDNS_THROTTLE = THROTTLE_URL_PARM === 'true' ? 'true' : 'false';		} else {			window.UDNS_THROTTLE = (TC_CONSENT_OBJECT?.[SYNC_GRP]?.metadata?.UDNS_THROTTLE === 'true'? 'true' : 'false');		}		console.log('Throttle:'+window.UDNS_THROTTLE);		const useRegulation = (window.UDNS_THROTTLE === 'true');		self.air

## Testing `Selenium` for web crawling/scraping

In [3]:
# --- Helper Function to Fetch Content ---
def fetch_url_content(url):
    """Fetches content from a URL with error handling."""
    print(f"Fetching: {url}")
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()  # Raises an HTTPError for bad responses (4XX or 5XX)
        return response
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
    finally:
        time.sleep(REQUEST_DELAY) # Respectful delay after every request

# --- 1a. Parse Sitemap ---
def get_urls_from_sitemap(sitemap_url):
    """Parses an XML sitemap and extracts URLs."""
    urls = []
    response = fetch_url_content(sitemap_url)
    if response and response.content:
        try:
            root = ET.fromstring(response.content)
            # XML namespace, often present in sitemaps
            namespace = {'sitemap': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
            for url_element in root.findall('sitemap:url', namespace):
                loc_element = url_element.find('sitemap:loc', namespace)
                if loc_element is not None and loc_element.text:
                    urls.append(loc_element.text)
        except ET.ParseError as e:
            print(f"Error parsing sitemap XML: {e}")
    return urls

In [19]:
EDGE_DRIVER_PATH = "E:/Programms(installed)/EdgeWebDriver/msedgedriver.exe"


In [4]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
import time

In [21]:
# Set up the Edge driver
service = Service(EDGE_DRIVER_PATH)
driver = webdriver.Edge(service=service)

In [22]:
try:
    # Open the target page
    driver.get("https://www.verizon.com/about/")
    time.sleep(3)  # Wait for the page to load (increase if needed)

    # Get page source
    html = driver.page_source
    print(html[:1000])  # Print first 1000 characters

    # Example: Find all links on the page
    links = driver.find_elements(By.TAG_NAME, "a")
    for link in links[:10]:  # Print first 10 links
        print(link.get_attribute("href"))

finally:
    driver.quit()

<html lang="en"><head><meta http-equiv="X-UA-Compatible" content="IE=Edge"><meta name="viewport" content="width=device-width"><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><script async="" src="/about/js/externaljs/surveys.js"></script><script type="text/javascript" async="" src="https://www.google-analytics.com/analytics.js"></script><script type="text/javascript" async="" src="https://www.googletagmanager.com/gtag/js?id=UA-126391587-1&amp;cx=c&amp;gtm=45je56u0v9136822931za200&amp;tag_exp=101509157~103116026~103200004~103233427~103308216~103308218~103351869~103351871~104684208~104684211~104718208~104784387~104784389~104839054~104839056~104885889~104885891~104908318~104908320"></script><script>const SYNC_GRP = '3e1db8d7e927e65ebdea683143ab1db4511e86f8dfad7916b7808217b30683af';		const URL_PARMS = (window.UDNS_PARMS = new URLSearchParams(          window.location.search,        ));		const THROTTLE_URL_PARM = URL_PARMS.get('UDNS_THROTTLE');		const TC_CONSENT_S

In [4]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin, urlparse
import xml.etree.ElementTree as ET
import requests # For sitemap initially


import requests
import time
import os
import re

In [24]:
# --- Configuration ---
SITEMAP_URL = "https://www.verizon.com/about/sitemap.xml"
BASE_DOMAIN = "www.verizon.com" # To help filter links
ABOUT_PATH_PREFIX = "/about/"
HEADERS = { # For requests
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 (DissertationResearchBot/0.1)'
}

EDGE_DRIVER_PATH = "E:/Programms(installed)/EdgeWebDriver/msedgedriver.exe"
REQUEST_DELAY = 3  # Seconds between page loads/requests
SELENIUM_TIMEOUT = 20 # Seconds to wait for elements



In [25]:

# Set up the Edge driver
service = Service(EDGE_DRIVER_PATH)
driver = webdriver.Edge(service=service)

Saving the `verizon.com/robots.txt` to refer to when doing web crawling

In [26]:
import requests
import os

robots_url = "https://www.verizon.com/robots.txt"
response = requests.get(robots_url)
current_file_root = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()
print("Root of the current file:", current_file_root)

robots_download_dir = os.path.join(current_file_root, "Website Downloads", "Verizon")
# Create directory if it doesn't exist
# os.makedirs(robots_download_dir, exist_ok=True)
# Save the robots.txt content

robots_txt_path = os.path.join(robots_download_dir, "verizon_robots.txt")
if response.status_code == 200:
    with open(robots_txt_path, "w", encoding="utf-8") as f:
        f.write(response.text)
    print("robots.txt saved as verizon_robots.txt")
else:
    print(f"Failed to fetch robots.txt (status code: {response.status_code})")

Root of the current file: i:\My Drive\M. Tech AI ML\AIML SEM 4\Dissertation\Project
robots.txt saved as verizon_robots.txt


Step 1: Fetch URLs from Sitemap (as previously discussed)

In [27]:
# def fetch_url_content_requests(url):
#     print(f"Fetching (requests): {url}")
#     try:
#         response = requests.get(url, headers=HEADERS, timeout=10)
#         response.raise_for_status()
#         return response
#     except requests.exceptions.RequestException as e:
#         print(f"Error fetching {url} with requests: {e}")
#         return None
#     finally:
#         time.sleep(REQUEST_DELAY)

# def get_urls_from_sitemap(sitemap_url):
#     urls = []
#     response = fetch_url_content_requests(sitemap_url)
#     if response and response.content:
#         try:
#             root = ET.fromstring(response.content)
#             namespace = {'sitemap': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
#             for url_element in root.findall('sitemap:url', namespace):
#                 loc_element = url_element.find('sitemap:loc', namespace)
#                 if loc_element is not None and loc_element.text:
#                     urls.append(loc_element.text)
#         except ET.ParseError as e:
#             print(f"Error parsing sitemap XML: {e}")
#     return urls



In [28]:

target_urls = get_urls_from_sitemap(SITEMAP_URL)
if not target_urls:
    print("No URLs fetched from sitemap. Exiting or add alternative URL sources.")
    # exit() # Or handle differently
else:
    print(f"Found {len(target_urls)} URLs from sitemap.")

Fetching: https://www.verizon.com/about/sitemap.xml
No URLs fetched from sitemap. Exiting or add alternative URL sources.


Step 2: Setup Selenium WebDriver

In [None]:
def setup_driver():
    service = Service(executable_path=EDGE_DRIVER_PATH)
    options = webdriver.ChromeOptions()
    # options.add_argument('--headless')  # Run in headless mode (no browser UI) - recommended for scraping
    # options.add_argument('--disable-gpu')
    options.add_argument(f'user-agent={HEADERS["User-Agent"]}')
    driver = webdriver.Edge(service=service, options=options)
    return driver


Step 3: Fetch and Parse Individual Pages

In [30]:
def fetch_page_with_selenium(driver, url):
    print(f"Fetching (Selenium): {url}")
    try:
        driver.get(url)
        # Wait for a specific element that indicates page load, or a general delay
        # Example: Wait for the body tag to be present
        WebDriverWait(driver, SELENIUM_TIMEOUT).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        time.sleep(REQUEST_DELAY) # Additional delay for dynamic content to settle
        return driver.page_source
    except Exception as e:
        print(f"Error fetching {url} with Selenium: {e}")
        return None

def parse_content_with_beautifulsoup(html_content, url):
    """
    Parses HTML content to extract title, main text, and relevant links.
    This function will require SIGNIFICANT CUSTOMIZATION based on verizon.com/about structure.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    page_data = {'url': url, 'title': '', 'main_text': '', 'pdf_links': [], 'related_about_links': []}

    # --- Extract Title ---
    title_tag = soup.find('title')
    page_data['title'] = title_tag.string.strip() if title_tag else 'No Title Found'

    # --- Extract Main Text (HIGHLY SITE-SPECIFIC) ---
    # You MUST inspect verizon.com/about pages to find reliable containers for main content.
    # Examples of what you might look for:
    # main_content_area = soup.find('main')
    # main_content_area = soup.find('article')
    # main_content_area = soup.find('div', id='content')
    # main_content_area = soup.find('div', class_='main-article-body') # Example class

    # For this example, let's try a few common tags.
    # Prioritize more specific tags/classes/ids if you find them.
    main_text_parts = []
    potential_containers = []
    if soup.find('article'):
        potential_containers.extend(soup.find_all('article'))
    elif soup.find('main'):
        potential_containers.extend(soup.find_all('main'))
    else:
        # A very generic fallback - likely to be noisy
        potential_containers.extend(soup.find_all('div')) # You'd need to filter this heavily by class/id

    for container in potential_containers:
        # Add checks here to ensure it's likely a main content block
        # e.g., if 'nav' in container.get('class', []) or container.find('nav'): continue
        text = container.get_text(separator=' ', strip=True)
        if len(text) > 200: # Arbitrary threshold to guess it's a main content block
            main_text_parts.append(text)
    
    page_data['main_text'] = "\n".join(main_text_parts) if main_text_parts else "Main text not reliably extracted."


    # --- Extract Links (PDFs and related /about pages) ---
    for a_tag in soup.find_all('a', href=True):
        link_href = a_tag['href']
        abs_link = urljoin(url, link_href) # Resolve relative URLs
        parsed_link = urlparse(abs_link)

        # Check if it's a PDF link from verizon.com
        if parsed_link.netloc == BASE_DOMAIN and link_href.lower().endswith('.pdf'):
            if abs_link not in page_data['pdf_links']:
                page_data['pdf_links'].append(abs_link)
        
        # Check if it's a related /about page (within verizon.com/about/)
        # and not an external link or different section (unless permission allows)
        elif parsed_link.netloc == BASE_DOMAIN and parsed_link.path.startswith(ABOUT_PATH_PREFIX):
            if abs_link not in page_data['related_about_links']:
                 page_data['related_about_links'].append(abs_link)
                 
    return page_data

# --- Main Processing Loop ---
# driver = setup_driver() # Initialize the driver once
all_extracted_content = []
# visited_pages = set() # To manage crawled pages if you implement recursive crawling

# for url_to_scrape in target_urls: # target_urls from sitemap
#     if url_to_scrape in visited_pages:
#         continue
    
#     print(f"\nProcessing: {url_to_scrape}")
#     html = fetch_page_with_selenium(driver, url_to_scrape)
#     visited_pages.add(url_to_scrape)

#     if html:
#         data = parse_content_with_beautifulsoup(html, url_to_scrape)
#         all_extracted_content.append(data)
#         print(f"  Title: {data['title']}")
#         print(f"  Main Text Snippet: {data['main_text'][:200]}...")
#         print(f"  Found {len(data['pdf_links'])} PDF links.")
#         print(f"  Found {len(data['related_about_links'])} related /about links.")
        
        # If you were to crawl recursively (respecting scope and permission):
        # for new_link in data['related_about_links']:
        #     if new_link not in visited_pages and new_link not in urls_to_add_to_queue:
        #         # add new_link to your crawling queue
        #         pass

# driver.quit() # Close the browser when done

# --- Example: Save extracted content log ---
# import json
# with open('verizon_about_scraped_content.json', 'w', encoding='utf-8') as f:
#    json.dump(all_extracted_content, f, indent=4, ensure_ascii=False)
# print(f"\nScraped content overview saved to verizon_about_scraped_content.json")

### New Web-Page fetching

In [58]:


# --- Configuration (mostly same as before) ---
SITEMAP_INDEX_URL = "https://www.verizon.com/about/sitemap.xml" # This is now a sitemap index
BASE_DOMAIN = "www.verizon.com"
ABOUT_PATH_PREFIX = "/about/"
HEADERS = { # For requests
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 (DissertationResearchBot/0.1)'
}
REQUEST_DELAY = 2  # Seconds between requests
PDF_DOWNLOAD_DIR = "downloaded_verizon_about_pdfs"

if not os.path.exists(PDF_DOWNLOAD_DIR):
    os.makedirs(PDF_DOWNLOAD_DIR)


In [None]:

# --- Helper Function to Fetch Content (same as before) ---
def fetch_url_content_requests(url, stream=False):
    """Fetches content from a URL with error handling."""
    print(f"Fetching: {url}")
    try:
        response = requests.get(url, headers=HEADERS, timeout=15, stream=stream)
        response.raise_for_status()
        return response
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url} with requests: {e}")
        return None
    finally:
        if not stream:
            time.sleep(REQUEST_DELAY)

# --- Updated Sitemap Parsing Logic ---
def get_all_page_urls_from_sitemap_index(sitemap_index_url):
    """
    Parses a sitemap index file, then parses all nested sitemaps
    to return a comprehensive list of page URLs.
    """
    all_page_urls = set() # Use a set to avoid duplicates
    
    print(f"Fetching sitemap index: {sitemap_index_url}")
    index_response = fetch_url_content_requests(sitemap_index_url)

    if not (index_response and index_response.content):
        print(f"Failed to fetch or empty content for sitemap index: {sitemap_index_url}")
        return list(all_page_urls)

    try:
        index_root = ET.fromstring(index_response.content)
        # Namespace is important for sitemap parsing
        namespace = {'sitemap': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

        # Check if it's a sitemap index (contains <sitemap> tags)
        # or a direct urlset (contains <url> tags)
        if index_root.tag == f"{{{namespace['sitemap']}}}sitemapindex":
            print(f"Parsing sitemap index: {sitemap_index_url}")
            nested_sitemap_locs = [
                s_loc.text
                for sitemap_tag in index_root.findall('sitemap:sitemap', namespace)
                if (s_loc := sitemap_tag.find('sitemap:loc', namespace)) is not None and s_loc.text
            ]

            for nested_sitemap_url in nested_sitemap_locs:
                print(f"  Fetching nested sitemap: {nested_sitemap_url}")
                sitemap_response = fetch_url_content_requests(nested_sitemap_url)
                if sitemap_response and sitemap_response.content:
                    try:
                        sitemap_root = ET.fromstring(sitemap_response.content)
                        # Nested sitemaps should be <urlset>
                        if sitemap_root.tag == f"{{{namespace['sitemap']}}}urlset":
                            for url_element in sitemap_root.findall('sitemap:url', namespace):
                                page_loc_element = url_element.find('sitemap:loc', namespace)
                                if page_loc_element is not None and page_loc_element.text:
                                    all_page_urls.add(page_loc_element.text)
                        else:
                            print(f"    Warning: Nested sitemap {nested_sitemap_url} is not a <urlset>. Root tag: {sitemap_root.tag}")
                    except ET.ParseError as e:
                        print(f"    Error parsing nested sitemap XML {nested_sitemap_url}: {e}")
        
        elif index_root.tag == f"{{{namespace['sitemap']}}}urlset": # If the initial URL was already a page sitemap
            print(f"Parsing as a direct page sitemap: {sitemap_index_url}")
            for url_element in index_root.findall('sitemap:url', namespace):
                page_loc_element = url_element.find('sitemap:loc', namespace)
                if page_loc_element is not None and page_loc_element.text:
                    all_page_urls.add(page_loc_element.text)
        else:
            print(f"Unknown root tag in sitemap file {sitemap_index_url}: {index_root.tag}")

    except ET.ParseError as e:
        print(f"Error parsing sitemap index XML {sitemap_index_url}: {e}")
            
    return list(all_page_urls)

# --- Get all URLs from the /about sitemap index ---
target_page_urls = get_all_page_urls_from_sitemap_index(SITEMAP_INDEX_URL)

if target_page_urls:
    print(f"\nSuccessfully extracted {len(target_page_urls)} unique page URLs from all nested sitemaps.")

else:
    print("No page URLs extracted. Please check the sitemap structure or network access.")



Fetching sitemap index: https://www.verizon.com/about/sitemap.xml
Fetching: https://www.verizon.com/about/sitemap.xml
Parsing sitemap index: https://www.verizon.com/about/sitemap.xml
  Fetching nested sitemap: https://www.verizon.com/about/sitemap-our-company.xml
Fetching: https://www.verizon.com/about/sitemap-our-company.xml
  Fetching nested sitemap: https://www.verizon.com/about/sitemap-responsibility.xml
Fetching: https://www.verizon.com/about/sitemap-responsibility.xml
  Fetching nested sitemap: https://www.verizon.com/about/sitemap-investors.xml
Fetching: https://www.verizon.com/about/sitemap-investors.xml
  Fetching nested sitemap: https://www.verizon.com/about/sitemap-careers.xml
Fetching: https://www.verizon.com/about/sitemap-careers.xml
  Fetching nested sitemap: https://www.verizon.com/about/sitemap-parenting.xml
Fetching: https://www.verizon.com/about/sitemap-parenting.xml
  Fetching nested sitemap: https://www.verizon.com/about/sitemap-news.xml
Fetching: https://www.verizo

In [59]:
len(target_page_urls)

43153

In [64]:
from urllib.parse import urlparse
from collections import Counter

# Extract the first segment after '/about/' from each URL
about_sections = []
for u in target_page_urls:
    parsed = urlparse(u)
    path = parsed.path
    if path.startswith('/about/'):
        rest = path[len('/about/'):]
        section = rest.split('/', 1)[0] if '/' in rest else rest
        if section:
            about_sections.append(section)

# Count occurrences of each section
section_counts = Counter(about_sections)

# Display all section counts
for section, count in section_counts.items():
    print(f"{section}: {count}")  # Top 20 sections


news: 41646
responsibility: 72
es: 3
parenting: 246
news-tag: 88
investors: 392
2017-mobile-world-congress-americas: 1
our-company: 304
newcomponentstest07920: 1
board-directors-new: 1
terms-conditions: 81
community: 1
fixed-income-new: 1
accessibility: 37
careers: 43
speed-live-stream-cloned: 1
component-two-column-examples: 1
race-social-justice-action-toolkit-old: 1
international: 41
verizon-work-tech-group: 1
privacy: 13
2021-huddle-audience: 1
stackedmediactacomponent07920: 1
test-page: 1
holiday: 2
speed-video-old: 1
about: 4
component-full-width-video-examples: 1
test-table-page-for-color-updates: 1
component-media-cta-2-all-right-aligned-black-background: 1
california-outage-map: 1
test-body-center-prod-test: 1
women-tech: 1
verizon-robotics: 1
components-news-pidw: 1
internet-service: 3
consumer-safety: 12
verizon-innovative-learning-north-carolina-tinkercad-stem: 1
adfellows: 1
component-smallimagecarouselcomponent07920: 1
frances-moffett: 1
call-for-kindness: 1
section-artic

news: 41646
responsibility: 72
es: 3
parenting: 246
news-tag: 88
investors: 392
2017-mobile-world-congress-americas: 1
our-company: 304
newcomponentstest07920: 1
board-directors-new: 1
terms-conditions: 81
community: 1
fixed-income-new: 1
accessibility: 37
careers: 43
speed-live-stream-cloned: 1
component-two-column-examples: 1
race-social-justice-action-toolkit-old: 1
international: 41
verizon-work-tech-group: 1
privacy: 13
2021-huddle-audience: 1
stackedmediactacomponent07920: 1
test-page: 1
holiday: 2
speed-video-old: 1
about: 4
component-full-width-video-examples: 1
test-table-page-for-color-updates: 1
component-media-cta-2-all-right-aligned-black-background: 1
california-outage-map: 1
test-body-center-prod-test: 1
women-tech: 1
verizon-robotics: 1
components-news-pidw: 1
internet-service: 3
consumer-safety: 12
verizon-innovative-learning-north-carolina-tinkercad-stem: 1
adfellows: 1
component-smallimagecarouselcomponent07920: 1
frances-moffett: 1
call-for-kindness: 1
section-artic

In [62]:
# Filter out sections with count == 1 and sort by count descending
filtered_sections = [(section, count) for section, count in section_counts.items() if count > 1]
filtered_sections_sorted = sorted(filtered_sections, key=lambda x: x[1], reverse=True)

print(filtered_sections_sorted)

[('news', 41646), ('investors', 392), ('our-company', 304), ('parenting', 246), ('news-tag', 88), ('terms-conditions', 81), ('responsibility', 72), ('careers', 43), ('international', 41), ('accessibility', 37), ('privacy', 13), ('consumer-safety', 12), ('about', 4), ('section-article-tag', 4), ('es', 3), ('internet-service', 3), ('International', 3), ('holiday', 2), ('holiday-quiz', 2)]


In [63]:
filtered_sections_sorted

[('news', 41646),
 ('investors', 392),
 ('our-company', 304),
 ('parenting', 246),
 ('news-tag', 88),
 ('terms-conditions', 81),
 ('responsibility', 72),
 ('careers', 43),
 ('international', 41),
 ('accessibility', 37),
 ('privacy', 13),
 ('consumer-safety', 12),
 ('about', 4),
 ('section-article-tag', 4),
 ('es', 3),
 ('internet-service', 3),
 ('International', 3),
 ('holiday', 2),
 ('holiday-quiz', 2)]

In [65]:
# Define the sections we want to keep
sections_to_keep = [
    'investors', 'our-company',  'terms-conditions',
    'responsibility', 'privacy', 'about', 'parenting', 'consumer-safety', 'international'
] # , 'international'


MAX_PAGES_PER_SECTION = 100
section_counts = {section: 0 for section in sections_to_keep}
filtered_target_page_urls = []

for u in target_page_urls:
    parsed = urlparse(u)
    path = parsed.path
    if path.startswith('/about/'):
        rest = path[len('/about/'):]
        section = rest.split('/', 1)[0] if '/' in rest else rest
        if section in sections_to_keep and section_counts[section] < MAX_PAGES_PER_SECTION:
            filtered_target_page_urls.append(u)
            section_counts[section] += 1

print(f"\nFiltered URLs to {len(filtered_target_page_urls)} pages across specified sections.")


Filtered URLs to 523 pages across specified sections.


In [66]:
# Extract the first segment after '/about/' from each URL
filtered_target_page_sections = []
for u in filtered_target_page_urls:
    parsed = urlparse(u)
    path = parsed.path
    if path.startswith('/about/'):
        rest = path[len('/about/'):]
        filtered_section = rest.split('/', 1)[0] if '/' in rest else rest
        if filtered_section:
            filtered_target_page_sections.append(filtered_section)





filtered_target_page_urls_counter = Counter(filtered_target_page_sections)
for section, count in filtered_target_page_urls_counter.items():
    print(f"{section}: {count}")  

responsibility: 72
parenting: 100
investors: 100
our-company: 100
terms-conditions: 81
international: 41
privacy: 13
about: 4
consumer-safety: 12


In [67]:
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Example in fetch_url_content_requests
def fetch_url_content_requests(url, stream=False):
    logger.info(f"Fetching: {url}")
    try:
        response = requests.get(url, headers=HEADERS, timeout=15, stream=stream)
        response.raise_for_status()
        return response
    except requests.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None
    finally:
        if not stream:
            time.sleep(REQUEST_DELAY)

In [68]:
import trafilatura
import requests

# Optionally, set trafilatura to output markdown
# (requires trafilatura >=1.5.0)
TRAFILATURA_MARKDOWN = True  # Set to False for plain text

if not filtered_target_page_urls:
    print("No URLs to process in filtered_target_page_urls_limited.")
else:
    trafilatura_results = []
    for url in filtered_target_page_urls:
        try:
            # response = requests.get(url, timeout=15) 
            response = fetch_url_content_requests(url, stream=False)  # Use our custom function
            if response.status_code == 200:
                downloaded = response.text
                extracted = trafilatura.extract(
                    downloaded,
                    include_comments=False,
                    include_tables=True,
                    output_format="markdown" if TRAFILATURA_MARKDOWN else "txt"
                )
                trafilatura_results.append({
                    'url': url,
                    'content': extracted
                })
            else:
                trafilatura_results.append({
                    'url': url,
                    'content': None
                })
        except Exception as e:
            print(f"Error processing {url}: {e}")
            trafilatura_results.append({
                'url': url,
                'content': None
            })

    print(f"Extracted content for {len(trafilatura_results)} URLs using requests + trafilatura.")

    # Example: print a snippet of the first result
    if trafilatura_results and trafilatura_results[0]['content']:
        print(trafilatura_results[0]['content'][:1000])

INFO:__main__:Fetching: https://www.verizon.com/about/responsibility


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/how-gen-z-spent-year-online-and-what-it-means-now
INFO:__main__:Fetching: https://www.verizon.com/about/investors/sellside-analyst-meeting
INFO:__main__:Fetching: https://www.verizon.com/about/investors/corporate-governance/finance-committee
INFO:__main__:Fetching: https://www.verizon.com/about/investors/citi-21st-annual-global-entertainment-media-telecommunications-conference
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/grant-requirements
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/screen-time-kids-how-create-stress-free-routine
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/company-policies/content-policies
INFO:__main__:Fetching: https://www.verizon.com/about/investors/morgan-stanley-virtual-european-technology-media-telecom-conference-2020
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/digital-bullying-and-exclusion-how-keep-our-kids-emotio

Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/education-revolution: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/investors/mark-t-bertolini-old: 404 Client Error: Not Found for url: https://www.verizon.com/about/investors/mark-t-bertolini-old
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/overview


Error processing https://www.verizon.com/about/investors/mark-t-bertolini-old: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/teach-black-history-3-immersive-experiences
INFO:__main__:Fetching: https://www.verizon.com/about/investors/oppenheimer-18th-annual-technology-internet-communications-conference
INFO:__main__:Fetching: https://www.verizon.com/about/investors/asset-backed-securitization
INFO:__main__:Fetching: https://www.verizon.com/about/investors/verizon-wireless-offers-simple-affordable-convenience-new-unlimited-voice-plans
ERROR:__main__:Error fetching https://www.verizon.com/about/investors/verizon-wireless-offers-simple-affordable-convenience-new-unlimited-voice-plans: 404 Client Error: Not Found for url: https://www.verizon.com/about/investors/verizon-wireless-offers-simple-affordable-convenience-new-unlimited-voice-plans
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/teen-driving-app


Error processing https://www.verizon.com/about/investors/verizon-wireless-offers-simple-affordable-convenience-new-unlimited-voice-plans: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/screens-teens-and-sleep-how-outsmart-nightly-routine
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/user-guides
INFO:__main__:Fetching: https://www.verizon.com/about/investors/independent-accountants-review-letter
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/network-management
INFO:__main__:Fetching: https://www.verizon.com/about/investors/morgan-stanley-technology-media-telecom-conference-2022
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/my-kid-gaming-too-much
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/best-2023-holiday-tech-gift-ideas-and-deals
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/who-is-generation-alpha
INFO:__main__:Fetching: https://www.verizon.com/about/investors/corporate-governance/alignment-with-isg-principles
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/

Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/caution-only-proceed-artificial-intelligence: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/investors/quarterly-reports/2q-2019-quarter-earnings-conference-call-webcast
INFO:__main__:Fetching: https://www.verizon.com/about/investors/sellside-analyst-meeting-nov-19
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/so-your-kid-hates-video-chats-now-what
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/video-content-what-your-teen-watches
INFO:__main__:Fetching: https://www.verizon.com/about/investors/moffettnathanson-media-and-communications-summit-2016
INFO:__main__:Fetching: https://www.verizon.com/about/investors/unsubscribe-email-alerts
INFO:__main__:Fetching: https://www.verizon.com/about/investors/goldman-sachs-27th-annual-communacopia-conference
INFO:__main__:Fetching: https://www.verizon.com/about/investors/38th-annual-jp-morgan-global-technology-media-and-telcom-conference
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/company-policies/besafe-values-lifesaving-principl

Error processing https://www.verizon.com/about/our-company/executive-bios/tony-skiadas-flexpage: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/our-company/supplier-diversity-faqs
INFO:__main__:Fetching: https://www.verizon.com/about/investors/quarterly-reports/4q-2022-earnings-conference-call-webcast
INFO:__main__:Fetching: https://www.verizon.com/about/privacy/binding-corporate-rules
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/retrain-social-media-algorithm
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/john-g-stratton
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/executive-bios/john-g-stratton: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/executive-bios/john-g-stratton
INFO:__main__:Fetching: https://www.verizon.com/about/investors/2024-annual-meeting-voting-results


Error processing https://www.verizon.com/about/our-company/executive-bios/john-g-stratton: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/our-company/company-policies/global-tax-policy
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/frauds-scams/collect-calling-scam
INFO:__main__:Fetching: https://www.verizon.com/about/investors/wells-fargo-virtual-media-telco-day
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/5g/understanding-5g-spectrum
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/5g/understanding-5g-spectrum: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/5g/understanding-5g-spectrum
INFO:__main__:Fetching: https://www.verizon.com/about/investors/bofa-securities-2023-media-communications-entertainment-conference


Error processing https://www.verizon.com/about/our-company/5g/understanding-5g-spectrum: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/our-company/5g/why-5g-crucial-smart-city-tomorrow
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/5g/why-5g-crucial-smart-city-tomorrow: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/5g/why-5g-crucial-smart-city-tomorrow
INFO:__main__:Fetching: https://www.verizon.com/about/investors/kathryn-tesija


Error processing https://www.verizon.com/about/our-company/5g/why-5g-crucial-smart-city-tomorrow: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/investors/kathryn-tesija: 404 Client Error: Not Found for url: https://www.verizon.com/about/investors/kathryn-tesija
INFO:__main__:Fetching: https://www.verizon.com/about/investors/ubs-global-tmt-conference-2019


Error processing https://www.verizon.com/about/investors/kathryn-tesija: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/joe-russo-flexpage
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/executive-bios/joe-russo-flexpage: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/executive-bios/joe-russo-flexpage
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/smart-locator


Error processing https://www.verizon.com/about/our-company/executive-bios/joe-russo-flexpage: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/investors/bofa-securities-2020-media-communications-entertainment-conference
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/accessibility
INFO:__main__:Fetching: https://www.verizon.com/about/investors/ubs-global-media-and-communications-conference-2024
INFO:__main__:Fetching: https://www.verizon.com/about/investors/deutsche-bank-media-internet-telecom-conference-2024
INFO:__main__:Fetching: https://www.verizon.com/about/investors/financial-reporting
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/parents-guide-talking-about-snapchat-keep-streak-alive
ERROR:__main__:Error fetching https://www.verizon.com/about/parenting/parents-guide-talking-about-snapchat-keep-streak-alive: 404 Client Error: Not Found for url: https://www.verizon.com/about/parenting/parents-guide-talking-about-snapchat-keep-streak-alive
INFO:__main__:Fetching: https://www.verizon.com/about/investors/jp-morgan-global-technology-

Error processing https://www.verizon.com/about/parenting/parents-guide-talking-about-snapchat-keep-streak-alive: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/privacy/BCRparticipants
INFO:__main__:Fetching: https://www.verizon.com/about/privacy/verizon-internal-systems-information-security-exhibit
INFO:__main__:Fetching: https://www.verizon.com/about/about/our-company/enterprise-security
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/your-new-home-may-be-printed
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/your-new-home-may-be-printed: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/your-new-home-may-be-printed
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/verizon-bundle


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/your-new-home-may-be-printed: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/frauds-scams/overseas-money-transfer-scam
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/online-learning
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/bundle-worry-free-guarantee
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/product-responsibility-new
ERROR:__main__:Error fetching https://www.verizon.com/about/responsibility/product-responsibility-new: 404 Client Error: Not Found for url: https://www.verizon.com/about/responsibility/product-responsibility-new
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/international-policy-pt


Error processing https://www.verizon.com/about/responsibility/product-responsibility-new: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/what-age-should-child-get-cell-phone
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/talk-world-international-plan-business-digital-voice
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/fios-digital-voice
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/what-we-do/fiber
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/what-we-do/fiber: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/what-we-do/fiber
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/description-of-processing-personal-data-verizon-connect


Error processing https://www.verizon.com/about/our-company/what-we-do/fiber: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/our-company/annual-enrollment
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/annual-enrollment: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/annual-enrollment
INFO:__main__:Fetching: https://www.verizon.com/about/investors/ubs-global-tmt-virtual-conference-2021


Error processing https://www.verizon.com/about/our-company/annual-enrollment: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/investors/analyst-meeting-webcast-including-post-broadcast-incentive-auction-spectrum-discussion
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/blockchain-may-be-great-news-news-business
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/blockchain-may-be-great-news-news-business: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/blockchain-may-be-great-news-news-business
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/required-paper-free-billing-terms-service


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/blockchain-may-be-great-news-news-business: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/investors/quarterly-reports/4q-2013-quarter-earnings-conference-call-webcast
INFO:__main__:Fetching: https://www.verizon.com/about/investors/sellside-analyst-meeting-june-18
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/hans-vestberg
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/your-iron-man-suit-ready-you-now
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/your-iron-man-suit-ready-you-now: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/your-iron-man-suit-ready-you-now
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/sustainability


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/your-iron-man-suit-ready-you-now: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/spot-doomscrolling-guide-teens-balance
INFO:__main__:Fetching: https://www.verizon.com/about/investors/morgan-stanley-technology-media-telecom-conference-2018
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/youve-never-seen-history-like-this
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/youve-never-seen-history-like-this: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/youve-never-seen-history-like-this
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/young-children


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/youve-never-seen-history-like-this: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/learning-about-machine-learning
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/learning-about-machine-learning: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/learning-about-machine-learning
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/music-for-kids-regulation-balance


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/learning-about-machine-learning: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/test-article-prod-story
ERROR:__main__:Error fetching https://www.verizon.com/about/parenting/test-article-prod-story: 404 Client Error: Not Found for url: https://www.verizon.com/about/parenting/test-article-prod-story
INFO:__main__:Fetching: https://www.verizon.com/about/investors/morgan-stanley-technology-media-telecom-conference-1


Error processing https://www.verizon.com/about/parenting/test-article-prod-story: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/standalone-service
INFO:__main__:Fetching: https://www.verizon.com/about/investors/jp-morgan-global-technology-media-and-communications-conference-2025
INFO:__main__:Fetching: https://www.verizon.com/about/investors/goldman-sachs-communacopia-technology-conference-2024
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/supplier-diversity
INFO:__main__:Fetching: https://www.verizon.com/about/investors/barclays-global-technology-media-and-telecommunications-conference-2020
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/gizmowatch-3-reasons-why-it-could-be-good-choice-your-child
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/about-fourth-industrial-revolution
INFO:__main__:Fetching: https://www.verizon.com/about/investors/quarterly-reports/2q-2018-quarter-earnings-conference-call-webcast
INFO:__main__:Fetching: https://www.verizon.com/about/investors/jefferies-2016-med

Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/fourth-industrial-revolution-built-5g: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/5-tips-kids-location-sharing-apps
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/premium-technical-support
ERROR:__main__:Error fetching https://www.verizon.com/about/terms-conditions/premium-technical-support: 404 Client Error: Not Found for url: https://www.verizon.com/about/terms-conditions/premium-technical-support
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/our-culture


Error processing https://www.verizon.com/about/terms-conditions/premium-technical-support: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/irl-raising-our-four-kids-digital-age
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/inside-wire-maintenance-business-wireline-customers-terms-service
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/5g/why-5g-important-discover-importance-5g-technology
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/5g/why-5g-important-discover-importance-5g-technology: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/5g/why-5g-important-discover-importance-5g-technology
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/acceptable-use-policy


Error processing https://www.verizon.com/about/our-company/5g/why-5g-important-discover-importance-5g-technology: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/value-added-services
ERROR:__main__:Error fetching https://www.verizon.com/about/terms-conditions/value-added-services: 404 Client Error: Not Found for url: https://www.verizon.com/about/terms-conditions/value-added-services
INFO:__main__:Fetching: https://www.verizon.com/about/investors/bank-america-merrill-lynch-2015-media-communications-entertainment-conference


Error processing https://www.verizon.com/about/terms-conditions/value-added-services: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/international-policy-fr-ca
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/wireless-network
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/verizon-ventures
INFO:__main__:Fetching: https://www.verizon.com/about/investors/board-directors
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/company-policies/responsible-marketing-policy
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/toxicity-online-gaming
INFO:__main__:Fetching: https://www.verizon.com/about/investors/quarterly-reports/1q-2008-quarter-earnings-conference-call-webcast
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/frauds-scams/identity-theft
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/buyer-agreement
INFO:__main__:Fetching: https://www.verizon.com/about/investors/quarterly-reports/1q-2007-quarter-earnings-conference-call-webcast
ERROR:

Error processing https://www.verizon.com/about/investors/quarterly-reports/1q-2007-quarter-earnings-conference-call-webcast: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/internet-safety-for-teens
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/employee-diversity-report
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/these-disruptors-are-pushing-back-against-techs-gender-disparity-gap
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/these-disruptors-are-pushing-back-against-techs-gender-disparity-gap: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/these-disruptors-are-pushing-back-against-techs-gender-disparity-gap
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/state-of-the-market-internet-of-things


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/these-disruptors-are-pushing-back-against-techs-gender-disparity-gap: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/state-of-the-market-internet-of-things: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/state-of-the-market-internet-of-things
INFO:__main__:Fetching: https://www.verizon.com/about/investors/credit-suisse-global-media-and-communications-convergence-conference


Error processing https://www.verizon.com/about/our-company/state-of-the-market-internet-of-things: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/investors/credit-suisse-global-media-and-communications-convergence-conference: 404 Client Error: Not Found for url: https://www.verizon.com/about/investors/credit-suisse-global-media-and-communications-convergence-conference
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/small-business-supplier


Error processing https://www.verizon.com/about/investors/credit-suisse-global-media-and-communications-convergence-conference: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/how-can-i-protect-my-kids-inappropriate-content
INFO:__main__:Fetching: https://www.verizon.com/about/privacy/data-processing-activities
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/marni-m-walden
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/marni-m-walden: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/marni-m-walden
INFO:__main__:Fetching: https://www.verizon.com/about/investors/morgan-stanley-technology-media-telecom-conference-0


Error processing https://www.verizon.com/about/our-company/marni-m-walden: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/how-talk-about-cyberbullying
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/talent-education-and-entrepreneurship-4ir
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/talent-education-and-entrepreneurship-4ir: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/talent-education-and-entrepreneurship-4ir
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/4ir-lifelong-learning-will-be-rule-not-exception


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/talent-education-and-entrepreneurship-4ir: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/4ir-lifelong-learning-will-be-rule-not-exception: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/4ir-lifelong-learning-will-be-rule-not-exception
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/fire-island-ny-voice-link


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/4ir-lifelong-learning-will-be-rule-not-exception: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/verizons-unsolicited-bulk-email-policy
INFO:__main__:Fetching: https://www.verizon.com/about/consumer-safety/lost-or-stolen-phone
ERROR:__main__:Error fetching https://www.verizon.com/about/consumer-safety/lost-or-stolen-phone: 404 Client Error: Not Found for url: https://www.verizon.com/about/consumer-safety/lost-or-stolen-phone
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/craig-silliman-flexpage


Error processing https://www.verizon.com/about/consumer-safety/lost-or-stolen-phone: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/executive-bios/craig-silliman-flexpage: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/executive-bios/craig-silliman-flexpage
INFO:__main__:Fetching: https://www.verizon.com/about/investors/citi-2014-internet-media-telecommunications-conference


Error processing https://www.verizon.com/about/our-company/executive-bios/craig-silliman-flexpage: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/online-safety-new
ERROR:__main__:Error fetching https://www.verizon.com/about/responsibility/online-safety-new: 404 Client Error: Not Found for url: https://www.verizon.com/about/responsibility/online-safety-new
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/gizmo-watch-3-adventure-helps-free-range-parenting


Error processing https://www.verizon.com/about/responsibility/online-safety-new: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/investors/melanie-l-healey-flexpage
ERROR:__main__:Error fetching https://www.verizon.com/about/investors/melanie-l-healey-flexpage: 404 Client Error: Not Found for url: https://www.verizon.com/about/investors/melanie-l-healey-flexpage
INFO:__main__:Fetching: https://www.verizon.com/about/investors/jefferies-6th-annual-global-internet-media-telecom-conference-0


Error processing https://www.verizon.com/about/investors/melanie-l-healey-flexpage: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/investors/jefferies-6th-annual-global-internet-media-telecom-conference-0: 404 Client Error: Not Found for url: https://www.verizon.com/about/investors/jefferies-6th-annual-global-internet-media-telecom-conference-0
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/kyle-malady-old


Error processing https://www.verizon.com/about/investors/jefferies-6th-annual-global-internet-media-telecom-conference-0: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/executive-bios/kyle-malady-old: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/executive-bios/kyle-malady-flexpage
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/email-policy


Error processing https://www.verizon.com/about/our-company/executive-bios/kyle-malady-old: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/parents-who-bully-tips-and-strategies
INFO:__main__:Fetching: https://www.verizon.com/about/investors/laxman-narasimhan-flexpage
ERROR:__main__:Error fetching https://www.verizon.com/about/investors/laxman-narasimhan-flexpage: 404 Client Error: Not Found for url: https://www.verizon.com/about/investors/laxman-narasimhan-flexpage
INFO:__main__:Fetching: https://www.verizon.com/about/investors/wells-fargo-tmt-summit-2020


Error processing https://www.verizon.com/about/investors/laxman-narasimhan-flexpage: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/investors/wells-fargo-technology-media-telecom-conference
INFO:__main__:Fetching: https://www.verizon.com/about/investors/analyst-meeting-including-5g-launch-news-release
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/media-forensics-combating-increasing-unreality-content
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/media-forensics-combating-increasing-unreality-content: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/media-forensics-combating-increasing-unreality-content
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/sustainability-moral-case-now-business-case


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/media-forensics-combating-increasing-unreality-content: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/sustainability-moral-case-now-business-case: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/sustainability-moral-case-now-business-case
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/how-to-reduce-screen-time


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/sustainability-moral-case-now-business-case: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/6-ways-drones-are-improving-life-saving-response-rescue-and-recovery-efforts
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/6-ways-drones-are-improving-life-saving-response-rescue-and-recovery-efforts: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/6-ways-drones-are-improving-life-saving-response-rescue-and-recovery-efforts
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/text-abbreviations-and-acronyms-every-parent-should-know-new


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/6-ways-drones-are-improving-life-saving-response-rescue-and-recovery-efforts: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/parenting/text-abbreviations-and-acronyms-every-parent-should-know-new: 404 Client Error: Not Found for url: https://www.verizon.com/about/parenting/text-abbreviations-and-acronyms-every-parent-should-know-new
INFO:__main__:Fetching: https://www.verizon.com/about/investors/quarterly-reports/4q-2017-quarter-earnings-conference-call-webcast


Error processing https://www.verizon.com/about/parenting/text-abbreviations-and-acronyms-every-parent-should-know-new: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/dr-amanda-parkes
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/executive-bios/dr-amanda-parkes: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/executive-bios/dr-amanda-parkes
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/international-policy-cz


Error processing https://www.verizon.com/about/our-company/executive-bios/dr-amanda-parkes: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/essential-tech-gifts-for-graduates
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-virtual-reality-changing-world-content-design
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-virtual-reality-changing-world-content-design: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-virtual-reality-changing-world-content-design
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/recent-changes-international-privacy-policy-pl


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-virtual-reality-changing-world-content-design: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/our-company/state-government-affairs
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/want-know-your-foods-secrets-blockchain-knows
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/want-know-your-foods-secrets-blockchain-knows: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/want-know-your-foods-secrets-blockchain-knows
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/ariel-waldman


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/want-know-your-foods-secrets-blockchain-knows: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/executive-bios/ariel-waldman: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/executive-bios/ariel-waldman
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/company-policies/verizons-efforts-combat-online-child-exploitation-faqs


Error processing https://www.verizon.com/about/our-company/executive-bios/ariel-waldman: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/tech-talk-exploring-emotional-side-ai-kids-and-teens
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/auto-pay
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/account-security
INFO:__main__:Fetching: https://www.verizon.com/about/investors/quarterly-reports/4q-2011-quarter-earnings-conference-call-webcast
INFO:__main__:Fetching: https://www.verizon.com/about/investors/second-quarter-2024-earnings
ERROR:__main__:Error fetching https://www.verizon.com/about/investors/second-quarter-2024-earnings: 404 Client Error: Not Found for url: https://www.verizon.com/about/investors/second-quarter-2024-earnings
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/frauds-scams/9-0-scam


Error processing https://www.verizon.com/about/investors/second-quarter-2024-earnings: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/investors/responsible-business-reporting
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/residential-terms-service/home-security-terms-of-service
ERROR:__main__:Error fetching https://www.verizon.com/about/terms-conditions/residential-terms-service/home-security-terms-of-service: 404 Client Error: Not Found for url: https://www.verizon.com/about/terms-conditions/residential-terms-service/home-security-terms-of-service
INFO:__main__:Fetching: https://www.verizon.com/about/investors/quarterly-reports/4q-2012-quarter-earnings-conference-call-webcast


Error processing https://www.verizon.com/about/terms-conditions/residential-terms-service/home-security-terms-of-service: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/eu-candidate-privacy-notice-it
ERROR:__main__:Error fetching https://www.verizon.com/about/international/privacy/eu-candidate-privacy-notice-it: 404 Client Error: Not Found for url: https://www.verizon.com/about/international/privacy/eu-candidate-privacy-notice-it
INFO:__main__:Fetching: https://www.verizon.com/about/investors/ubs-43rd-annual-global-media-and-communications-conference


Error processing https://www.verizon.com/about/international/privacy/eu-candidate-privacy-notice-it: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/our-company/5g/internet-things-will-thrive-5g-technology
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/5g/internet-things-will-thrive-5g-technology: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/5g/internet-things-will-thrive-5g-technology
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/digital-billing


Error processing https://www.verizon.com/about/our-company/5g/internet-things-will-thrive-5g-technology: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/voice-link
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/leslie-berland
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/new-york-petitions-and-exhibits
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/new-york-petitions-and-exhibits: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/new-york-petitions-and-exhibits
INFO:__main__:Fetching: https://www.verizon.com/about/investors/cost-basis-calculator-old


Error processing https://www.verizon.com/about/our-company/new-york-petitions-and-exhibits: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/investors/cost-basis-calculator-old: 404 Client Error: Not Found for url: https://www.verizon.com/about/investors/cost-basis-calculator-old
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/verizon-broadband-commitment


Error processing https://www.verizon.com/about/investors/cost-basis-calculator-old: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/verizon-online-terms-service-verizon-business-internet-and-value-added-services
INFO:__main__:Fetching: https://www.verizon.com/about/investors/tax-information
INFO:__main__:Fetching: https://www.verizon.com/about/investors/quarterly-reports/3q-2018-quarter-earnings-conference-call-webcast-0
ERROR:__main__:Error fetching https://www.verizon.com/about/investors/quarterly-reports/3q-2018-quarter-earnings-conference-call-webcast-0: 404 Client Error: Not Found for url: https://www.verizon.com/about/investors/quarterly-reports/3q-2018-quarter-earnings-conference-call-webcast-0
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/eu-candidate-privacy-notice-nl


Error processing https://www.verizon.com/about/investors/quarterly-reports/3q-2018-quarter-earnings-conference-call-webcast-0: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/international/privacy/eu-candidate-privacy-notice-nl: 404 Client Error: Not Found for url: https://www.verizon.com/about/international/privacy/eu-candidate-privacy-notice-nl
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/ronan-dunne


Error processing https://www.verizon.com/about/international/privacy/eu-candidate-privacy-notice-nl: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/executive-bios/ronan-dunne: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/executive-bios/ronan-dunne
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/alexa-what-is-digital-parenting


Error processing https://www.verizon.com/about/our-company/executive-bios/ronan-dunne: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/investors/gregory-g-weaver-old
ERROR:__main__:Error fetching https://www.verizon.com/about/investors/gregory-g-weaver-old: 404 Client Error: Not Found for url: https://www.verizon.com/about/investors/gregory-g-weaver-old
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/international-policy-fr


Error processing https://www.verizon.com/about/investors/gregory-g-weaver-old: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/when-kids-need-tablets
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/shared-digital-responsibility
INFO:__main__:Fetching: https://www.verizon.com/about/investors/barclays-global-communications-media-technology-conference
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/navigate-anxiety-in-kids-tech-filled-world
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/true-digital-natives-have-arrived-them-even-greater-digital-divide
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/true-digital-natives-have-arrived-them-even-greater-digital-divide: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/true-digital-natives-have-arrived-them-even-greater-digital-divide
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/kids-phone-5-steps


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/true-digital-natives-have-arrived-them-even-greater-digital-divide: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/our-company/innovation-programs
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/new-rules-gaming-sportsmanship
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/school-districts-buses-wi-fi-quarantine
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/share-locations-verizon-family-app
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/jeff-kirschner
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/executive-bios/jeff-kirschner: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/executive-bios/jeff-kirschner
INFO:__main__:Fetching: https://www.verizon.com/about/investors/investor-meeting-feb-21


Error processing https://www.verizon.com/about/our-company/executive-bios/jeff-kirschner: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/environmental-responsibility
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/auto-pay-postpaid-non-paperless-customers
INFO:__main__:Fetching: https://www.verizon.com/about/investors/vittorio-colao-old
ERROR:__main__:Error fetching https://www.verizon.com/about/investors/vittorio-colao-old: 404 Client Error: Not Found for url: https://www.verizon.com/about/investors/vittorio-colao-old
INFO:__main__:Fetching: https://www.verizon.com/about/investors/verizon-acquire-frontier-communications


Error processing https://www.verizon.com/about/investors/vittorio-colao-old: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/what-is-catfishing
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/5g/what-5g-old
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/5g/what-5g-old: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/5g/what-5g-old
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/awards-recognition


Error processing https://www.verizon.com/about/our-company/5g/what-5g-old: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/investors/quarterly-reports/1q-2023-earnings-conference-call-webcast
INFO:__main__:Fetching: https://www.verizon.com/about/investors/citi-2023-communications-media-entertainment-conference
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/5g/what-small-cell-technology
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/5g/what-small-cell-technology: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/5g/what-small-cell-technology
INFO:__main__:Fetching: https://www.verizon.com/about/investors/citi-2019-global-tmt-west-conference


Error processing https://www.verizon.com/about/our-company/5g/what-small-cell-technology: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/investors/clarence-otis-jr-lead-director-old
ERROR:__main__:Error fetching https://www.verizon.com/about/investors/clarence-otis-jr-lead-director-old: 404 Client Error: Not Found for url: https://www.verizon.com/about/investors/clarence-otis-jr-lead-director-old
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/what-comes-after-keyboard


Error processing https://www.verizon.com/about/investors/clarence-otis-jr-lead-director-old: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/what-comes-after-keyboard: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/what-comes-after-keyboard
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/computer-best-detective


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/what-comes-after-keyboard: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/investors/quarterly-reports/4q-2019-earnings-conference-call-webcast
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/how-screen-time-impacts-your-kids
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/recent-changes-business-terms-service-and-agreements
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/wearables-create-healthy-habits
INFO:__main__:Fetching: https://www.verizon.com/about/privacy/international-privacy-centre
INFO:__main__:Fetching: https://www.verizon.com/about/investors/deutsche-bank-2015-media-internet-telecom-conference
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/design-thinking
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/usps-request-prepaid-mailing-label-old
ERROR:__main__:Error fetching https://www.verizon.com/about/responsibility/usps-request-prepaid-mailing-label-old: 404 Client Error: Not Found for url: https:

Error processing https://www.verizon.com/about/responsibility/usps-request-prepaid-mailing-label-old: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/test-article-parenting
ERROR:__main__:Error fetching https://www.verizon.com/about/parenting/test-article-parenting: 404 Client Error: Not Found for url: https://www.verizon.com/about/parenting/test-article-parenting
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/rima-qureshi-flexpage


Error processing https://www.verizon.com/about/parenting/test-article-parenting: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/executive-bios/rima-qureshi-flexpage: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/executive-bios/rima-qureshi-flexpage
INFO:__main__:Fetching: https://www.verizon.com/about/investors/wells-fargo-1st-annual-tech-transformation-summit


Error processing https://www.verizon.com/about/our-company/executive-bios/rima-qureshi-flexpage: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/our-company/company-policies/affordable-care-act-transparency-in-coverage
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/what-emojis-mean
INFO:__main__:Fetching: https://www.verizon.com/about/consumer-safety/wireless-phone-security
ERROR:__main__:Error fetching https://www.verizon.com/about/consumer-safety/wireless-phone-security: 404 Client Error: Not Found for url: https://www.verizon.com/about/consumer-safety/wireless-phone-security
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/parenting-in-a-digital-world


Error processing https://www.verizon.com/about/consumer-safety/wireless-phone-security: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/investors/guggenheim-securities-2012-tmt-symposium
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/company-policies/commitment-to-pay-equity
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/guru-gowrappan
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/executive-bios/guru-gowrappan: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/executive-bios/guru-gowrappan
INFO:__main__:Fetching: https://www.verizon.com/about/investors/shellye-l-archambeau


Error processing https://www.verizon.com/about/our-company/executive-bios/guru-gowrappan: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/terminos-de-la-oferta
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/screen-time-tips
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/craig-silliman
INFO:__main__:Fetching: https://www.verizon.com/about/investors/quarterly-reports/1q-2016-quarter-earnings-conference-call-webcast
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/verizon-innovative-learning/because-we-can-story
INFO:__main__:Fetching: https://www.verizon.com/about/investors/oppenheimer-26th-annual-technology-internet-communications-conference
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/5G/how-5g-will-level-gaming-and-esports
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/5G/how-5g-will-level-gaming-and-esports: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/5G/how-5g-will-level-gaming-and-esports
INFO:__ma

Error processing https://www.verizon.com/about/our-company/5G/how-5g-will-level-gaming-and-esports: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/investors/quarterly-reports/3q-2020-earnings-conference-call-webcast
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/help-kids-focus
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-smart-your-world
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-smart-your-world: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-smart-your-world
INFO:__main__:Fetching: https://www.verizon.com/about/investors/quarterly-reports/2q-2015-quarter-earnings-conference-call-webcast


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-smart-your-world: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/how-pandemic-has-changed-my-parenting
INFO:__main__:Fetching: https://www.verizon.com/about/investors/agreement-acquire-vodafones-45-interest-verizon-wireless
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/online-scams-in-games
INFO:__main__:Fetching: https://www.verizon.com/about/investors/goldman-sachs-communacopia-xx-conference
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/company-policies/universal-design-principles
INFO:__main__:Fetching: https://www.verizon.com/about/investors/quarterly-reports/1q-2009-quarter-earnings-conference-call-webcast
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/company-policies/verizon-disability-accommodations
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/robert-mudge
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/executive-bios/robert-mudge: 404 Client Error: Not Found for 

Error processing https://www.verizon.com/about/our-company/executive-bios/robert-mudge: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/domestic-violence-prevention
INFO:__main__:Fetching: https://www.verizon.com/about/investors/quarterly-reports/2q-2024-earnings-conference-call-webcast
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/rose-stuckey-kirk
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/become-verizon-innovative-learning-school
ERROR:__main__:Error fetching https://www.verizon.com/about/responsibility/become-verizon-innovative-learning-school: 404 Client Error: Not Found for url: https://www.verizon.com/about/responsibility/become-verizon-innovative-learning-school
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/ethnic-group-descriptions


Error processing https://www.verizon.com/about/responsibility/become-verizon-innovative-learning-school: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/kyle-malady
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/parents-guide-back-school-tech-kids
INFO:__main__:Fetching: https://www.verizon.com/about/investors/quarterly-reports/1q-2011-quarter-earnings-conference-call-webcast
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/international-policy-it
ERROR:__main__:Error fetching https://www.verizon.com/about/international/privacy/international-policy-it: 404 Client Error: Not Found for url: https://www.verizon.com/about/international/privacy/international-policy-it
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/emissions-profile


Error processing https://www.verizon.com/about/international/privacy/international-policy-it: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/responsibility/emissions-profile: 404 Client Error: Not Found for url: https://www.verizon.com/about/responsibility/emissions-profile
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/paper-free-billing


Error processing https://www.verizon.com/about/responsibility/emissions-profile: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/can-computer-be-prejudiced
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/can-computer-be-prejudiced: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/can-computer-be-prejudiced
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/verizon-ventures/portfolio-companies


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/can-computer-be-prejudiced: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/it-parenting-or-sharenting
INFO:__main__:Fetching: https://www.verizon.com/about/investors/verizon-reaches-agreement-acquire-vodafones-45-percent-interest-verizon-wireless-130
ERROR:__main__:Error fetching https://www.verizon.com/about/investors/verizon-reaches-agreement-acquire-vodafones-45-percent-interest-verizon-wireless-130: 404 Client Error: Not Found for url: https://www.verizon.com/about/investors/verizon-reaches-agreement-acquire-vodafones-45-percent-interest-verizon-wireless-130
INFO:__main__:Fetching: https://www.verizon.com/about/investors/jefferies-2015-tmt-conference


Error processing https://www.verizon.com/about/investors/verizon-reaches-agreement-acquire-vodafones-45-percent-interest-verizon-wireless-130: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/investors/verizon-improves-method-accounting-pensions-and-other-post-employment-benefits
ERROR:__main__:Error fetching https://www.verizon.com/about/investors/verizon-improves-method-accounting-pensions-and-other-post-employment-benefits: 404 Client Error: Not Found for url: https://www.verizon.com/about/investors/verizon-improves-method-accounting-pensions-and-other-post-employment-benefits
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/ronan-dunne-old


Error processing https://www.verizon.com/about/investors/verizon-improves-method-accounting-pensions-and-other-post-employment-benefits: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/executive-bios/ronan-dunne-old: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/executive-bios/ronan-dunne-old
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/business-terms-of-service


Error processing https://www.verizon.com/about/our-company/executive-bios/ronan-dunne-old: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/recent-changes-international-privacy-policy-pt
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-holograms-work
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-holograms-work: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-holograms-work
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/history-and-timeline/1999-1900


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-holograms-work: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/verizon-online
INFO:__main__:Fetching: https://www.verizon.com/about/investors/quarterly-reports/4q-2016-quarter-earnings-conference-call-webcast
INFO:__main__:Fetching: https://www.verizon.com/about/investors/barclays-communications-and-content-symposium-2025
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/what-we-do/municipal
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/what-we-do/municipal: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/what-we-do/municipal
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/history-and-timeline


Error processing https://www.verizon.com/about/our-company/what-we-do/municipal: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/verizon-terms-service-purchasers-business-dsl-service-jan-9-2006
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/marc-c-reed
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/executive-bios/marc-c-reed: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/executive-bios/marc-c-reed
INFO:__main__:Fetching: https://www.verizon.com/about/consumer-safety/politica-de-desbloqueo-de-dispositivos-pospagados


Error processing https://www.verizon.com/about/our-company/executive-bios/marc-c-reed: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/terms-of-use
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/what-we-do-old
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/what-we-do-old: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/what-we-do-old
INFO:__main__:Fetching: https://www.verizon.com/about/investors/oppenheimer-co-annual-technology-media-telecommunications-conference


Error processing https://www.verizon.com/about/our-company/what-we-do-old: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/how-internet-safety-expert-really-talks-her-tweens-about-internet-safety
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/professional-sports-stadiums-are-breeding-ground-fourth-industrial-revolution
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/professional-sports-stadiums-are-breeding-ground-fourth-industrial-revolution: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/professional-sports-stadiums-are-breeding-ground-fourth-industrial-revolution
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/account-security/file-a-fraud-claim


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/professional-sports-stadiums-are-breeding-ground-fourth-industrial-revolution: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/online-safety-safer-internet-day
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/streaming-holidays-family-playlists
INFO:__main__:Fetching: https://www.verizon.com/about/investors/bank-america-merrill-lynch-2010-media-communications-entertainment-conference
INFO:__main__:Fetching: https://www.verizon.com/about/investors/jefferies-2013-global-technology-media-and-telecom-conference
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/signs-of-sadfishing-parents-guide
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/age-appropriate-guide-to-using-smart-family-app
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/powering-fourth-industrial-revolution-5g
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/powering-fourth-industrial-revolution-5g: 404 Client Error: Not Found for url: https://w

Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/powering-fourth-industrial-revolution-5g: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/gizmo-watch-3-caring-for-people-with-Autism
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/gaming-life-skills-kids
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/how-take-more-intuitive-approach-setting-screen-time-limits
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/dc-notices
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/verizon-innovative-learning/transforming-texas-school-district-technology
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/4-tips-virtual-family-party
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-you-make-digital-human
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-you-make-digital-human: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-you-m

Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-you-make-digital-human: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/esrb-ratings-game-appropriate-for-my-kid
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/recent-changes-international-privacy-policy-fr
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/would-you-pay-400000-painting-made-computer
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/would-you-pay-400000-painting-made-computer: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/would-you-pay-400000-painting-made-computer
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/5g/what-millimeter-wave-technology


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/would-you-pay-400000-painting-made-computer: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/5g/what-millimeter-wave-technology: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/5g/what-millimeter-wave-technology
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/frauds-scams/slamming-and-cramming


Error processing https://www.verizon.com/about/our-company/5g/what-millimeter-wave-technology: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/first-phone-kid-myPlan
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/diego-scotti-flexpage
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/executive-bios/diego-scotti-flexpage: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/executive-bios/diego-scotti-flexpage
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/we-need-stop-talking-about-years-our-kids-lost-school


Error processing https://www.verizon.com/about/our-company/executive-bios/diego-scotti-flexpage: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/our-company/innovation-labs
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/company-policies/human-rights-statement
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/digital-activism
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/who-we-are
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/may-change-medicine-forever
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/craig-silliman-old
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/executive-bios/craig-silliman-old: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/executive-bios/craig-silliman-flexpage
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/hans-vestberg


Error processing https://www.verizon.com/about/our-company/executive-bios/craig-silliman-old: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/create-screen-time-agreement-your-family
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/what-parents-need-to-know-about-esports
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/talk-to-your-kids-digital-footprint
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/guru-gowrappan-old
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/executive-bios/guru-gowrappan-old: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/executive-bios/guru-gowrappan
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/how-to-be-a-guardian-for-your-kids-online-identity


Error processing https://www.verizon.com/about/our-company/executive-bios/guru-gowrappan-old: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/recent-changes-international-privacy-policy-nl
ERROR:__main__:Error fetching https://www.verizon.com/about/international/privacy/recent-changes-international-privacy-policy-nl: 404 Client Error: Not Found for url: https://www.verizon.com/about/international/privacy/recent-changes-international-privacy-policy-nl
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/digital-inclusion/disability-innovation


Error processing https://www.verizon.com/about/international/privacy/recent-changes-international-privacy-policy-nl: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/our-company/executive-bios/hans-vestberg-flexpage
ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/executive-bios/hans-vestberg-flexpage: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/executive-bios/hans-vestberg-flexpage
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-augmented-reality-changing-way-we-shop


Error processing https://www.verizon.com/about/our-company/executive-bios/hans-vestberg-flexpage: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-augmented-reality-changing-way-we-shop: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-augmented-reality-changing-way-we-shop
INFO:__main__:Fetching: https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-ai-understands-emotion


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-augmented-reality-changing-way-we-shop: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-ai-understands-emotion: 404 Client Error: Not Found for url: https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-ai-understands-emotion
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/teen-driver-safety-5-ways-curb-distracted-driving


Error processing https://www.verizon.com/about/our-company/fourth-industrial-revolution/how-ai-understands-emotion: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/about/our-company/5g/future-education-technology
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/monitor-childs-texting-activity-verizon-family
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/safe-walk-trick-or-treating
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/frauds-scams/third-party-billing
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/family-road-trip-traveling-tech-teens
INFO:__main__:Fetching: https://www.verizon.com/about/consumer-safety/device-unlocking-policy
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/family-gaming-guide
INFO:__main__:Fetching: https://www.verizon.com/about/privacy/recent-changes-international-privacy-policy
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/5-ways-use-tech-pet-care-routine
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/terminos-de-servicio-de-verizon-fios-t

Error processing https://www.verizon.com/about/privacy/cookies-related-technologies-new: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/extended-service-plan-verizon-online-high-speed-internet-subscribers
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/tech-gifts-2024-holiday-guide
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/verizon-innovative-learning/web-dubois-academy-its-all-about-pride
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/eu-candidate-privacy-notice-de
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/international-policy-cn
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/fios-digital-voice-inside-wire-maintenance-plan
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/take-your-kids-virtual-field-trip-right-now
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/disaster-resilience/community
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/social-media-classroom-parents-ne

Error processing https://www.verizon.com/about/responsibility/human-prosperity/verizon-skill-forward: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/screen-time-vs-screen-use-your-child-needs-you-know-difference
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/how-smart-family-manage-screen-time
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/spring-clean-familys-phones
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/video-streaming-what-kids-are-watching
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/screen-time-for-kids-restore-balance
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/avis-de-confidentialite-de-verizon
ERROR:__main__:Error fetching https://www.verizon.com/about/international/privacy/avis-de-confidentialite-de-verizon: 404 Client Error: Not Found for url: https://www.verizon.com/about/international/privacy/avis-de-confidentialite-de-verizon
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/sbdc-conference


Error processing https://www.verizon.com/about/international/privacy/avis-de-confidentialite-de-verizon: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/social-games-parents-support-next-gen-gamers
INFO:__main__:Fetching: https://www.verizon.com/about/privacy/international-policy
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/verizon-innovation-learning-labs
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/smart-summer-travel-best-devices
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/verizon-innovative-learning-locations
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/my-rewards
ERROR:__main__:Error fetching https://www.verizon.com/about/terms-conditions/my-rewards: 404 Client Error: Not Found for url: https://www.verizon.com/about/terms-conditions/my-rewards
INFO:__main__:Fetching: https://www.verizon.com/about/consumer-safety/wireless-emergency-alerts


Error processing https://www.verizon.com/about/terms-conditions/my-rewards: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/consumer-safety/wireless-emergency-alerts: 404 Client Error: Not Found for url: https://www.verizon.com/about/consumer-safety/wireless-emergency-alerts
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/verizon-web-site-use-agreement


Error processing https://www.verizon.com/about/consumer-safety/wireless-emergency-alerts: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/parenting/family-road-trip-tech-essentials
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/verizon-family-parental-control-app
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/verizon-innovative-learning/preparing-for-a-digital-future
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/multiplayer-vr-games-parents-safety-guide
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/latin-culture-family-conversations
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/safe-vr-room-for-kids
INFO:__main__:Fetching: https://www.verizon.com/about/parenting/houseparty-online-family-game-night
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/grants-and-partnerships
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/employee-volunteers
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/terminos-de-la-oferta-worry-fre

Error processing https://www.verizon.com/about/about/our-company/what-we-do/building-the-future/smart-cities: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/frauds-scams/phishing-and-smishing
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/frauds-scams/72-prison-scam
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/frauds-and-scams-old
ERROR:__main__:Error fetching https://www.verizon.com/about/responsibility/frauds-and-scams-old: 404 Client Error: Not Found for url: https://www.verizon.com/about/responsibility/frauds-and-scams-old
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/derecho-cancelar


Error processing https://www.verizon.com/about/responsibility/frauds-and-scams-old: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/terminos-y-condiciones-del-plan-de-mantenimiento-del-cableado-interno-de-verizon
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/frauds-scams/social-engineering
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/digital-inclusion/small-business-training-old
ERROR:__main__:Error fetching https://www.verizon.com/about/responsibility/digital-inclusion/small-business-training-old: 404 Client Error: Not Found for url: https://www.verizon.com/about/responsibility/digital-inclusion/small-business-training-old
INFO:__main__:Fetching: https://www.verizon.com/about/privacy/verizon-enterprise-privacy-notice


Error processing https://www.verizon.com/about/responsibility/digital-inclusion/small-business-training-old: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/business-internet-auto-pay
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/digital-inclusion
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/international-policy-be
ERROR:__main__:Error fetching https://www.verizon.com/about/international/privacy/international-policy-be: 404 Client Error: Not Found for url: https://www.verizon.com/about/international/privacy/international-policy-be
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/pride-events


Error processing https://www.verizon.com/about/international/privacy/international-policy-be: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/responsibility/pride-events: 404 Client Error: Not Found for url: https://www.verizon.com/about/responsibility/pride-events
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/standalone-service-worry-free-guarantee-spanish


Error processing https://www.verizon.com/about/responsibility/pride-events: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/recent-changes-international-privacy-policy-pt-br
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/intuit
INFO:__main__:Fetching: https://www.verizon.com/about/consumer-safety/important-consumer-information
INFO:__main__:Fetching: https://www.verizon.com/about/consumer-safety/radio-frequency-emissions
ERROR:__main__:Error fetching https://www.verizon.com/about/consumer-safety/radio-frequency-emissions: 404 Client Error: Not Found for url: https://www.verizon.com/about/consumer-safety/radio-frequency-emissions
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/hopeline-faqs


Error processing https://www.verizon.com/about/consumer-safety/radio-frequency-emissions: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/consumer-safety/overview
INFO:__main__:Fetching: https://www.verizon.com/about/privacy/contractor-privacy-notice
INFO:__main__:Fetching: https://www.verizon.com/about/consumer-safety/avoiding-potential-hearing-loss
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/terminos-de-la-oferta-standalone
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/terms-and-conditions-verizon-inside-wire-maintenance
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/verizon-wi-fi-terms-of-service
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/recent-changes-international-privacy-policy-kr
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/web-com-master-services-agreement
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/digital-inclusion/digital-wellness
INFO:__main__:Fetching: https://www.verizon.com/about/international/p

Error processing https://www.verizon.com/about/international/privacy/recent-changes-international-privacy-policy-it: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/schools
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/minority-males
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/get-involved
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/recent-changes-international-privacy-policy-nl-be
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/terminos-de-servicio-de-verizon-online
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/frauds-scams/credit-card-number-theft
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/auto-pay-paper-free-billing
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/terms-service-fios-tv-business
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/network-disclosures-2017-2021
INFO:__main__:Fetching: https://www.verizon.com/about/privacy/involvesoft-end-user-notice
INFO:__main__:Fetching

Error processing https://www.verizon.com/about/consumer-safety/enhanced-911: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/terms-conditions/verizon-copyright-alert-program: 404 Client Error: Not Found for url: https://www.verizon.com/about/terms-conditions/verizon-copyright-alert-program
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/subscribe-updates


Error processing https://www.verizon.com/about/terms-conditions/verizon-copyright-alert-program: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/houston-economic-growth-collaborative
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/cybersecurity
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/verizon-customer-agreement
INFO:__main__:Fetching: https://www.verizon.com/about/privacy/eu-candidate-privacy-notice
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/network-disclosures/2022-2024
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/residential-terms-service
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/frauds-scams/jury-duty-telephone-scam
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/verizon-datenschutzhinweis
ERROR:__main__:Error fetching https://www.verizon.com/about/international/privacy/verizon-datenschutzhinweis: 404 Client Error: Not Found for url: https://www.verizon.com/about/international/privacy/verizon-datensc

Error processing https://www.verizon.com/about/international/privacy/verizon-datenschutzhinweis: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/international-policy-br
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/privacyverklaring-van-verizon
ERROR:__main__:Error fetching https://www.verizon.com/about/international/privacy/privacyverklaring-van-verizon: 404 Client Error: Not Found for url: https://www.verizon.com/about/international/privacy/privacyverklaring-van-verizon
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/terminos-de-servicio-de-facturacion-de-forma-digital


Error processing https://www.verizon.com/about/international/privacy/privacyverklaring-van-verizon: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/network-disclosures
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/verizon-fios-prepaid
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/current-sub-processors-and-affiliates
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/business-digital-voice-reminder
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/product-responsibility
INFO:__main__:Fetching: https://www.verizon.com/about/consumer-safety/text-911-service-new
ERROR:__main__:Error fetching https://www.verizon.com/about/consumer-safety/text-911-service-new: 404 Client Error: Not Found for url: https://www.verizon.com/about/consumer-safety/text-911-service-new
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/human-prosperity


Error processing https://www.verizon.com/about/consumer-safety/text-911-service-new: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/eu-candidate-privacy-notice-fr
ERROR:__main__:Error fetching https://www.verizon.com/about/international/privacy/eu-candidate-privacy-notice-fr: 404 Client Error: Not Found for url: https://www.verizon.com/about/international/privacy/eu-candidate-privacy-notice-fr
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/the-impact-of-Verizons-Innovative-Learning-lab


Error processing https://www.verizon.com/about/international/privacy/eu-candidate-privacy-notice-fr: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/verizon-service-protection-plan
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/email-alias-policy
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/verizon-innovative-learning/middle-school-girls-find-community-verizon-innovative-learning
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/human-prosperity/health-equity
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/network-disclosures/archives
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/frauds-scams/809-scam
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/recent-changes-international-privacy-policy-ru
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/verizon-business-digital-voice-terms-service
INFO:__main__:Fetching: https://www.verizon.com/about/privacy/verizon-end-user-privacy-notice
INFO:__main__:Fetching:

Error processing https://www.verizon.com/about/responsibility/digital-inclusion/small-business-archived: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/recent-changes-international-privacy-policy-cz
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/recent-changes-international-privacy-policy-de
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/recent-changes-international-privacy-policy-cn
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/international-policy-kr
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/terminos-de-servicio-de-facturacion-electronica-requerida
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/digital-inclusion/verizon-innovative-learning
INFO:__main__:Fetching: https://www.verizon.com/about/consumer-safety/implantable-medical-devices
ERROR:__main__:Error fetching https://www.verizon.com/about/consumer-safety/implantable-medical-devices: 404 Client Error: Not Found for url: https://www.verizon.com/about/consumer-safety/i

Error processing https://www.verizon.com/about/consumer-safety/implantable-medical-devices: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/app-challenge
ERROR:__main__:Error fetching https://www.verizon.com/about/responsibility/app-challenge: 404 Client Error: Not Found for url: https://www.verizon.com/about/responsibility/app-challenge
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/auto-pay-prepaid-terms-and-conditions


Error processing https://www.verizon.com/about/responsibility/app-challenge: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/comments-policy
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/international-policy-jp
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/climate-protection/climate-justice
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/terms-service-verizon-fios-internet-service-business-january-9-2006
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/standalone-service-worry-free-guarantee
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/human-prosperity/reskilling-program
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/verizon-fios-tv
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/frauds-scams/worms-viruses-and-other-malicious-programs
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/online-safety
ERROR:__main__:Error fetching https://www.verizon.com/about

Error processing https://www.verizon.com/about/responsibility/online-safety: 'NoneType' object has no attribute 'status_code'


ERROR:__main__:Error fetching https://www.verizon.com/about/international/privacy/international-policy-nl: 404 Client Error: Not Found for url: https://www.verizon.com/about/international/privacy/international-policy-nl
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/digital-inclusion/small-business-training


Error processing https://www.verizon.com/about/international/privacy/international-policy-nl: 'NoneType' object has no attribute 'status_code'


INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/international-policy-de
INFO:__main__:Fetching: https://www.verizon.com/about/about/our-company/what-we-do/protecting-infrastructure
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/terminos-de-servicio-de-verizon-fios-prepaid
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/online-bill-presentment-and-use-small-business-online-portal
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/generally-available-terms-and-conditions-adsl
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/verizon-innovative-learning/stem-isnt-just-a-boy-thing
INFO:__main__:Fetching: https://www.verizon.com/about/international/privacy/recent-changes-international-privacy-policy-tr
INFO:__main__:Fetching: https://www.verizon.com/about/responsibility/disaster-resilience
INFO:__main__:Fetching: https://www.verizon.com/about/terms-conditions/bi-professional-man

Error processing https://www.verizon.com/about/international/privacy/data-subject-rights: 'NoneType' object has no attribute 'status_code'
Extracted content for 523 URLs using requests + trafilatura.
### Equipped to take on the future

### Working toward a

greener world## Our principles


“Our responsible business principles inform every decision we make and are part of our collective pledge as V Teamers to show up with integrity every day, in the service of all our stakeholders.”

**Hans Vestberg
Verizon Chief Executive Officer**

## Watch the highlights

View highlights of our signature community programs.

## Let's do good together- and double your impact

Donate to eligible charities when you round up your bill to the next dollar and we'll match your donation.


In [None]:
# trafilatura_results

[{'url': 'https://www.verizon.com/about/about/our-company/enterprise-security',
  'content': 'With digital footprints expanding, the attack surface for cyber breaches grows making organization more vulnerable. The way of doing business is changing rapidly. Disruptive business models, and accelerated adoption of new technologies like mobility and cloud bring security to the forefront of the conversation, it’s no longer considered an afterthought but an enabler of digital transformation.\n\nVerizon helps companies strengthen cyber resiliency across the enterprise in the following ways:\n\nKeep networks secure\n\nPrepare for and respond to incidents quickly\n\nHandle the complexity of modern security\n\nProvide expert support at any stage in the business life cycle\n\nAnd we’re the only provider recognized by industry analyst firm Gartner as a leader in both networking and security in its 2017 Gartner Magic Quadrant reports.\n\nWhether you are raising a concern or have only a question, we

## The Web Scrapping Pipeline

In [2]:
import json
import time
import logging
import os
from urllib.parse import urlparse
import requests
import xml.etree.ElementTree as ET
from tenacity import retry, stop_after_attempt, wait_exponential
import trafilatura
import spacy
from langchain_community.llms import Ollama
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document
from langchain_neo4j import Neo4jGraph
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from tqdm import tqdm

# --- Configuration ---
SITEMAP_INDEX_URL = "https://www.verizon.com/about/sitemap.xml"
BASE_DOMAIN = "www.verizon.com"
ABOUT_PATH_PREFIX = "/about/"
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 (DissertationResearchBot/0.1)'
}
EDGE_DRIVER_PATH = "E:/Programms(installed)/EdgeWebDriver/msedgedriver.exe"
SELENIUM_TIMEOUT = 20 # Seconds to wait for elements

REQUEST_DELAY = 2  # Seconds between requests
SECTIONS_TO_KEEP = ['investors', 'our-company', 'terms-conditions', 'responsibility', 'privacy', 'about', 'parenting', 'consumer-safety', 'international']
MAX_PAGES_PER_SECTION = 150
PDF_DOWNLOAD_DIR = "downloaded_verizon_about_pdfs"
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "neo4j_password"  # Change this!
TRAFILATURA_MARKDOWN = True



# --- Setup Logging ---
logging.basicConfig(level=logging.INFO, filename='kg_preparation.log', filemode='w')
logger = logging.getLogger(__name__)

# --- Initialize spaCy ---
nlp = spacy.load("en_core_web_sm")

# --- Initialize Neo4j ---
graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USER, password=NEO4J_PASSWORD)

# --- Initialize Selenium for dynamic pages ---
def init_selenium_driver():
    service = Service(executable_path=EDGE_DRIVER_PATH)
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    
    # options.add_argument(f'user-agent={HEADERS["User-Agent"]}')
    driver = webdriver.Edge(service=service, options=options)
    return  driver # webdriver.Chrome(options=options)



# --- Fetch Content with Retry and Selenium Fallback ---
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def fetch_url_content_requests(url, stream=False):
    """Fetches content from a URL with retry and error handling."""
    logger.info(f"Fetching: {url}")
    try:
        response = requests.get(url, headers=HEADERS, timeout=15, stream=stream)
        response.raise_for_status()
        return response
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url} with requests: {e}")
        # Try Selenium for dynamic content
        try:
            driver = init_selenium_driver()
            driver.get(url)
            time.sleep(2)  # Wait for page to load
            content = driver.page_source
            driver.quit()
            return type('obj', (object,), {'content': content, 'status_code': 200})
        except Exception as se:
            logger.error(f"Selenium failed for {url}: {se}")
            return None
    finally:
        if not stream:
            time.sleep(REQUEST_DELAY)

# --- Parse Sitemap to Get URLs ---
def get_all_page_urls_from_sitemap_index(sitemap_index_url):
    """Parses a sitemap index file and nested sitemaps to return page URLs."""
    all_page_urls = set()
    logger.info(f"Fetching sitemap index: {sitemap_index_url}")
    index_response = fetch_url_content_requests(sitemap_index_url)

    if not (index_response and index_response.content):
        logger.error(f"Failed to fetch or empty content for sitemap index: {sitemap_index_url}")
        return list(all_page_urls)

    try:
        index_root = ET.fromstring(index_response.content)
        namespace = {'sitemap': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

        if index_root.tag == f"{{{namespace['sitemap']}}}sitemapindex":
            logger.info(f"Parsing sitemap index: {sitemap_index_url}")
            nested_sitemap_locs = [
                s_loc.text for sitemap_tag in index_root.findall('sitemap:sitemap', namespace)
                if (s_loc := sitemap_tag.find('sitemap:loc', namespace)) is not None and s_loc.text
            ]
            for nested_sitemap_url in nested_sitemap_locs:
                logger.info(f"  Fetching nested sitemap: {nested_sitemap_url}")
                sitemap_response = fetch_url_content_requests(nested_sitemap_url)
                if sitemap_response and sitemap_response.content:
                    try:
                        sitemap_root = ET.fromstring(sitemap_response.content)
                        if sitemap_root.tag == f"{{{namespace['sitemap']}}}urlset":
                            for url_element in sitemap_root.findall('sitemap:url', namespace):
                                page_loc_element = url_element.find('sitemap:loc', namespace)
                                if page_loc_element is not None and page_loc_element.text:
                                    all_page_urls.add(page_loc_element.text)
                    except ET.ParseError as e:
                        logger.error(f"    Error parsing nested sitemap XML {nested_sitemap_url}: {e}")
        elif index_root.tag == f"{{{namespace['sitemap']}}}urlset":
            logger.info(f"Parsing as a direct page sitemap: {sitemap_index_url}")
            for url_element in index_root.findall('sitemap:url', namespace):
                page_loc_element = url_element.find('sitemap:loc', namespace)
                if page_loc_element is not None and page_loc_element.text:
                    all_page_urls.add(page_loc_element.text)
        else:
            logger.error(f"Unknown root tag in sitemap file {sitemap_index_url}: {index_root.tag}")
    except ET.ParseError as e:
        logger.error(f"Error parsing sitemap index XML {sitemap_index_url}: {e}")
    return list(all_page_urls)

# --- Filter URLs by Section ---
def filter_urls_by_section(target_page_urls, sections, max_per_section):
    """Filters URLs by section and limits the number per section."""
    section_counts = {section: 0 for section in sections}
    filtered_urls = []

    for url in target_page_urls:
        parsed = urlparse(url)
        path = parsed.path
        if path.startswith(ABOUT_PATH_PREFIX):
            rest = path[len(ABOUT_PATH_PREFIX):]
            section = rest.split('/', 1)[0] if '/' in rest else rest
            if section in sections and section_counts[section] < max_per_section:
                filtered_urls.append(url)
                section_counts[section] += 1

    for section, count in section_counts.items():
        logger.info(f"{section}: {count} URLs")
    return filtered_urls

# --- Extract Text with Trafilatura ---
def extract_text_with_trafilatura(urls):
    """Extracts text from URLs using Trafilatura."""
    results = []
    for url in tqdm(urls, desc="Extracting text"):
        try:
            response = fetch_url_content_requests(url)
            if response and response.status_code == 200:
                extracted = trafilatura.extract(
                    response.content,
                    include_comments=False,
                    include_tables=True,
                    include_images=False,
                    include_links=False,
                    output_format="markdown" if TRAFILATURA_MARKDOWN else "txt"
                )
                results.append({'url': url, 'content': extracted})
            else:
                results.append({'url': url, 'content': None})
        except Exception as e:
            logger.error(f"Error processing {url}: {e}")
            results.append({'url': url, 'content': None})
    return results

# --- Preprocess Text with spaCy ---
def preprocess_text(text):
    """Cleans and preprocesses text using spaCy."""
    if not text:
        return ""
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

# --- Extract Entities ---
def extract_entities(text):
    """Extracts entities using spaCy NER."""
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

# --- Main Pipeline ---
def main():
    # Step 1: Scrape URLs
    target_page_urls = get_all_page_urls_from_sitemap_index(SITEMAP_INDEX_URL)
    if not target_page_urls:
        logger.error("No URLs extracted. Exiting.")
        return

    # Step 2: Filter URLs by section
    filtered_urls = filter_urls_by_section(target_page_urls, SECTIONS_TO_KEEP, MAX_PAGES_PER_SECTION)
    logger.info(f"Filtered {len(filtered_urls)} URLs for processing.")

    # Step 3: Extract text
    trafilatura_results = extract_text_with_trafilatura(filtered_urls)
    documents = []
    for result in trafilatura_results:
        if result.get('content'):
            cleaned_content = preprocess_text(result['content'])
            entities = extract_entities(cleaned_content)
            result['cleaned_content'] = cleaned_content
            result['entities'] = entities
            documents.append(Document(page_content=cleaned_content, metadata={'source': result['url']}))
    logger.info(f"Prepared {len(documents)} documents for processing.")

    # Step 4: Extract graph data
    llm = Ollama(model="llama3.1", temperature=0)
    transformer = LLMGraphTransformer(llm=llm)
    logger.info(f"Extracting graph from {len(documents)} documents...")
    graph_documents = transformer.convert_to_graph_documents(documents)
    graph.add_graph_documents(graph_documents, baseEntityLabel=True, include_source=True)
    logger.info("Graph construction complete.")


ModuleNotFoundError: No module named 'openai'

In [None]:

if __name__ == "__main__":
    if not os.path.exists(PDF_DOWNLOAD_DIR):
        os.makedirs(PDF_DOWNLOAD_DIR)
    main()

In [2]:
import json
import time
import logging
import os
from urllib.parse import urlparse
import requests
import xml.etree.ElementTree as ET
from tenacity import retry, stop_after_attempt, wait_exponential
import trafilatura
import spacy
from langchain_community.llms import Ollama
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document
from langchain_neo4j import Neo4jGraph
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from urllib.robotparser import RobotFileParser
from tqdm import tqdm
import pdfplumber

# --- Configuration ---
SITEMAP_URLS = [
    "https://www.verizon.com/sitemap-index.xml",
    "https://sitemap.verizon.com/sitemap_index.xml",
    "https://www.verizon.com/content/wcms.sitemap-index.xml",
    "https://www.verizon.com/content/support.sitemap-index.xml",
    "https://community.verizon.com/sitemap.xml",
    "https://espanol.verizon.com/sitemap.xml",
    "https://mycareer.verizon.com/sitemap.xml",
    "https://www.verizon.com/business/learn-sitemap.xml",
    "https://www.verizon.com/business/answers/sitemap.xml",
    "https://www.verizon.com/business/answers-sitemap.xml",
    "https://www.verizon.com/business/locations/sitemap.xml",
    "https://www.verizon.com/business/resources/sitemap.xml",
    "https://www.verizon.com/business/shop/sitemap-prospect/smartphones.xml",
    "https://www.verizon.com/business/shop/sitemap-prospect/connected-devices.xml",
    "https://www.verizon.com/business/shop/sitemap-prospect/connected-smartwatches.xml",
    "https://www.verizon.com/business/shop/sitemap-prospect/tablets.xml",
    "https://www.verizon.com/business/shop/gridwallmap-prospect/smartphones.xml",
    "https://www.verizon.com/business/shop/sitemap-prospect/connected-laptops.xml",
    "https://www.verizon.com/business/shop/sitemap-prospect/broadbandaccess-devices.xml",
    "https://www.verizon.com/business/shop/sitemap-prospect.xml",
    "https://www.verizon.com/business/shop/sitemap-prospect/phones.xml",
    "https://www.verizon.com/sitemapls.xml"
]
BASE_DOMAIN = "www.verizon.com"
ABOUT_PATH_PREFIX = "/about/"
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 (DissertationResearchBot/0.1)'
}
REQUEST_DELAY = 2
SECTIONS_TO_KEEP = ['investors', 'our-company', 'terms-conditions', 'responsibility', 'privacy', 'about', 'parenting', 'consumer-safety', 'international']
MAX_PAGES_PER_SECTION = 150
PDF_DOWNLOAD_DIR = "downloaded_verizon_about_pdfs"
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "neo4j_password"  # Change this!
TRAFILATURA_MARKDOWN = True

# --- Configuration ---
SITEMAP_INDEX_URL = "https://www.verizon.com/about/sitemap.xml"

EDGE_DRIVER_PATH = "E:/Programms(installed)/EdgeWebDriver/msedgedriver.exe"
SELENIUM_TIMEOUT = 20 # Seconds to wait for elements

REQUEST_DELAY = 2  # Seconds between requests

# --- Setup Logging ---
logging.basicConfig(level=logging.INFO, filename='kg_preparation.log', filemode='w')
logger = logging.getLogger(__name__)

# --- Initialize spaCy ---
nlp = spacy.load("en_core_web_sm")

# --- Initialize Neo4j ---
graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USER, password=NEO4J_PASSWORD)

# --- Initialize Robots.txt Parser ---
robot_parser = RobotFileParser()
robot_parser.set_url("https://www.verizon.com/robots.txt")
robot_parser.read()

# --- Initialize Selenium for dynamic pages ---
def init_selenium_driver():
    service = Service(executable_path=EDGE_DRIVER_PATH)
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    
    # options.add_argument(f'user-agent={HEADERS["User-Agent"]}')
    driver = webdriver.Edge(service=service, options=options)
    return  driver # webdriver.Chrome(options=options)

# --- Fetch Content with Retry and Selenium Fallback ---
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def fetch_url_content_requests(url, stream=False):
    """Fetches content from a URL with retry and error handling."""
    if not robot_parser.can_fetch(HEADERS['User-Agent'], url):
        logger.warning(f"URL disallowed by robots.txt: {url}")
        return None
    logger.info(f"Fetching: {url}")
    try:
        response = requests.get(url, headers=HEADERS, timeout=15, stream=stream)
        response.raise_for_status()
        return response
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url} with requests: {e}")
        # Try Selenium for dynamic content
        try:
            driver = init_selenium_driver()
            driver.get(url)
            time.sleep(2)  # Wait for page to load
            content = driver.page_source
            driver.quit()
            return type('obj', (object,), {'content': content, 'status_code': 200})
        except Exception as se:
            logger.error(f"Selenium failed for {url}: {se}")
            return None
    finally:
        if not stream:
            time.sleep(REQUEST_DELAY)

# --- Download PDF ---
def download_pdf(url, download_dir):
    """Downloads a PDF file to the specified directory."""
    if not robot_parser.can_fetch(HEADERS['User-Agent'], url):
        logger.warning(f"PDF URL disallowed by robots.txt: {url}")
        return None
    try:
        response = fetch_url_content_requests(url, stream=True)
        if response and response.status_code == 200:
            filename = os.path.join(download_dir, url.split('/')[-1])
            with open(filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
            logger.info(f"Downloaded PDF: {filename}")
            return filename
        else:
            logger.error(f"Failed to download PDF: {url}")
            return None
    except Exception as e:
        logger.error(f"Error downloading PDF {url}: {e}")
        return None

# --- Extract Text from PDF ---
def extract_pdf_text(pdf_path):
    """Extracts text from a PDF file using pdfplumber."""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            text = ""
            for page in pdf.pages:
                text += page.extract_text() or ""
        return text
    except Exception as e:
        logger.error(f"Error extracting text from PDF {pdf_path}: {e}")
        return ""

# --- Parse Sitemap to Get URLs ---
def get_all_page_urls_from_sitemap_index(sitemap_urls):
    """Parses multiple sitemap index files to return page URLs."""
    all_page_urls = set()
    namespace = {'sitemap': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

    for sitemap_url in sitemap_urls:
        logger.info(f"Fetching sitemap: {sitemap_url}")
        response = fetch_url_content_requests(sitemap_url)
        if not (response and response.content):
            logger.error(f"Failed to fetch sitemap: {sitemap_url}")
            continue

        try:
            root = ET.fromstring(response.content)
            if root.tag == f"{{{namespace['sitemap']}}}sitemapindex":
                nested_sitemap_locs = [
                    s_loc.text for sitemap_tag in root.findall('sitemap:sitemap', namespace)
                    if (s_loc := sitemap_tag.find('sitemap:loc', namespace)) is not None and s_loc.text
                ]
                for nested_url in nested_sitemap_locs:
                    logger.info(f"  Fetching nested sitemap: {nested_url}")
                    nested_response = fetch_url_content_requests(nested_url)
                    if nested_response and nested_response.content:
                        try:
                            nested_root = ET.fromstring(nested_response.content)
                            if nested_root.tag == f"{{{namespace['sitemap']}}}urlset":
                                for url_element in nested_root.findall('sitemap:url', namespace):
                                    page_loc_element = url_element.find('sitemap:loc', namespace)
                                    if page_loc_element is not None and page_loc_element.text:
                                        all_page_urls.add(page_loc_element.text)
                        except ET.ParseError as e:
                            logger.error(f"Error parsing nested sitemap XML {nested_url}: {e}")
            elif root.tag == f"{{{namespace['sitemap']}}}urlset":
                for url_element in root.findall('sitemap:url', namespace):
                    page_loc_element = url_element.find('sitemap:loc', namespace)
                    if page_loc_element is not None and page_loc_element.text:
                        all_page_urls.add(page_loc_element.text)
            else:
                logger.error(f"Unknown root tag in sitemap {sitemap_url}: {root.tag}")
        except ET.ParseError as e:
            logger.error(f"Error parsing sitemap XML {sitemap_url}: {e}")

    return list(all_page_urls)

# --- Filter URLs by Section ---
def filter_urls_by_section(target_page_urls, sections, max_per_section):
    """Filters URLs by section and limits the number per section."""
    section_counts = {section: 0 for section in sections}
    filtered_urls = []
    pdf_urls = []

    for url in target_page_urls:
        if not robot_parser.can_fetch(HEADERS['User-Agent'], url):
            logger.warning(f"URL disallowed by robots.txt: {url}")
            continue
        parsed = urlparse(url)
        path = parsed.path
        if path.startswith(ABOUT_PATH_PREFIX):
            rest = path[len(ABOUT_PATH_PREFIX):]
            section = rest.split('/', 1)[0] if '/' in rest else rest
            if section in sections and section_counts[section] < max_per_section:
                filtered_urls.append(url)
                section_counts[section] += 1
        elif path.endswith('.pdf') and ('/about/files/' in path or '/about/file/' in path):
            pdf_urls.append(url)

    for section, count in section_counts.items():
        logger.info(f"{section}: {count} URLs")
    logger.info(f"Found {len(pdf_urls)} PDF URLs")
    return filtered_urls, pdf_urls

# --- Extract Text with Trafilatura ---
def extract_text_with_trafilatura(urls):
    """Extracts text from URLs using Trafilatura."""
    results = []
    for url in tqdm(urls, desc="Extracting text from HTML"):
        try:
            response = fetch_url_content_requests(url)
            if response and response.status_code == 200:
                extracted = trafilatura.extract(
                    response.content,
                    include_comments=False,
                    include_tables=True,
                    include_images=False,
                    include_links=False,
                    output_format="markdown" if TRAFILATURA_MARKDOWN else "txt"
                )
                results.append({'url': url, 'content': extracted, 'type': 'html'})
            else:
                results.append({'url': url, 'content': None, 'type': 'html'})
        except Exception as e:
            logger.error(f"Error processing {url}: {e}")
            results.append({'url': url, 'content': None, 'type': 'html'})
    return results

# --- Preprocess Text with spaCy ---
def preprocess_text(text):
    """Cleans and preprocesses text using spaCy."""
    if not text:
        return ""
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

# --- Extract Entities ---
def extract_entities(text):
    """Extracts entities using spaCy NER."""
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

# --- Extract Cross-Document Relationships ---
def extract_cross_document_relationships(graph, llm, documents):
    """Infers relationships across documents using LLM prompts."""
    logger.info("\nExtracting cross-document relationships...")
    cross_doc_rels = []

    prompt_template = """
    Given a list of entities from different documents, identify potential relationships between them.
    Entities: {entities}
    Output format: {{"source": "entity1", "target": "entity2", "relationship": "type"}}
    """
    entities = set()
    for doc in documents:
        entities.update([ent[0] for ent in extract_entities(doc.page_content)])

    entities_list = list(entities)
    if len(entities_list) > 1:
        prompt = prompt_template.format(entities=", ".join(entities_list))
        try:
            response = llm.invoke(prompt)
            if response:
                try:
                    rels = json.loads(response)
                    cross_doc_rels.extend(rels)
                except json.JSONDecodeError:
                    logger.error("Error parsing LLM response for cross-document relationships.")
        except Exception as e:
            logger.error(f"Error invoking LLM for cross-document relationships: {e}")

    for rel in cross_doc_rels:
        query = (
            f"MERGE (s {{name: $source, type: 'Entity'}}) "
            f"MERGE (t {{name: $target, type: 'Entity'}}) "
            f"MERGE (s)-[r:{rel['relationship']}]->(t)"
        )
        graph.query(query, params={"source": rel["source"], "target": rel["target"]})

    logger.info("Cross-document relationships added to graph.")

# --- Enrich Graph ---
def enrich_graph(graph):
    """Enriches the graph by inferring additional relationships."""
    logger.info("\nEnriching graph with inferred relationships...")
    query = """
    MATCH (a)-[r1]->(b), (b)-[r2]->(c)
    WHERE NOT (a)-[:INFERRED]->(c)
    MERGE (a)-[:INFERRED {type: r1.type + '_' + r2.type}]->(c)
    """
    graph.query(query)
    logger.info("Graph enrichment complete.")


In [1]:
import torch
print(torch.version.cuda)
print(torch.cuda.is_available())

12.1
True


! TODO:  
* Use the LLM to do NER 

In [None]:

# --- Main Pipeline ---
def main():
    # Ensure PDF directory exists
    if not os.path.exists(PDF_DOWNLOAD_DIR):
        os.makedirs(PDF_DOWNLOAD_DIR)

    # Step 1: Scrape URLs
    target_page_urls = get_all_page_urls_from_sitemap_index(SITEMAP_URLS)
    if not target_page_urls:
        logger.error("No URLs extracted. Exiting.")
        return

    # Step 2: Filter URLs by section and robots.txt
    filtered_urls, pdf_urls = filter_urls_by_section(target_page_urls, SECTIONS_TO_KEEP, MAX_PAGES_PER_SECTION)
    logger.info(f"Filtered {len(filtered_urls)} HTML URLs and {len(pdf_urls)} PDF URLs for processing.")

    # Step 3: Download PDFs
    pdf_results = []
    for pdf_url in tqdm(pdf_urls, desc="Downloading PDFs"):
        pdf_path = download_pdf(pdf_url, PDF_DOWNLOAD_DIR)
        if pdf_path:
            text = extract_pdf_text(pdf_path)
            pdf_results.append({'url': pdf_url, 'content': text, 'type': 'pdf'})

    # Step 4: Extract text from HTML
    html_results = extract_text_with_trafilatura(filtered_urls)

    # Combine results
    trafilatura_results = html_results + pdf_results
    documents = []
    for result in trafilatura_results:
        if result.get('content'):
            cleaned_content = preprocess_text(result['content'])
            entities = extract_entities(cleaned_content)
            result['cleaned_content'] = cleaned_content
            result['entities'] = entities
            documents.append(Document(page_content=cleaned_content, metadata={'source': result['url'], 'type': result['type']}))
    logger.info(f"Prepared {len(documents)} documents for processing.")

    # Step 5: Extract graph data
    llm = Ollama(model="llama3.1", temperature=0)
    transformer = LLMGraphTransformer(llm=llm)
    logger.info(f"Extracting graph from {len(documents)} documents...")
    graph_documents = transformer.convert_to_graph_documents(documents)
    graph.add_graph_documents(graph_documents, baseEntityLabel=True, include_source=True)
    logger.info("Initial graph construction complete.")

    # Step 6: Extract cross-document relationships
    extract_cross_document_relationships(graph, llm, documents)

    # Step 7: Enrich graph
    enrich_graph(graph)

# if __name__ == "__main__":
    # main()

AIzaSyAs3v1lR2P8oUwuOx1FKE3V2Z7JgwEIKAQ

## Now generating the Knowledge Graph

In [24]:
# import os
# import json
# from langchain_community.llms import Ollama
# # from langchain_experimental.graph_transformers import LLMGraphTransformer
# # from langchain_core.documents import Document

from langchain_ollama.llms import ChatOllama
from langchain_experimental.graph_transformers import LLMGraphTransformer
from pydantic import BaseModel

class KGOut(BaseModel):
    nodes: list[dict]
    relationships: list[dict]

llm = ChatOllama(model="llama3.1", temperature=0, format=KGOut.schema())
structured_llm = llm.with_structured_output(KGOut, method="json_schema")

ModuleNotFoundError: No module named 'langchain_core'

In [25]:
from langchain_ollama import ChatOllama, OllamaLLM

In [27]:
from langchain_ollama import OllamaLLM

llm = OllamaLLM(model="llama3.2")
response = llm.invoke("Who discovered penicillin?")
print(response)


Penicillin was discovered by Scottish scientist Alexander Fleming in 1928. He observed that a mold, later identified as Penicillium notatum, had contaminated one of his bacterial cultures and had killed off the surrounding bacteria. This discovery led to the development of penicillin as a medicine, which revolutionized the treatment of bacterial infections.


In [28]:
# Best practice is to import from langchain_community
from langchain_community.llms import Ollama

try:
    # Use the EXACT name from the 'ollama list' command
    llm = Ollama(model="llama3.2")

    print("Successfully connected to local Ollama LLM.")
    print("Invoking model...")

    # Now, invoke the model
    response = llm.invoke("Who discovered penicillin?")
    
    print("\n--- Model Response ---")
    print(response)

except Exception as e:
    print("\n--- An Error Occurred ---")
    print("Error:", e)
    print("\nPlease ensure the Ollama application is running and you have pulled the model by running:")
    print("ollama pull llama3.1:8b-instruct")

  llm = Ollama(model="llama3.2")


Successfully connected to local Ollama LLM.
Invoking model...

--- Model Response ---
Penicillin was discovered by Scottish scientist Alexander Fleming in 1928. He observed that a mold, later identified as Penicillium notatum, had contaminated one of his bacterial cultures and was killing off the surrounding bacteria. This discovery led to the development of penicillin as an antibiotic and revolutionized the treatment of bacterial infections.


In [20]:
from langchain_ollama import ChatOllama
from pydantic import BaseModel

class KGSchema(BaseModel):
    nodes: list[dict]
    relationships: list[dict]

llm = ChatOllama(model="llama3.2", temperature=0)
structured_llm = llm.with_structured_output(KGSchema, method="json_schema")
print("✅ Ollama with structured output is ready.")

✅ Ollama with structured output is ready.


In [57]:
len(trafilatura_results)

20

In [None]:
# --- DISCOVERY SCRIPT ---
import json
from langchain_community.llms import Ollama
from langchain_ollama import ChatOllama, OllamaLLM
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document

# --- Use your scraped content for the 8-10 sample pages ---
# For this example, I'm using a placeholder.
# You would load the content for your sample URLs here.
# sample_documents_content = [
#     {"url": "sample_investor_page_url", "content": "CEO Hans Vestberg announced Q4 earnings..."},
#     {"url": "sample_responsibility_page_url", "content": "Our partnership with Clean Energy Inc. focuses on reducing carbon emissions..."},
#     {"url": "sample_privacy_page_url", "content": "The privacy policy governs the use of customer data, including location information..."},
#     # ... add content from your other sample pages
# ]

# --- Connect to your local LLM (Ollama) ---
try:
    llm = OllamaLLM(model="llama3.1", temperature=0)
    print("✅ Successfully connected to local Ollama LLM (llama3.1).")
except Exception as e:
    print(f"❌ Error connecting to Ollama: {e}")
    exit()

# --- KEY CHANGE: Run the transformer with NO constraints ---
# This allows the LLM to extract any type of node or relationship it finds.
unconstrained_transformer = LLMGraphTransformer(llm=llm)

# Convert your sample content into LangChain Document objects
documents_to_process = [
    Document(page_content=item['content'], metadata={'source': item['url']})
    for item in trafilatura_results if item.get('content')
]

# Extract the graph data
print(f"\n🧠 Starting UNCONSTRAINED graph extraction from {len(documents_to_process)} sample documents...")
graph_documents = unconstrained_transformer.convert_to_graph_documents(documents_to_process)
print("✅ Unconstrained extraction complete.")

# --- Collect all unique node and relationship types ---
discovered_node_types = set()
discovered_relationship_types = set()

for doc in graph_documents:
    for node in doc.nodes:
        discovered_node_types.add(node.type)
    for rel in doc.relationships:
        discovered_relationship_types.add(rel.type)

print("\n--- Discovered Schema Elements ---")
print("\nDiscovered Node Types:")
for node_type in sorted(list(discovered_node_types)):
    print(f"- {node_type}")

print("\nDiscovered Relationship Types:")
for rel_type in sorted(list(discovered_relationship_types)):
    print(f"- {rel_type}")

  llm = Ollama(model="llama3.1", temperature=0)


✅ Successfully connected to local Ollama LLM (llama3.1).

🧠 Starting UNCONSTRAINED graph extraction from 3 sample documents...
✅ Unconstrained extraction complete.

--- Discovered Schema Elements ---

Discovered Node Types:
- Characteristic
- Company
- Concept
- Content
- Event
- Group
- Infrastructure
- Initiative
- Innovation
- Location
- Network
- Organization
- Person
- Program
- Publication
- Service
- Statement
- Team
- Technology

Discovered Relationship Types:
- AFFECTS
- CREATE_CONTENT
- ENABLED_BY
- HELPS
- LOCATED_IN
- NEEDED_BY
- PROVIDES
- PROVIDES_SUPPORT
- PUBLISHED_IN
- RECOGNIZED_BY
- SPONSORED
- STATEMENT_MADE
- SUPPORTED_BY
- USED_BY
- WORKING_ON
- WORKS_FOR
- WORKS_WITH


In [22]:
discovered_node_types_list = list(discovered_node_types)
discovered_relationship_types_list = list(discovered_relationship_types)

In [None]:
import os
import json
from langchain_community.llms import Ollama
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate # <-- IMPORT THE PROMPT TEMPLATE
from langchain_ollama import OllamaLLM

from pydantic import BaseModel

class KGSchema(BaseModel):
    nodes: list[dict]
    relationships: list[dict]

# --- This is the 'trafilatura_results' list from your previous script ---

# In your actual workflow, you would load this from a file or use the variable directly.


# --- Sub-task 1: Setup the Local LLM ---
# This code connects to the Ollama application you have running locally.
try:
    llm = OllamaLLM(model="llama3.1", temperature=0)
    print("✅ Successfully connected to local Ollama LLM (llama3.1).")

except Exception as e:
    print("❌ Error connecting to Ollama.")
    print("   Please ensure the Ollama application is running and you have pulled the 'llama3.1' model by running:")
    print("   ollama pull llama3.1")
    print(f"   Error details: {e}")
    exit()

# --- Sub-task 2: Define Your Knowledge Graph Schema ---


# --- Sub-task 3: Initialize the LangChain Graph Transformer ---

# --- FIX APPLIED HERE ---
# We create a PromptTemplate object from our string.
# The 'input' variable is what the LLMGraphTransformer will use to pass the text.
prompt_template_str = """
You are an expert knowledge graph engineer.
- Your task is to extract entities and their relationships from the provided text.
- Extract only the entities and relationships that match the predefined schema.
- Do not extract any information that is not explicitly mentioned in the text.
- Example:
    Text: "Elon Musk founded SpaceX. SpaceX is located in California."
    Output:
    {{
        "nodes": [
            {{"id": "Elon Musk", "type": "Person"}},
            {{"id": "SpaceX", "type": "Organization"}},
            {{"id": "California", "type": "Location"}}
        ],
        "relationships": [
            {{"source": "Elon Musk", "target": "SpaceX", "type": "FOUNDED"}},
            {{"source": "SpaceX", "target": "California", "type": "LOCATED_IN"}}
        ]
    }}

Text:
{input}
"""

# Create a PromptTemplate object
graph_prompt = PromptTemplate.from_template(prompt_template_str)


# # Now, initialize the transformer with the PromptTemplate object
# llm_transformer = LLMGraphTransformer(
#     llm=llm,
#     allowed_nodes=discovered_node_types_list,
#     allowed_relationships=discovered_relationship_types_list,
#     prompt=graph_prompt # Pass the PromptTemplate object here
# )

llm_transformer = LLMGraphTransformer(
    llm=llm,
    
    allowed_nodes=discovered_node_types_list,
    allowed_relationships=discovered_relationship_types_list,
    strict_mode=True
) #output_schema=KGSchema,

# --- Sub-task 4: Process Your Documents and Extract the Graph ---
documents_to_process = []
for result in trafilatura_results:
    if result.get('content'): 
        documents_to_process.append(
            Document(page_content=result['content'], metadata={'source': result['url']})
        )

print(f"\n🧠 Starting graph extraction from {len(documents_to_process)} documents...")
graph_documents = llm_transformer.convert_to_graph_documents(documents_to_process)
print("✅ Graph extraction complete.")


✅ Successfully connected to local Ollama LLM (llama3.1).

🧠 Starting graph extraction from 3 documents...
✅ Graph extraction complete.


In [24]:
for g in graph_documents:
    print("Nodes:", g.nodes)
    print("Relationships:", g.relationships)

Nodes: [Node(id='systems and people that you rely on every day', type='Infrastructure', properties={}), Node(id='people that you rely on every day', type='Group', properties={}), Node(id='entrepreneurs and businesses', type='Group', properties={}), Node(id='big, bold ideas quickly to market', type='Concept', properties={}), Node(id='emergency response teams', type='Team', properties={}), Node(id='5G Labs', type='Location', properties={})]
Relationships: [Relationship(source=Node(id='emergency response teams', type='Team', properties={}), target=Node(id='people that you rely on every day', type='Group', properties={}), type='PROVIDES_SUPPORT', properties={}), Relationship(source=Node(id='systems and people that you rely on every day', type='Infrastructure', properties={}), target=Node(id='emergency response teams', type='Team', properties={}), type='NEEDED_BY', properties={}), Relationship(source=Node(id='entrepreneurs and businesses', type='Group', properties={}), target=Node(id='big, 

In [None]:

# --- Sub-task 5: Aggregate, Inspect, and Save the Results ---
all_nodes = []
all_relationships = []
node_ids = set()

for doc in graph_documents:
    for node in doc.nodes:
        if node.id not in node_ids:
            all_nodes.append(node.dict())
            node_ids.add(node.id)
    for rel in doc.relationships:
        all_relationships.append(rel.dict())

output_file_path = "extracted_knowledge_graph.json"
with open(output_file_path, 'w', encoding='utf-8') as f:
    graph_data = {
        "nodes": all_nodes,
        "relationships": all_relationships
    }
    json.dump(graph_data, f, indent=2, ensure_ascii=False)

print("\n📊 Extracted Knowledge Graph Data:")
print(f"   - Total Nodes: {len(all_nodes)}")
print(f"   - Total Relationships: {len(all_relationships)}")
print(f"\n💾 Saved aggregated graph data to '{output_file_path}'")

print("\n--- Sample of Extracted Data ---")
print("Nodes Sample:", json.dumps(all_nodes[:3], indent=2))
print("Relationships Sample:", json.dumps(all_relationships[:3], indent=2))




📊 Extracted Knowledge Graph Data:
   - Total Nodes: 16
   - Total Relationships: 8

💾 Saved aggregated graph data to 'extracted_knowledge_graph.json'

--- Sample of Extracted Data ---
Nodes Sample: [
  {
    "id": "systems and people that you rely on every day",
    "type": "Infrastructure",
    "properties": {}
  },
  {
    "id": "people that you rely on every day",
    "type": "Group",
    "properties": {}
  },
  {
    "id": "entrepreneurs and businesses",
    "type": "Group",
    "properties": {}
  }
]
Relationships Sample: [
  {
    "source": {
      "id": "emergency response teams",
      "type": "Team",
      "properties": {}
    },
    "target": {
      "id": "people that you rely on every day",
      "type": "Group",
      "properties": {}
    },
    "type": "PROVIDES_SUPPORT",
    "properties": {}
  },
  {
    "source": {
      "id": "systems and people that you rely on every day",
      "type": "Infrastructure",
      "properties": {}
    },
    "target": {
      "id": "emer

C:\Users\shada\AppData\Local\Temp\ipykernel_27836\1569934168.py:9: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  all_nodes.append(node.dict())
C:\Users\shada\AppData\Local\Temp\ipykernel_27836\1569934168.py:12: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  all_relationships.append(rel.dict())


In [37]:
import os
import json
from langchain_community.llms import Ollama
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document

# --- This is the 'trafilatura_results' list from your previous script ---
# For this example to be runnable, I'm creating a sample based on your description.
# In your actual workflow, you would load this from a file or use the variable directly.


# --- Sub-task 1: Setup the Local LLM ---
# This code connects to the Ollama application you have running locally.
# It's completely free to use.
try:
    llm = Ollama(model="llama3.1", temperature=0)
    print("✅ Successfully connected to local Ollama LLM (llama3.2).")
except Exception as e:
    print("❌ Error connecting to Ollama.")
    print("   Please ensure the Ollama application is running and you have pulled the 'llama3.2' model.")
    print(f"   Error details: {e}")
    # We'll exit here if the LLM isn't available, as the rest of the script depends on it.
    exit()

# --- Sub-task 2: Define Your Knowledge Graph Schema ---
# This is a critical step. You are telling the LLM exactly what kind of
# information to look for. You can and should customize this.
# allowed_nodes = ["Person", "Organization", "Product", "Technology", "Location", "Date", "Project"]
# allowed_relationships = [
#     "WORKS_AT", "CEO_OF", "FOUNDED", "PARTNERS_WITH", 
#     "LOCATED_IN", "ACQUIRED", "USES_TECHNOLOGY", 
#     "ANNOUNCED_PRODUCT", "PART_OF", "ANNOUNCED_ON"
# ]

# --- Sub-task 3: Initialize the LangChain Graph Transformer ---
# This tool orchestrates the extraction process. We provide it with our local LLM
# instance and the schema we want it to follow.
llm_transformer = LLMGraphTransformer(
    llm=llm,
    allowed_nodes=discovered_node_types_list,
    allowed_relationships=discovered_relationship_types_list,
    # The prompt can be customized for more complex instructions
    prompt="""
    You are an expert knowledge graph engineer.
    - Your task is to extract entities and their relationships from the provided text.
    - Extract only the entities and relationships that match the predefined schema.
    - Do not extract any information that is not explicitly mentioned in the text.
    - Example:
        Text: "Elon Musk founded SpaceX. SpaceX is located in California."
        Output:
        {
            "nodes": [
                {"id": "Elon Musk", "type": "Person"},
                {"id": "SpaceX", "type": "Organization"},
                {"id": "California", "type": "Location"}
            ],
            "relationships": [
                {"source": "Elon Musk", "target": "SpaceX", "type": "FOUNDED"},
                {"source": "SpaceX", "target": "California", "type": "LOCATED_IN"}
            ]
        }
    """
)

# --- Sub-task 4: Process Your Documents and Extract the Graph ---
# We convert your scraped content into LangChain's Document format
# and then run the transformer.
documents_to_process = []
for result in trafilatura_results:
    if result.get('content'): # Process only if content exists
        documents_to_process.append(
            Document(page_content=result['content'], metadata={'source': result['url']})
        )

# The LLM does its work here. This may take some time depending on your hardware.
print(f"\n🧠 Starting graph extraction from {len(documents_to_process)} documents...")
graph_documents = llm_transformer.convert_to_graph_documents(documents_to_process)
print("✅ Graph extraction complete.")

# --- Sub-task 5: Aggregate, Inspect, and Save the Results ---
# We now have a list of 'GraphDocument' objects. Let's combine them into one
# master list of nodes and relationships, ready for the next phase.

all_nodes = []
all_relationships = []
node_ids = set()

for doc in graph_documents:
    for node in doc.nodes:
        # Simple entity resolution: if we've seen a node with this ID, skip it
        if node.id not in node_ids:
            all_nodes.append(node.dict())
            node_ids.add(node.id)
    for rel in doc.relationships:
        all_relationships.append(rel.dict())

# Save the aggregated graph data to a JSON file
# This file will be the input for our next script, which loads data into Neo4j.
output_file_path = "extracted_knowledge_graph.json"
with open(output_file_path, 'w', encoding='utf-8') as f:
    graph_data = {
        "nodes": all_nodes,
        "relationships": all_relationships
    }
    json.dump(graph_data, f, indent=2, ensure_ascii=False)

print(f"\n📊 Extracted Knowledge Graph Data:")

✅ Successfully connected to local Ollama LLM (llama3.2).


TypeError: Expected a Runnable, callable or dict.Instead got an unsupported type: <class 'str'>

### A simple Knowledge Graph builder

In [None]:
import os
import time
from fastapi import FastAPI, HTTPException
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain_core.output_parsers import StrOutputParser
# from langchain_community.graphs import Neo4jGraph
from langchain_neo4j import Neo4jGraph
from langchain_community.chat_models import ChatOllama
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_community.vectorstores import Neo4jVector
from langchain_core.documents import Document
from langchain_ollama import OllamaEmbeddings
from langchain_experimental.llms.ollama_functions import OllamaFunctions

In [42]:
from langchain_neo4j import Neo4jGraph
graph = Neo4jGraph(
    url= "neo4j://127.0.0.1:7687" ,
    username="neo4j", #default
    password="neo4j_password" #change accordingly
)

In [18]:
# --- DISCOVERY SCRIPT ---
import json
from langchain_community.llms import Ollama
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document

# --- Use your scraped content for the 8-10 sample pages ---
# For this example, I'm using a placeholder.
# You would load the content for your sample URLs here.
# sample_documents_content = [
#     {"url": "sample_investor_page_url", "content": "CEO Hans Vestberg announced Q4 earnings..."},
#     {"url": "sample_responsibility_page_url", "content": "Our partnership with Clean Energy Inc. focuses on reducing carbon emissions..."},
#     {"url": "sample_privacy_page_url", "content": "The privacy policy governs the use of customer data, including location information..."},
#     # ... add content from your other sample pages
# ]

# --- Connect to your local LLM (Ollama) ---
try:
    llm = Ollama(model="llama3.1", temperature=0)
    print("✅ Successfully connected to local Ollama LLM (llama3.1).")
except Exception as e:
    print(f"❌ Error connecting to Ollama: {e}")
    exit()

# --- KEY CHANGE: Run the transformer with NO constraints ---
# This allows the LLM to extract any type of node or relationship it finds.
unconstrained_transformer = LLMGraphTransformer(llm=llm)

# Convert your sample content into LangChain Document objects
documents_to_process = [
    Document(page_content=item['content'], metadata={'source': item['url']})
    for item in trafilatura_results if item.get('content')
]

# Extract the graph data
print(f"\n🧠 Starting UNCONSTRAINED graph extraction from {len(documents_to_process)} sample documents...")
graph_documents = unconstrained_transformer.convert_to_graph_documents(documents_to_process)
print("✅ Unconstrained extraction complete.")

# --- Collect all unique node and relationship types ---
discovered_node_types = set()
discovered_relationship_types = set()

for doc in graph_documents:
    for node in doc.nodes:
        discovered_node_types.add(node.type)
    for rel in doc.relationships:
        discovered_relationship_types.add(rel.type)

print("\n--- Discovered Schema Elements ---")
print("\nDiscovered Node Types:")
for node_type in sorted(list(discovered_node_types)):
    print(f"- {node_type}")

print("\nDiscovered Relationship Types:")
for rel_type in sorted(list(discovered_relationship_types)):
    print(f"- {rel_type}")

✅ Successfully connected to local Ollama LLM (llama3.1).

🧠 Starting UNCONSTRAINED graph extraction from 18 sample documents...


  llm = Ollama(model="llama3.1", temperature=0)


✅ Unconstrained extraction complete.

--- Discovered Schema Elements ---

Discovered Node Types:
- Analysis
- Audio
- City
- Companies
- Company
- Component
- Composition
- Conference
- Date
- Dividend Payment Methods
- Document
- ESG Indicators
- Earnings Results
- Email
- Emissions
- Entity
- Event
- Exposure
- Funding
- GHG Emissions
- Group
- Information
- Investment
- Issue
- Law
- Market
- Organization
- Person
- Plan
- Policies
- Policy
- Position
- Practice
- Practices
- Principles
- Product
- Program
- Publication
- Report
- Result
- State
- Statement
- Stock Exchange
- Strategy
- Symbol
- Time
- Transaction
- Use
- Website

Discovered Relationship Types:
- ADOPTS
- APPLIES_TO
- APPROVES_CAPITAL_PLAN
- CAN_RECEIVE_DIVIDENDS
- CAN_REQUEST_REPLACEMENT_STATEMENT
- CEO_OF
- CHAIRMAN_AND_CEO
- COMMISSIONED_BY
- CONTAINS
- CONTAINS_SAFE_HARBOR
- DESCRIBES
- EMPLOYER_OF
- EXECUTIVE
- FORMED_BY
- FUNDS_PENSION_OBLIGATIONS
- HAS_CAPITAL_NEEDS
- HAS_DERIVATIVES_POLICY
- HAS_INSURANCE_PR

In [None]:
# The function ingestion handles the conversion of this text into graph documents, which are then added to the Neo4j database:
def ingestion(text):

    # Convert the text into documents
    documents = [Document(page_content=text)]

    # Initialize the language model for text-to-graph conversion
    llm = ChatOllama(model="llama3", temperature=0)
    llm_transformer_filtered = LLMGraphTransformer(llm=llm)
    
    # Convert the text into graph documents
    graph_documents = llm_transformer_filtered.convert_to_graph_documents(documents)

    # Add the generated graph into Neo4j
    graph.add_graph_documents(
        graph_documents,
        baseEntityLabel=True,
        include_source=True
    )
    
    # Optional: Create embeddings for more complex search queries
    embed = OllamaEmbeddings(model="mxbai-embed-large")
    vector_index = Neo4jVector.from_existing_graph(
        embedding=embed,
        search_type="hybrid",
        node_label="Document",
        text_node_properties=["text"],
        embedding_node_property="embedding"
    )
    vector_retriever = vector_index.as_retriever()
    
    
    
    
def querying_neo4j(question):
     
    # Define a model for the extracted entities from the text
    class Entities(BaseModel):
        names: list[str] = Field(..., description="All entities from the text")

    # Define a prompt to extract entities from the input query
    prompt = ChatPromptTemplate.from_messages([ 
        ("system", "Extract organization and person entities from the text."),
        ("human", "Extract entities from: {question}")
    ])
    
    # Initialize the Ollama model for entity extraction with LLM (using "llama3")
    llm = OllamaFunctions(model="llama3", format="json", temperature=0)

    # Combine the prompt and LLM to create an entity extraction chain
    # The output is structured to match the "Entities" model
    entity_chain = prompt | llm.with_structured_output(Entities, include_raw=True)

    # Function to retrieve relationships of the extracted entities from Neo4j
    def graph_retriever(question: str) -> str:
        # Use the entity extraction chain to get entities from the question
        response = entity_chain.invoke({"question": question})
        # Extract the list of entity names from the response
        entities = response['raw'].tool_calls[0]['args']['properties']['names']
        print("Retreived Entities")
        print(entities)
        result = ""  # Initialize a variable to store the result

        # Iterate over each extracted entity
        for entity in entities:
            # Query Neo4j to get relationships for the given entity
            query_response = graph.query(
                """MATCH (p:Person {id: $entity})-[r]->(e)
                RETURN p.id AS source_id, type(r) AS relationship, e.id AS target_id
                LIMIT 50""",
                {"entity": entity}
            )
            # Format the query results and append to the result string
            result += "\n".join([f"{el['source_id']} - {el['relationship']} -> {el['target_id']}" for el in query_response])
        
        # Return the formatted results containing entity relationships
        return result
    
def querying_ollama(question):
    # Define a function that combines data retrieved from both Neo4j and vector embeddings
    def full_retriever(question: str):
        # Retrieve graph data for the question using the graph_retriever function
        graph_data = graph_retriever(question)
        print("Graph Data")
        print(graph_data)
        # Retrieve vector data by invoking the vector retriever with the question
        vector_data = [el.page_content for el in vector_retriever.invoke(question)]
        
        # Combine the graph data and vector data into a formatted string
        return f"Graph data: {graph_data}\nVector data: {'#Document '.join(vector_data)}"

    # Define a prompt template for generating a response based on context
    template = """Answer the question based only on the following context:
    {context}
    Question: {question}
    Answer:"""
    
    # Create a prompt from the template, which takes the context and question as input
    prompt = ChatPromptTemplate.from_template(template)
    
    # Create a processing chain that:
    # 1. Generates context using the full_retriever function
    # 2. Passes through the question as-is using RunnablePassthrough
    # 3. Applies the prompt template to generate the final question
    # 4. Uses the LLM (language model) to generate the answer
    # 5. Uses StrOutputParser to format the output as a string
    chain = (
        {
            "context": lambda input: full_retriever(input),  # Generate context from the question
            "question": RunnablePassthrough(),  # Pass the question through without modification
        }
        | prompt  # Apply the prompt template
        | llm  # Use the language model to answer the question based on context
        | StrOutputParser()  # Parse the model's response as a string
    )

In [19]:
trafilatura_results

[{'url': 'https://www.verizon.com/about/investors/sellside-analyst-meeting',
  'content': '# Sellside Analyst Meeting\n\nVerizon is hosting a sellside analyst meeting with Verizon Chairman and CEO Lowell McAdam as well as Tim Armstrong, John Stratton, Hans Vestberg, Matt Ellis and other senior executives.\n\nNOTE: In this presentation we have made forward-looking statements. These statements are based on our estimates and assumptions and are subject to risks and uncertainties. Forward-looking statements include the information concerning our possible or assumed future results of operations. Forward-looking statements also include those preceded or followed by the words “anticipates,” “believes,” “estimates,” “expects,” “hopes” or similar expressions. For those statements, we claim the protection of the safe harbor for forward-looking statements contained in the Private Securities Litigation Reform Act of 1995. We undertake no obligation to revise or publicly release the results of any 

In [23]:
from langchain_core.documents import Document

# Convert trafilatura_results to a list of LangChain Document objects (only if 'content' exists)
trafilatura_documents = [
    Document(page_content=item['content'], metadata={'source': item['url']})
    for item in trafilatura_results if item.get('content')
]

In [26]:
scraped_data = [item['content'] for item in trafilatura_results if item.get('content')]

In [27]:
scraped_data

['# Sellside Analyst Meeting\n\nVerizon is hosting a sellside analyst meeting with Verizon Chairman and CEO Lowell McAdam as well as Tim Armstrong, John Stratton, Hans Vestberg, Matt Ellis and other senior executives.\n\nNOTE: In this presentation we have made forward-looking statements. These statements are based on our estimates and assumptions and are subject to risks and uncertainties. Forward-looking statements include the information concerning our possible or assumed future results of operations. Forward-looking statements also include those preceded or followed by the words “anticipates,” “believes,” “estimates,” “expects,” “hopes” or similar expressions. For those statements, we claim the protection of the safe harbor for forward-looking statements contained in the Private Securities Litigation Reform Act of 1995. We undertake no obligation to revise or publicly release the results of any revision to these forward-looking statements, except as required by law. Given these risk

In [None]:
# --- STEP 1: INGESTION SCRIPT ---
import json
from langchain_community.llms import Ollama
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document
from neo4j import GraphDatabase

# --- Neo4j Connection Details ---
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "neo4j_password" # Change this!


# --- Load your scraped data ---
# (Assuming you have a file 'trafilatura_results.json' from your scraper)
# try:
#     with open('trafilatura_results.json', 'r', encoding='utf-8') as f:
#         scraped_data = json.load(f)
# except FileNotFoundError:
#     print("Error: 'trafilatura_results.json' not found. Please run your scraping script first.")
#     scraped_data = [item['content'] for item in trafilatura_results if item.get('content')]
    # exit()

# --- 1. Setup the Local LLM ---
llm = Ollama(model="llama3.1", temperature=0)

# --- 2. Define Your Final Schema (after discovery) ---
# allowed_nodes = ["Person", "Organization", "Product", "Technology", "Location", "Date", "Project"]
# allowed_relationships = ["WORKS_AT", "CEO_OF", "FOUNDED", "PARTNERS_WITH", "LOCATED_IN", "ACQUIRED", "USES_TECHNOLOGY", "ANNOUNCED_PRODUCT", "PART_OF", "ANNOUNCED_ON"]
allowed_nodes = list(discovered_node_types)
allowed_relationships = list(discovered_relationship_types)

# --- 3. Initialize the CONSTRAINED Graph Transformer ---
llm_transformer = LLMGraphTransformer(
    llm=llm,
    allowed_nodes=allowed_nodes,
    allowed_relationships=allowed_relationships
)

# --- 4. Process All Documents to Extract Graph Data ---
documents_to_process = [
    Document(page_content=item['content'], metadata={'source': item['url']})
    for item in trafilatura_results if item.get('content')
]

print(f"🧠 Starting graph extraction from {len(documents_to_process)} documents...")
# This runs the LLM on all your documents. It will take time.
graph_documents = llm_transformer.convert_to_graph_documents(documents_to_process)
print("✅ Graph extraction complete.")


🧠 Starting graph extraction from 18 documents...
✅ Graph extraction complete.

📊 Aggregated Knowledge Graph Data:
   - Total Unique Nodes: 113
   - Total Relationships: 97

🧹 Clearing the Neo4j database...


C:\Users\shada\AppData\Local\Temp\ipykernel_6076\3354456785.py:59: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  all_nodes.append(node.dict())
C:\Users\shada\AppData\Local\Temp\ipykernel_6076\3354456785.py:62: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  all_relationships.append(rel.dict())


🚀 Uploading 113 nodes and 97 relationships...
✅ Batch upload complete.

🎉 Knowledge Graph construction and ingestion complete!


In [41]:

# --- 5. Aggregate All Nodes and Relationships ---
all_nodes = []
all_relationships = []
node_ids = set()

for doc in graph_documents:
    for node in doc.nodes:
        if node.id not in node_ids:
            all_nodes.append(node.dict())
            node_ids.add(node.id)
    for rel in doc.relationships:
        all_relationships.append(rel.dict())

print(f"\n📊 Aggregated Knowledge Graph Data:")
print(f"   - Total Unique Nodes: {len(all_nodes)}")
print(f"   - Total Relationships: {len(all_relationships)}")

# --- 6. Ingest into Neo4j (using the batch method) ---
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

with driver.session() as session:
    # Clear the database for a fresh start
    print("\n🧹 Clearing the Neo4j database...")
    session.run("MATCH (n) DETACH DELETE n")

    # Ingest all nodes and relationships in a single transaction
    print(f"🚀 Uploading {len(all_nodes)} nodes and {len(all_relationships)} relationships...")
    ingest_query = """
        UNWIND $nodes AS node_data
        MERGE (n:Entity {id: node_data.id})
        SET n.type = node_data.type

        WITH collect(n) AS all_nodes
        UNWIND $relationships AS rel_data
        MATCH (source:Entity {id: rel_data.source})
        MATCH (target:Entity {id: rel_data.target})
        CALL apoc.create.relationship(source, rel_data.type, {}, target) YIELD rel
        RETURN count(rel)
        """
    session.run(ingest_query, nodes=all_nodes, relationships=all_relationships)
    print("✅ Batch upload complete.")

driver.close()
print("\n🎉 Knowledge Graph construction and ingestion complete!")

C:\Users\shada\AppData\Local\Temp\ipykernel_6076\997666722.py:9: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  all_nodes.append(node.dict())
C:\Users\shada\AppData\Local\Temp\ipykernel_6076\997666722.py:12: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  all_relationships.append(rel.dict())



📊 Aggregated Knowledge Graph Data:
   - Total Unique Nodes: 113
   - Total Relationships: 97

🧹 Clearing the Neo4j database...
🚀 Uploading 113 nodes and 97 relationships...
✅ Batch upload complete.

🎉 Knowledge Graph construction and ingestion complete!


In [43]:
graph.add_graph_documents(
    graph_documents,
    baseEntityLabel=True,
    include_source=True
)

In [44]:
graph_documents[0]

GraphDocument(nodes=[Node(id='Matt Ellis', type='Person', properties={}), Node(id='Lowell McAdam', type='Person', properties={}), Node(id='Verizon', type='Company', properties={}), Node(id='Hans Vestberg', type='Person', properties={}), Node(id='John Stratton', type='Person', properties={}), Node(id='Tim Armstrong', type='Person', properties={}), Node(id='sellside analyst meeting', type='Event', properties={})], relationships=[Relationship(source=Node(id='Verizon', type='Company', properties={}), target=Node(id='sellside analyst meeting', type='Event', properties={}), type='HOSTS_MEETING', properties={}), Relationship(source=Node(id='Lowell McAdam', type='Person', properties={}), target=Node(id='Verizon', type='Company', properties={}), type='CHAIRMAN_AND_CEO', properties={}), Relationship(source=Node(id='Tim Armstrong', type='Person', properties={}), target=Node(id='Verizon', type='Company', properties={}), type='EXECUTIVE', properties={}), Relationship(source=Node(id='John Stratton',

In [45]:
from langchain_neo4j import Neo4jGraph

# Replace with your Neo4j connection details
# --- Neo4j Connection Details ---
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "neo4j_password" # Change this!

# Initialize the Neo4jGraph object
graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USER,
    password=NEO4J_PASSWORD
)

# Example: Run a Cypher query
result = graph.query("MATCH (n) RETURN n LIMIT 5")
print(result)

[{'n': {'id': '8a3c504e5916a34e469f163e090be9a5', 'text': '# Sellside Analyst Meeting\n\nVerizon is hosting a sellside analyst meeting with Verizon Chairman and CEO Lowell McAdam as well as Tim Armstrong, John Stratton, Hans Vestberg, Matt Ellis and other senior executives.\n\nNOTE: In this presentation we have made forward-looking statements. These statements are based on our estimates and assumptions and are subject to risks and uncertainties. Forward-looking statements include the information concerning our possible or assumed future results of operations. Forward-looking statements also include those preceded or followed by the words “anticipates,” “believes,” “estimates,” “expects,” “hopes” or similar expressions. For those statements, we claim the protection of the safe harbor for forward-looking statements contained in the Private Securities Litigation Reform Act of 1995. We undertake no obligation to revise or publicly release the results of any revision to these forward-lookin

In [46]:
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
with driver.session() as session:
    # Count nodes
    node_count = session.run("MATCH (n:Entity) RETURN count(n) AS node_count").single()["node_count"]
    print(f"✅ Total nodes in Neo4j: {node_count}")

    # Count relationships (all types)
    rel_count = session.run("MATCH ()-[r]->() RETURN count(r) AS rel_count").single()["rel_count"]
    print(f"✅ Total relationships in Neo4j: {rel_count}")

✅ Total nodes in Neo4j: 116
✅ Total relationships in Neo4j: 228


In [51]:
with driver.session() as session:
    node_count = session.run("MATCH (n:Entity) RETURN count(n) AS node_count").single()["node_count"]
    rel_count = session.run("MATCH ()-[r]->() RETURN count(r) AS rel_count").single()["rel_count"]
    print(f"Nodes: {node_count}, Relationships: {rel_count}")

Nodes: 116, Relationships: 228


In [48]:
print("Nodes:", graph.query("MATCH (n) RETURN count(n)")[0]['count(n)'])
print("Relationships:", graph.query("MATCH ()-[r]->() RETURN count(r)")[0]['count(r)'])

Nodes: 244
Relationships: 228


### Visualize the Graph

In [50]:
import json
from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget
# This library is used to give nodes different colors based on their type
import webcolors 


# --- 1. Neo4j Connection Details ---

# --- 2. Function to Fetch Graph Data from Neo4j ---
def fetch_graph_from_neo4j(uri, user, password):
    """
    Connects to Neo4j and fetches all nodes and relationships.
    """
    nodes = []
    edges = []
    node_ids = set() # To keep track of nodes we've added

    try:
        driver = GraphDatabase.driver(uri, auth=(user, password))
        with driver.session() as session:
            # This query fetches all relationship paths in the graph
            result = session.run("MATCH (n)-[r]->(m) RETURN n, r, m")
            
            for record in result:
                # Extract nodes and relationship from the record
                node_n = record["n"]
                rel_r = record["r"]
                node_m = record["m"]

                # Add source node if we haven't seen it before
                if node_n.element_id not in node_ids:
                    nodes.append({
                        "id": node_n.element_id,
                        "properties": dict(node_n) # Convert node properties to a dict
                    })
                    node_ids.add(node_n.element_id)

                # Add target node if we haven't seen it before
                if node_m.element_id not in node_ids:
                    nodes.append({
                        "id": node_m.element_id,
                        "properties": dict(node_m)
                    })
                    node_ids.add(node_m.element_id)
                
                # Add the relationship
                edges.append({
                    "id": rel_r.element_id,
                    "start": node_n.element_id, # Source node ID
                    "end": node_m.element_id,   # Target node ID
                    "properties": dict(rel_r)   # Relationship properties
                })
        driver.close()
        print(f"✅ Fetched {len(nodes)} nodes and {len(edges)} relationships from Neo4j.")
        return {"nodes": nodes, "edges": edges}
    
    except Exception as e:
        print(f"❌ Error fetching graph from Neo4j: {e}")
        return None

# --- 3. Fetch the data ---
graph_data = fetch_graph_from_neo4j(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)

# --- 4. Prepare and Visualize the Graph with yFiles ---
if graph_data:
    # Create the main GraphWidget
    w = GraphWidget()

    # Create a mapping from Node Type to a color for better visualization
    node_types = list(set([node['properties'].get('type', 'Unknown') for node in graph_data['nodes']]))
    # Generate some distinct colors
    colors = ['#FFD700', '#ADD8E6', '#90EE90', '#FFB6C1', '#FFA07A', '#20B2AA', '#87CEFA']
    color_map = {node_type: colors[i % len(colors)] for i, node_type in enumerate(node_types)}
    
    # Add nodes to the widget
    for node_data in graph_data['nodes']:
        node_id = node_data['id']
        node_props = node_data['properties']
        node_label = node_props.get('id', 'Unknown ID') # Use the 'id' property as the label
        node_type = node_props.get('type', 'Unknown')
        node_color = color_map.get(node_type, '#CCCCCC') # Default to gray
        
        w.add_node(node_id, label=node_label, color=node_color)

    # Add edges (relationships) to the widget
    for edge_data in graph_data['edges']:
        edge_id = edge_data['id']
        source_id = edge_data['start']
        target_id = edge_data['end']
        rel_props = edge_data['properties']
        rel_label = rel_props.get('type', 'RELATED_TO') # Use the 'type' property as the label
        
        w.add_edge(source_id, target_id, label=rel_label)

    # Display the interactive graph widget in your Jupyter Notebook
    # You can pan, zoom, and drag nodes around.
    print("\n🚀 Displaying interactive Knowledge Graph. This may take a moment to render.")
    display(w)

else:
    print("Could not visualize graph as no data was fetched.")


✅ Fetched 131 nodes and 228 relationships from Neo4j.


AttributeError: 'GraphWidget' object has no attribute 'add_node'

In [None]:
# Test the chain with a question
response = chain.invoke(input="Who are Marie Curie and Pierre Curie?")
print("Final Answer")
print(response)

# Knowledge Graph Discovery Framework

To convert the scraped data into a structured knowledge graph, we have created a comprehensive agent-based framework in the `kg_discovery` directory. This framework automates the process of converting unstructured text into a knowledge graph through the following components:

## Project Structure

```
kg_discovery/
├── __init__.py
├── config.py          # Configuration settings for the system
├── main.py            # Main entry point for the system
├── utils.py           # Utility functions
├── agents/            # Agent components
│   ├── __init__.py
│   ├── data_ingestion.py        # Agent for ingesting data from various sources
│   ├── information_extraction.py # Agent for extracting triples from text
│   ├── schema_discovery.py      # Agent for discovering and refining schema
│   ├── graph_construction.py    # Agent for constructing the knowledge graph
│   └── graph_enrichment.py      # Agent for enriching the graph with additional information
└── tools/             # Utility tools used by agents
    ├── __init__.py
    ├── web_scraper.py           # Tool for scraping web content
    ├── pdf_processor.py         # Tool for processing PDF documents
    ├── text_chunker.py          # Tool for splitting text into manageable chunks
    ├── networkx_handler.py      # Tool for NetworkX graph operations
    └── neo4j_handler.py         # Tool for Neo4j graph operations
```

## How It Works

1. **Data Ingestion**: The system ingests data from various sources including websites and PDF documents.
2. **Information Extraction**: It extracts structured information (triples) from the ingested text.
3. **Schema Discovery**: It discovers and refines the schema based on the extracted triples.
4. **Graph Construction**: It constructs a knowledge graph from the triples and schema.
5. **Graph Enrichment**: It enriches the graph with additional information.

## Usage

Run the system with the following command:

```bash
python -m kg_discovery.main --source https://www.verizon.com/about --use-neo4j
```

Parameters:
- `--source`: Source URL or path to start ingestion from
- `--depth`: Crawling depth for web sources (default: 2)
- `--output`: Output file path for the knowledge graph
- `--use-neo4j`: Store the knowledge graph in Neo4j instead of NetworkX
- `--verbose`: Enable verbose logging

## Prerequisites

- Python 3.8+
- Required packages: networkx, neo4j, langchain, spacy, trafilatura, selenium, pdfplumber, pymupdf, camelot
- Neo4j (optional, for graph database storage)
- Ollama with Llama 3.1 model (optional, for LLM-based extraction)