In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time
import pandas as pd
import json
import google.generativeai as genai
import re
from urllib.parse import urljoin, urlparse


genai.configure(api_key="AIzaSyBvuAQh_QY6i8yJ636Fm-NM8k8nGM3WQTg")  


websites = [
    "https://www.snap.com",
    "https://www.dropbox.com",
    "https://www.tesla.com",
    "https://www.spacex.com",
    "https://robinhood.com",
    "https://stripe.com",
    "https://squareup.com",
    "https://www.shopify.com",
    "https://www.zara.com",
    "https://hmgroup.com"
]

HEADERS = {"User-Agent": "Mozilla/5.0"}


chrome_options = Options()
chrome_options.add_argument("--headless")

def get_selenium_driver():
    return webdriver.Chrome(service=Service("/path/to/chromedriver"), options=chrome_options)

def get_useful_routes(url):
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    keywords = ["about", "company", "mission", "vision", "history", "leadership", "values", "team", "founders", "executives", "awards", "recognition","our","contact","services","products"]
    useful_paths = set()

    
    for keyword in keywords:
        test_url = f"{base_url}/{keyword}"
        try:
            response = requests.head(test_url, headers=HEADERS, timeout=5)
            if response.status_code in [200, 301, 302]:
                useful_paths.add(test_url)
        except requests.RequestException:
            pass

    
    soup = scrape_using_requests(url) or scrape_using_selenium(url)
    if soup:
        for link in soup.find_all("a", href=True):
            href = link["href"].lower()
            if any(keyword in href for keyword in keywords):
                full_url = urljoin(base_url, href)
                useful_paths.add(full_url)

    return list(useful_paths)

def scrape_using_requests(url):
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        if response.status_code == 200:
            return BeautifulSoup(response.text, "html.parser")
    except Exception:
        return None

def scrape_using_selenium(url):
    try:
        driver = get_selenium_driver()
        driver.get(url)
        time.sleep(5)  
        soup = BeautifulSoup(driver.page_source, "html.parser")
        driver.quit()
        return soup
    except Exception:
        return None

def extract_text(soup):
    text = []
    for tag in soup.find_all(["h1", "h2", "p", "li"]):
        clean_text = tag.get_text(strip=True)
        if len(clean_text) > 30:
            text.append(clean_text)
    return " ".join(text)[:6000]  

def clean_gemini_response(response_text):
    return re.sub(r"```json|```", "", response_text).strip()

def extract_details_from_gemini(text, company_name):
    model = genai.GenerativeModel("gemini-2.0-flash")
    prompt = f"""
    Extract the following details about '{company_name}' from the given text.
    If any detail is missing, return "Not found".
    
    Required JSON format:
    {{
      "mission_statement": "...",
      "core_values": "...",
      "products_services": "...",
      "founding_year": "...",
      "founders": "...",
      "headquarters_location": "...",
      "key_executives": "...",
      "notable_awards_recognitions": "..."
    }}

    Text:
    {text}

    Output ONLY valid JSON, no extra text or formatting.
    """
    try:
        response = model.generate_content(prompt)
        cleaned_response = clean_gemini_response(response.text)
        return cleaned_response  
    except Exception as e:
        return f"Error extracting details: {str(e)}"

data = []
for site in websites:
    print(f"Scraping {site}...")

    
    soup = scrape_using_requests(site) or scrape_using_selenium(site)
    content = extract_text(soup) if soup else None

    
    extra_pages = get_useful_routes(site)
    for page in extra_pages:
        print(f"Scraping extra page: {page}")
        soup = scrape_using_requests(page) or scrape_using_selenium(page)
        if soup:
            page_content = extract_text(soup)
            if page_content:
                content = content + " " + page_content if content else page_content

    
    if content:
        details = extract_details_from_gemini(content, site)
        try:
            details_json = json.loads(details)
            details_json = {k: v for k, v in details_json.items() if v != "Not found"}
        except json.JSONDecodeError:
            print(f"Invalid JSON response for {site}: {details}")
            details_json = {}

        data.append([site, json.dumps(details_json, indent=4)])
        print(f"Extracted details for {site}.")
    else:
        print(f"Failed to scrape {site}. No LLM-generated details will be included.")
        data.append([site, "{}"])


df = pd.DataFrame(data, columns=["Website", "Extracted Details"])
df.to_csv("results.csv", index=False)
print("Data saved to results.csv")

json_data = [{"Website": site, "Extracted Details": json.loads(details)} for site, details in data]
with open("results.json", "w", encoding="utf-8") as json_file:
    json.dump(json_data, json_file, indent=4, ensure_ascii=False)
print("Data saved to results.json")


  from .autonotebook import tqdm as notebook_tqdm


Scraping https://www.snap.com...
Scraping extra page: https://www.snap.com/founders
Scraping extra page: https://www.snap.com/mission
Scraping extra page: https://www.snap.com/contact
Scraping extra page: https://www.snap.com/policies?utm_source=snap_com&utm_medium=referral&utm_campaign=universal_navigation&utm_content=footer_item_link
Scraping extra page: https://www.snap.com/awards
Scraping extra page: https://values.snap.com/?lang=en-us
Scraping extra page: https://www.snap.com/vision
Scraping extra page: https://www.snap.com/services
Scraping extra page: https://www.snap.com/executives
Scraping extra page: https://values.snap.com/privacy/privacy-policy?utm_source=snap_com&utm_medium=referral&utm_campaign=universal_navigation&utm_content=footer_item_link&lang=en-us
Scraping extra page: https://www.snap.com/products
Scraping extra page: https://citizen.snap.com/?utm_source=snap_com&utm_medium=referral&utm_campaign=universal_navigation&utm_content=footer_item_link&lang=en-us
Scraping 