In [1]:
import requests
from urllib.parse import urljoin, quote
import os
import re
from bs4 import BeautifulSoup
import csv
import time
import pandas as pd 

In [5]:
def download_image(url, file_name, timeout=10):
    """Stream the resource at ``url`` into the local file ``file_name``.

    Args:
        url: Address of the image to fetch.
        file_name: Destination path; opened in binary write mode, so an
            existing file is overwritten.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, ``requests`` waits indefinitely on an unresponsive server.

    Returns:
        True if the server answered with HTTP 200 and the body was written,
        False for any other status code (nothing is written in that case).

    Raises:
        requests.RequestException: On connection-level failures.
        OSError: If ``file_name`` cannot be opened for writing.
    """
    # The context manager releases the pooled connection even if writing
    # fails part-way through; stream=True otherwise keeps it checked out.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            return False
        with open(file_name, 'wb') as file:
            for chunk in response.iter_content(1024):
                file.write(chunk)
    return True
                
def find_logo(soup):
    """Return the URL of the most likely logo in a parsed page, or None.

    Three heuristics are tried in order:

    1. An ``<img>`` whose class or id matches one of a list of common logo
       identifiers.
    2. An ``<svg>`` with "logo" in its class, in which case the ``href`` of
       its enclosing ``<a>`` is returned.
    3. Any ``<img>`` with "logo" in its ``src``, ``data-src`` or ``alt``.

    Args:
        soup: A parsed document offering ``find`` and ``find_all`` (the
            caller in this notebook passes a ``BeautifulSoup`` object).

    Returns:
        The raw attribute value (possibly a relative URL), or None when no
        candidate with a usable URL exists.
    """
    logo_identifiers = [
        {'class_': 'c-nav__logo'},
        {'class_': 'nav-logo'},
        {'class_': 'header-logo'},
        {'class_': 'c-header__logo'},
        {'class_': 'site-logo'},
        {'class_': 'brand-logo'},
        {'class_': 'logo'},
        {'class_': 'menu-logo'},
        {'id': 'logo'}
    ]

    # 1) Well-known class/id names on the <img> itself.
    for identifier in logo_identifiers:
        logo = soup.find('img', **identifier)
        if logo is None:
            continue
        logo_src = logo.get('src') or logo.get('data-src')
        # A matching tag without any URL is useless; keep looking instead of
        # giving up with None.
        if logo_src:
            return logo_src

    # 2) Inline SVG logo wrapped in a link.
    svg_logo = soup.find('svg', class_=lambda x: x and 'logo' in x.lower())
    if svg_logo is not None:
        # The SVG may not sit inside an anchor at all; guard against None.
        parent_link = svg_logo.find_parent('a')
        href = parent_link.get('href') if parent_link is not None else None
        # NOTE(review): this href is the link target, not an image resource;
        # kept for backward compatibility - confirm it is what callers want.
        if href:
            return href

    # 3) Generic fallback: any image that mentions "logo".
    for img in soup.find_all('img'):
        src = img.get('src') or ''
        lazy_src = img.get('data-src') or ''  # lazy-loaded images
        alt = img.get('alt') or ''
        mentions_logo = any(
            'logo' in text.lower() for text in (src, lazy_src, alt)
        )
        if not mentions_logo:
            continue
        candidate = src or lazy_src
        if candidate:
            return candidate

    # No usable logo found by any heuristic.
    return None

def scrape_logo(url, company_name):
    """Fetch ``url``, locate the page's logo and save it under ``logos/``.

    This is a best-effort helper: every outcome is reported with ``print``
    and any exception is caught and reported rather than propagated, so one
    bad site does not abort a batch run.

    Args:
        url: Page to scrape; also the base for resolving a relative logo URL.
        company_name: Used in progress messages and, after replacing
            characters that are invalid in file names, as the output file
            name (``logos/<name>.png``).

    Returns:
        None.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')

        logo_url = find_logo(soup)

        if logo_url:
            # Resolve relative paths against the page URL.
            logo_url = urljoin(url, logo_url)

            # Characters such as '/' or ':' in a company name would otherwise
            # be interpreted as path syntax and make open() fail.
            safe_name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', company_name).strip() or 'unnamed'
            file_name = f"logos/{safe_name}.png"

            # Make sure the target directory exists before writing into it.
            os.makedirs(os.path.dirname(file_name), exist_ok=True)

            # Only an explicit False is treated as failure; a helper that
            # returns nothing is assumed to have succeeded.
            outcome = download_image(logo_url, file_name)
            if outcome is False:
                print(f"Could not download logo for {company_name} from {logo_url}")
            else:
                print(f"Downloaded logo for {company_name}")
        else:
            print(f"Could not find logo for {company_name}")
    except Exception as e:
        # Deliberately broad: network, SSL, parsing and file errors are all
        # reported the same way so the caller's loop keeps going.
        print(f"Error scraping {company_name}: {str(e)}")

In [8]:
# Create the output directories. 'missing logos' is kept from the original
# cell; 'logos' is the directory scrape_logo actually writes into.
# exist_ok=True avoids the check-then-create race of os.path.exists().
for directory in ('missing logos', 'logos'):
    os.makedirs(directory, exist_ok=True)

# Input file: one company per row, name in column 0 and website in column 1,
# preceded by a header row. Edit this path when running on another machine.
CSV_PATH = r'C:\Users\SchalkBurger\OneDrive - Viewpoint Ventures\2024\4. Box Automation\The Brain-Empty Logos.csv'

# newline='' is what the csv module expects so that quoted fields containing
# line breaks are parsed correctly.
with open(CSV_PATH, 'r', newline='') as file:
    reader = csv.reader(file)
    next(reader, None)  # Skip header row (tolerates an empty file)
    for row_number, row in enumerate(reader, start=2):
        # Blank or truncated lines would otherwise raise IndexError.
        if len(row) < 2:
            print(f"Skipping row {row_number}: expected name and website")
            continue
        company_name = row[0].strip()
        website = row[1].strip()
        if not company_name or not website:
            print(f"Skipping row {row_number}: expected name and website")
            continue
        print(f"Scraping {company_name} from {website}")
        scrape_logo(website, company_name)
        time.sleep(1)  # Be polite to servers

Scraping AEZ from https://www.agencyez.com/
Error scraping AEZ: HTTPSConnectionPool(host='www.agencyez.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006)')))
Scraping Agentero from https://agentero.com/
Could not find logo for Agentero
Scraping Air Parametric from https://airparametric.com/
Downloaded logo for Air Parametric
Scraping AlzBetter from https://alzbetter.net/
Downloaded logo for AlzBetter
Scraping Aper from https://www.aper.com/
Downloaded logo for Aper
Scraping Apriora from https://www.apriora.ai/
Downloaded logo for Apriora
Scraping ArgoX AI from https://www.argoxai.com/
Could not find logo for ArgoX AI
Scraping Arise Travel from https://www.arise.travel/
Could not find logo for Arise Travel
Scraping Ark Risk from https://www.arkrisk.io/solutions
Downloaded logo for Ark Risk
Scraping Aurora from https://aurora.tec

KeyboardInterrupt: 