In [21]:
import pandas as pd
import requests
import os


In [22]:
# Read the input table (dataset/logos_with_urls.csv) into a DataFrame
csv_path = "dataset/logos_with_urls.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

# Ensure the directory that will hold the downloaded images is present;
# exist_ok=True makes re-running this cell a no-op when it already exists
output_folder = "logos"
os.makedirs(name=output_folder, exist_ok=True)

In [None]:

# Helper: fetch one URL and persist the body only when it is a usable response
def _fetch_to_file(url, destination, timeout=5):
    """Download ``url`` and write the response body to ``destination``.

    The file is written only when the server answers 200 with a non-empty
    body, so a failed attempt never leaves a zero-byte file on disk.

    Args:
        url: Address to request.
        destination: Path of the file to write (opened in binary mode).
        timeout: Value passed to ``requests.get`` as ``timeout``.

    Returns:
        A ``(saved, status_code)`` tuple: whether the file was written, and
        the HTTP status code that was received.

    Raises:
        requests.RequestException: Propagated from ``requests.get``.
    """
    response = requests.get(url, timeout=timeout)
    saved = response.status_code == 200 and bool(response.content)
    if saved:
        with open(destination, "wb") as file:
            file.write(response.content)
    return saved, response.status_code


# Function to download logos, checking if they already exist
def download_logo(row, timeout=5):
    """Download the logo for ``row["domain"]`` into ``output_folder``.

    The target file is ``<output_folder>/<domain with dots replaced by
    underscores>.png``; the ``.png`` extension is used regardless of what the
    server actually returns. If that file already exists nothing is
    downloaded. Otherwise Clearbit is tried first and the Google favicon
    service is used as a fallback.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``"domain"`` key.
        timeout: Per-request timeout forwarded to ``requests.get``.

    Returns:
        The path of the existing or newly saved file, or ``None`` when the
        domain is missing/blank or both sources fail.
    """
    raw_domain = row["domain"]

    # A missing value (e.g. NaN from an empty CSV cell) is not a string and has
    # no .replace(); skip it instead of crashing the whole batch.
    if not isinstance(raw_domain, str) or not raw_domain.strip():
        print(f"❌ {raw_domain!r}: Missing or invalid domain, skipping.")
        return None

    domain = raw_domain.replace(".", "_")  # Replace dots with underscores for filenames
    logo_path = os.path.join(output_folder, f"{domain}.png")

    # If logo already exists, skip downloading
    if os.path.exists(logo_path):
        print(f"✅ {domain}: Logo already exists, skipping download.")
        return logo_path  # Return existing file path

    # Define Clearbit and Google Favicon URLs
    clearbit_url = f"https://logo.clearbit.com/{raw_domain}"
    google_favicon_url = f"https://www.google.com/s2/favicons?domain={raw_domain}&sz=128"

    # Try downloading from Clearbit
    try:
        saved, status = _fetch_to_file(clearbit_url, logo_path, timeout=timeout)
        if saved:
            print(f"✅ {domain}: Downloaded from Clearbit")
            return logo_path  # Return saved file path
        # Report the real status: not every failure is a 404
        print(f"⚠️ {domain}: Clearbit returned HTTP {status}, trying Google Favicon...")
    except requests.RequestException as e:
        print(f"❌ {domain}: Clearbit error - {e}")

    # Try downloading from Google Favicon as fallback
    try:
        saved, status = _fetch_to_file(google_favicon_url, logo_path, timeout=timeout)
        if saved:
            print(f"✅ {domain}: Downloaded from Google Favicon")
            return logo_path
        print(f"❌ {domain}: No logo found on Google Favicon (HTTP {status})")
    except requests.RequestException as e:
        print(f"❌ {domain}: Google Favicon error - {e}")

    return None  # Return None if both attempts fail

# Download all logos and store paths (None for rows where download_logo
# could not produce a file).
# An explicit loop over the rows is used instead of df.apply(..., axis=1):
# on a DataFrame with no rows, apply can hand back a DataFrame rather than a
# Series, which cannot be assigned to a single column.
df["saved_path"] = [download_logo(row) for _, row in df.iterrows()]

# Save updated CSV with file paths
df.to_csv("dataset/logos_with_paths.csv", index=False)

print("\n🎉 ✅ All logos processed! Check 'logos_with_paths.csv' for results.")


✅ stanbicbank_co_zw: Downloaded from Clearbit
✅ astrazeneca_ua: Downloaded from Clearbit
✅ autosecuritas-ct-seysses_fr: Downloaded from Clearbit
✅ ovb_ro: Downloaded from Clearbit
✅ mazda-autohaus-hellwig-hoyerswerda_de: Downloaded from Clearbit
✅ toyota-buchreiter-eisenstadt_at: Downloaded from Clearbit
✅ ebay_cn: Downloaded from Clearbit
✅ greatplacetowork_com_bo: Downloaded from Clearbit
✅ wurth-international_com: Downloaded from Clearbit
⚠️ plameco-hannover_de: Clearbit 404, trying Google Favicon...
✅ plameco-hannover_de: Downloaded from Google Favicon
⚠️ kia-moeller-wunstorf_de: Clearbit 404, trying Google Favicon...
✅ kia-moeller-wunstorf_de: Downloaded from Google Favicon
✅ ccusa_co_nz: Downloaded from Clearbit
✅ tupperware_at: Downloaded from Clearbit
⚠️ zalando_cz: Clearbit 404, trying Google Favicon...
✅ zalando_cz: Downloaded from Google Favicon
⚠️ crocs_com_uy: Clearbit 404, trying Google Favicon...
✅ crocs_com_uy: Downloaded from Google Favicon
✅ ymcasteuben_org: Downloade