In [3]:
import pandas as pd
df = pd.read_parquet("logos.snappy.parquet")
df.to_csv("logos.csv", index=False)

In [1]:
import pandas as pd
import requests
from tqdm import tqdm

# Încarc domeniile
df = pd.read_csv("logos.csv")

# Costruiesc URL favicon
def check_favicon(domain):
    try:
        favicon_url = f"https://www.google.com/s2/favicons?sz=64&domain={domain}"
        r = requests.get(favicon_url, timeout=5)
        if r.status_code == 200 and r.content:
            return favicon_url
    except Exception:
        pass
    return None

tqdm.pandas()
df["logo_url"] = df["domain"].progress_apply(check_favicon)

# Salvare
df[["domain", "logo_url"]].to_csv("favicons_result.csv", index=False)

100%|██████████████████████████████████████████████████████████████████████████████| 4384/4384 [18:50<00:00,  3.88it/s]


In [27]:
import pandas as pd

df = pd.read_csv("favicons_result.csv")

# Număr total
total = len(df)

# Număr de favicon-uri găsite (logo_url nu e NaN)
found = df["logo_url"].notna().sum()

# Afișează
print(f"{found} din {total} domenii au logo_url ({100 * found / total:.2f}%)")

4011 din 4384 domenii au logo_url (91.49%)


In [29]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm import tqdm
import re

# Verifică dacă un URL este valid
def is_valid_logo_url(url):
    return bool(url)

# Verific daca un tag HTML are "logo" in el
def is_likely_logo(tag):
    attributes_to_check = ["src", "srcset", "alt", "id", "data-src", "data-original", "data-lazy"]
    for attr in attributes_to_check:
        val = tag.get(attr)
        if val and "logo" in val.lower():
            return True

    # Verific fiecare clasă individual
    class_list = tag.get("class")
    if class_list:
        for cls in class_list:
            if "logo" in cls.lower():
                return True

    return False

# Extrage logo-ul dintr-un HTML static
def extract_logo_from_html(base_url, html):
    soup = BeautifulSoup(html, "html.parser")

    # Caută <img> care pare "logo"
    for img in soup.find_all("img"):
        if is_likely_logo(img):
            src = img.get("src") or img.get("srcset")
            full_url = urljoin(base_url, src) if src else None
            if is_valid_logo_url(full_url):
                return full_url

    # Caută <source> (webp etc.)
    for source in soup.find_all("source"):
        if is_likely_logo(source):
            srcset = source.get("srcset")
            full_url = urljoin(base_url, srcset) if srcset else None
            if is_valid_logo_url(full_url):
                return full_url

    # Caută background-image: url(...logo...)
    for tag in soup.find_all(style=True):
        style = tag.get("style", "").lower()
        if "background-image" in style and "logo" in style:
            match = re.search(r"url\((.*?)\)", style)
            if match:
                raw_url = match.group(1).strip('\'"')
                full_url = urljoin(base_url, raw_url)
                if is_valid_logo_url(full_url):
                    return full_url

    # SVG use xlink:href
    for use in soup.find_all("use"):
        for attr in ["xlink:href", "href"]:
            href = use.get(attr)
            if href and "logo" in href.lower():
                full_url = urljoin(base_url, href)
                if is_valid_logo_url(full_url):
                    return full_url

    # fallback: caută url(...logo...) brut în HTML
    raw_html = soup.prettify().lower()
    matches = re.findall(r"url\((.*?)\)", raw_html)
    for m in matches:
        if "logo" in m:
            full_url = urljoin(base_url, m.strip('\'"'))
            if is_valid_logo_url(full_url):
                return full_url

    return None

# Încearc site-ul simplu și apoi cu www.
def extract_logo_smart(domain):
    attempts = [f"https://{domain}"]
    if not domain.startswith("www."):
        attempts.append(f"https://www.{domain}")

    for base_url in attempts:
        try:
            r = requests.get(base_url, timeout=12, headers={"User-Agent": "Mozilla/5.0"})
            if r.status_code == 200:
                logo_url = extract_logo_from_html(base_url, r.text)
                if logo_url:
                    return logo_url
        except Exception:
            continue

    return None

# Aplic pe toate domeniile din fișierul CSV
df = pd.read_csv("logos.csv")
tqdm.pandas()
df["logo_url"] = df["domain"].progress_apply(extract_logo_smart)

# Salvare
df.to_csv("logos_detected_HTML.csv", index=False)

100%|████████████████████████████████████████████████████████████████████████████| 4384/4384 [2:15:59<00:00,  1.86s/it]


In [30]:
import pandas as pd

df = pd.read_csv("logos_detected_HTML.csv")

# Număr total
total = len(df)

# Număr de favicon-uri găsite (logo_url nu e NaN)
found = df["logo_url"].notna().sum()

# Afișează
print(f"{found} din {total} domenii au logo_url ({100 * found / total:.2f}%)")

3333 din 4384 domenii au logo_url (76.03%)


In [31]:
import pandas as pd

favicons_df = pd.read_csv("favicons_result.csv")
logos_df = pd.read_csv("logos_detected_smart10.csv")

favicons_df["domain"] = favicons_df["domain"].str.strip().str.lower()
logos_df["domain"] = logos_df["domain"].str.strip().str.lower()

# Păstrează coloana 'domain' din favicons și completează 'logo_url' unde lipsește
final_df = favicons_df.copy()
final_df["logo_url"] = final_df["logo_url"].fillna(logos_df["logo_url"])

# Verificare
print(f"Total rânduri: {len(final_df)}")
print(f"Domenii unice: {final_df['domain'].nunique()}")
print(f"Logo-uri găsite: {final_df['logo_url'].notna().sum()}")

# Salvare
final_df.to_csv("logos_finals.csv", index=False)

Total rânduri: 4384
Domenii unice: 3416
Logo-uri găsite: 4259


In [32]:
import pandas as pd

df = pd.read_csv("logos_finals.csv")

# Număr total
total = len(df)

# Număr de favicon-uri găsite (logo_url nu e NaN)
found = df["logo_url"].notna().sum()

# Afișează
print(f"{found} din {total} domenii au logo_url ({100 * found / total:.2f}%)")

4259 din 4384 domenii au logo_url (97.15%)


In [34]:
import os
import pandas as pd
import requests
from PIL import Image, UnidentifiedImageError
from io import BytesIO
from urllib.parse import urlparse
from tqdm import tqdm
import hashlib
import cairosvg
import base64

# Config
CSV_FILE = "logos_finals.csv"
FOLDER = "downloaded_logos_all"
FAILED_CSV = "failed_logos.csv"
CORRUPT_FOLDER = "corrupt_images"

# Foldere
os.makedirs(FOLDER, exist_ok=True)
os.makedirs(CORRUPT_FOLDER, exist_ok=True)

# Datele
df = pd.read_csv(CSV_FILE)
df = df.dropna(subset=["logo_url", "domain"])
df = df[df["logo_url"].str.startswith(("http", "data:image/"))]

# Eșecuri
failed = []

def get_extension(content_type, fallback=".png"):
    if "svg" in content_type:
        return ".svg"
    elif "jpeg" in content_type:
        return ".jpg"
    elif "png" in content_type:
        return ".png"
    elif "gif" in content_type:
        return ".gif"
    else:
        return fallback

def generate_filename(domain, url, index, ext):
    hashed = hashlib.md5(url.encode()).hexdigest()[:8]
    name = f"{domain}__{hashed}_{index}{ext}"
    return name.replace("/", "_").replace("\\", "_")

def save_corrupt_image(content, filename):
    path = os.path.join(CORRUPT_FOLDER, filename)
    with open(path, "wb") as f:
        f.write(content)

def download_image(domain, url, index):
    try:
        # Base64
        if url.startswith("data:image"):
            header, b64data = url.split(",", 1)
            ext = ".png" if "png" in header else ".jpg"
            raw_data = base64.b64decode(b64data)
            filename = generate_filename(domain, url, index, ext)
            path = os.path.join(FOLDER, filename)
            with open(path, "wb") as f:
                f.write(raw_data)
            return True, filename

        # HTTP/HTTPS standard
        r = requests.get(url, timeout=15, headers={"User-Agent": "Mozilla/5.0"}, verify=False)
        if r.status_code != 200:
            return False, f"status {r.status_code}"

        content_type = r.headers.get("Content-Type", "")
        ext = get_extension(content_type)
        filename = generate_filename(domain, url, index, ext)
        save_path = os.path.join(FOLDER, filename)

        if ext == ".svg":
            png_path = save_path.replace(".svg", ".png")
            cairosvg.svg2png(bytestring=r.content, write_to=png_path)
            return True, png_path
        else:
            try:
                image = Image.open(BytesIO(r.content))
                if image.mode in ("RGBA", "P"):
                    image = image.convert("RGBA")
                    image.save(save_path, format="PNG")
                else:
                    image = image.convert("RGB")
                    image.save(save_path)
                return True, filename
            except UnidentifiedImageError:
                # Salvează fișierul brut pentru analiză
                bin_name = filename.replace(ext, ".bin")
                save_corrupt_image(r.content, bin_name)
                return False, "cannot identify image (saved as .bin)"

    except Exception as e:
        return False, str(e)

# Procesare
print(f"Total rânduri de procesat: {len(df)}")
success_count = 0

for index, row in tqdm(df.iterrows(), total=len(df)):
    domain = str(row["domain"]).strip().lower()
    url = str(row["logo_url"]).strip()
    ok, result = download_image(domain, url, index)
    if ok:
        success_count += 1
    else:
        failed.append({"domain": domain, "logo_url": url, "error": result})

# Salvare
if failed:
    pd.DataFrame(failed).to_csv(FAILED_CSV, index=False)

print(f"\n Salvate: {success_count} din {len(df)}")
print(f" Eșecuri salvate în: {FAILED_CSV}")
print(f" Imagini salvate în: {FOLDER}")
print(f" Fișiere corupte salvate în: {CORRUPT_FOLDER}")

Total rânduri de procesat: 4259


100%|██████████████████████████████████████████████████████████████████████████████| 4259/4259 [22:53<00:00,  3.10it/s]


 Salvate: 4238 din 4259
 Eșecuri salvate în: failed_logos.csv
 Imagini salvate în: downloaded_logos_all
 Fișiere corupte salvate în: corrupt_images





In [35]:
import os
import imagehash
from PIL import Image
from collections import defaultdict
from tqdm import tqdm
import pandas as pd

# Config
FOLDER = "downloaded_logos_all"
HASH_THRESHOLD = 5
CSV_FILE = "logos_finals.csv"

# Hash-iesc imaginile
hash_to_logos = defaultdict(list)

for filename in tqdm(os.listdir(FOLDER), desc="Hashing imagini"):
    if not filename.lower().endswith((".png", ".jpg", ".jpeg")):
        continue
    path = os.path.join(FOLDER, filename)
    try:
        img = Image.open(path).convert("RGB")
        phash = imagehash.phash(img)
        domain = filename.split("__")[0]
        hash_to_logos[str(phash)].append((domain, filename))
    except Exception:
        continue

# Grupez pe baza distanței hash
visited = set()
groups = []

hashes = list(hash_to_logos.keys())
for i, h1 in enumerate(hashes):
    if h1 in visited:
        continue
    group = set(hash_to_logos[h1])
    visited.add(h1)
    for j in range(i + 1, len(hashes)):
        h2 = hashes[j]
        if h2 in visited:
            continue
        dist = imagehash.hex_to_hash(h1) - imagehash.hex_to_hash(h2)
        if dist <= HASH_THRESHOLD:
            group.update(hash_to_logos[h2])
            visited.add(h2)
    groups.append(sorted(list(group)))

# Încarc logo_url din CSV-ul inițial
df_meta = pd.read_csv(CSV_FILE)
df_meta["domain"] = df_meta["domain"].str.strip().str.lower()
domain_to_url = dict(zip(df_meta["domain"], df_meta["logo_url"]))

# DataFrame-ul rezultat
data = []
for i, group in enumerate(groups):
    for domain, fname in group:
        data.append({
            "group_id": i + 1,
            "domain": domain,
            "filename": fname,
            "logo_url": domain_to_url.get(domain, "")
        })

df_groups = pd.DataFrame(data)

# Sortare după group_id și domain
df_groups = df_groups.sort_values(by=["group_id", "domain"])

# Salvare
df_groups.to_csv("logo_similarity_groups.csv", index=False)

Hashing imagini: 100%|████████████████████████████████████████████████████████████| 4240/4240 [00:07<00:00, 590.23it/s]
