In [None]:
from io import BytesIO
from urllib.parse import urljoin

import imagehash
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import Image
from sklearn.cluster import AgglomerativeClustering

# 1. Load the domain list from the Parquet file
df = pd.read_parquet(path='logos.snappy.parquet')
domains = list(df['domain'])  # values of the 'domain' column, in row order

# 2. Funcție pentru extragerea logo-ului
def extract_logo(domain):
    """Download a site's home page and return the first logo-like image on it.

    The page is searched with a short list of CSS selectors, ordered from
    most to least specific. The first ``<img>`` whose ``src`` resolves to an
    HTTP(S) URL is downloaded and decoded.

    Args:
        domain: Bare host name (``"example.com"``) or a full ``http(s)://`` URL.

    Returns:
        A fully decoded ``PIL.Image.Image``, or ``None`` when no candidate is
        found or any step (request, parsing, decoding) fails. Failures are
        printed and never raised, so one bad site cannot stop a batch run.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        url = domain if domain.startswith(('http://', 'https://')) else f"https://{domain}"
        response = requests.get(url, timeout=10, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Look for the logo in the usual places
        logo_selectors = [
            'img[class*="logo"]',
            'img[alt*="logo"]',
            'img[src*="logo"]',
            'header img',
            'nav img',
        ]
        for selector in logo_selectors:
            logo = soup.select_one(selector)
            src = logo.get('src') if logo else None
            if not src:
                continue

            # Resolve against the final URL (after redirects) so that
            # protocol-relative ("//cdn.example.com/x.png"), root-relative and
            # page-relative paths all end up pointing at the right resource.
            logo_url = urljoin(response.url, src.strip())
            if not logo_url.startswith(('http://', 'https://')):
                # e.g. inline "data:" URIs, which requests cannot fetch;
                # fall through to the next selector instead of giving up.
                continue

            img_response = requests.get(logo_url, timeout=10, headers=headers)
            img = Image.open(BytesIO(img_response.content))
            # Image.open only reads the header; force the full decode here so
            # a truncated or corrupt download is reported by this function
            # instead of blowing up later in the caller.
            img.load()
            return img
        return None
    except Exception as e:
        # Deliberately broad: this is a best-effort scraper boundary.
        print(f"Eroare la {domain}: {str(e)[:100]}...")
        return None

# 3. Compute a perceptual hash for every domain whose logo could be fetched
logo_hashes = {}
# dict.fromkeys drops repeated domains while keeping first-seen order, so each
# site is downloaded only once (results are keyed by domain anyway).
for domain in dict.fromkeys(domains):
    img = extract_logo(domain)
    if img is None:
        continue
    try:
        logo_hashes[domain] = imagehash.phash(img)
    except Exception as e:
        # Pillow decodes lazily, so a truncated or corrupt download may only
        # fail at hashing time; skip that logo instead of aborting the run.
        print(f"Eroare la {domain}: {str(e)[:100]}...")

# 4. Clustering
if not logo_hashes:
    print("Nicio imagine extrasă.")
else:
    domains_with_hashes = list(logo_hashes)
    # One row per domain: the perceptual hash flattened to a vector of bits.
    hash_matrix = np.array([logo_hash.hash.flatten() for logo_hash in logo_hashes.values()])

    # Pairwise Hamming distances without a Python-level double loop.
    # For bit vectors a and b, (bits set in a but not b) + (bits set in b but
    # not a) is exactly the number of positions where they differ.
    bits = hash_matrix.astype(np.int64)
    one_sided = bits @ (1 - bits).T
    distance_matrix = one_sided + one_sided.T

    if len(domains_with_hashes) < 2:
        # AgglomerativeClustering refuses to fit fewer than two samples;
        # a lone logo trivially forms its own group.
        labels = np.zeros(1, dtype=int)
    else:
        # Hierarchical clustering on the precomputed distances
        cluster = AgglomerativeClustering(
            n_clusters=None,
            # Clusters at or above this linkage distance (in differing hash
            # bits) are not merged; tune for precision.
            distance_threshold=8,
            metric='precomputed',
            linkage='single'
        ).fit(distance_matrix)
        labels = cluster.labels_

    # Collect the domains that share a cluster label
    groups = {}
    for idx, label in enumerate(labels):
        groups.setdefault(label, []).append(domains_with_hashes[idx])

    print("Grupuri de domenii cu logo-uri similare:")
    for group in groups.values():
        print(f"- {group}")