Required packages - install these when running in an online (hosted) notebook environment


In [None]:
!pip install imagehash
!pip install cairosvg
!pip install filetype
!pip install requests-toolbelt
!pip install urllib3

Export all the domains from the Parquet file to a text file

In [None]:
import pandas as pd

def extract_domains_from_parquet(parquet_path, output_file='domains.txt'):
    """Dump the non-null values of a Parquet file's 'domain' column to a text file.

    Args:
        parquet_path: Path of the Parquet file to read.
        output_file: Destination text file; one domain is written per line.

    Raises:
        ValueError: If the Parquet file has no 'domain' column.
    """
    frame = pd.read_parquet(parquet_path)

    has_domain_column = 'domain' in frame.columns
    if not has_domain_column:
        raise ValueError("The Parquet file does not contain a 'domain' column.")

    # NaN entries are discarded before the values are turned into a plain list.
    domain_values = frame['domain'].dropna().tolist()

    with open(output_file, 'w') as sink:
        sink.writelines(f"{value}\n" for value in domain_values)

    print(f"Successfully extracted {len(domain_values)} domains and saved to {output_file}.")

# Run the export with the notebook's input Parquet file and output text file.
if __name__ == '__main__':
    extract_domains_from_parquet(
        parquet_path='logos.snappy.parquet',
        output_file='domains.txt',
    )

Successfully extracted 4384 domains and saved to domains.txt.


Extracting logos for the ~4384 domains found in the Parquet file
Result: logos found for 2690 domains

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pathlib import Path

def get_logo_url(domain):
    """Find a candidate logo URL on a domain's home page.

    The page is requested over HTTPS first and over HTTP as a fallback. For
    each fetched page the first match, in this order, is returned as an
    absolute URL:

    1. ``<meta property="og:image">``
    2. ``<meta name="twitter:image">``
    3. a ``<link>`` icon tag, then an ``<img>`` whose class, id or alt is "logo"

    Args:
        domain: Host name without a scheme, e.g. ``"example.com"``.

    Returns:
        The absolute logo URL, or None when no candidate was found or the
        page could not be fetched.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Fallback selectors, tried in order after the social-media meta tags.
    logo_selectors = [
        ('link', {'rel': ['icon', 'shortcut icon', 'apple-touch-icon']}),
        ('img', {'class': 'logo'}),
        ('img', {'id': 'logo'}),
        ('img', {'alt': 'logo'})
    ]

    try:
        for scheme in ['https://', 'http://']:
            url = scheme + domain
            try:
                response = requests.get(url, headers=headers, timeout=10)
            except (requests.exceptions.SSLError,
                    requests.exceptions.ConnectionError,
                    requests.exceptions.Timeout):
                # Transport failure for this scheme (TLS error, refused
                # connection, or timeout): fall back to the next scheme
                # instead of giving up on the domain.
                continue

            soup = BeautifulSoup(response.content, 'html.parser')

            og_image = soup.find('meta', property='og:image')
            if og_image and og_image.get('content'):
                return urljoin(url, og_image.get('content'))

            twitter_image = soup.find('meta', attrs={'name': 'twitter:image'})
            if twitter_image and twitter_image.get('content'):
                return urljoin(url, twitter_image.get('content'))

            for tag, attrs in logo_selectors:
                element = soup.find(tag, attrs)
                # <link> carries its target in href, <img> in src.
                source_attr = 'href' if tag == 'link' else 'src'
                if element and element.get(source_attr):
                    return urljoin(url, element.get(source_attr))

    except Exception as e:
        # Best effort: report the problem and treat the domain as having no logo.
        print(f"Error processing {domain}: {str(e)}")
        return None

    return None

def _guess_extension(content_type, logo_url):
    """Pick a file extension from the Content-Type header, else from the URL path."""
    from urllib.parse import urlparse  # local import: only urljoin is imported at cell level

    # Drop parameters such as "; charset=utf-8" before looking at the subtype.
    subtype = content_type.split(';')[0].strip().lower().split('/')[-1]
    known_subtypes = {
        'png': 'png',
        'jpeg': 'jpeg',
        'jpg': 'jpg',
        'gif': 'gif',
        'webp': 'webp',
        'svg+xml': 'svg',
        'x-icon': 'ico',
        'vnd.microsoft.icon': 'ico',
    }
    if subtype in known_subtypes:
        return known_subtypes[subtype]

    # Fall back to the suffix of the last URL path segment (query/fragment excluded).
    last_segment = urlparse(logo_url).path.rstrip('/').split('/')[-1]
    if '.' in last_segment:
        suffix = last_segment.rsplit('.', 1)[-1]
        cleaned = ''.join(ch for ch in suffix if ch.isalnum())[:4].lower()
        if cleaned:
            return cleaned
    # Nothing usable: generic placeholder extension so the file can still be written.
    return 'img'


def download_logo(domain, output_dir="logos"):
    """Download the logo found for ``domain`` into ``output_dir``.

    The logo URL comes from ``get_logo_url``. The file is named after the
    domain with dots replaced by underscores, plus an extension derived from
    the response's Content-Type (or, failing that, from the URL).

    Args:
        domain: Host name without a scheme.
        output_dir: Directory to write into; created on demand.

    Returns:
        True when a file was written, False when no logo URL was found, the
        server did not answer with HTTP 200, or an error occurred.
    """
    logo_url = get_logo_url(domain)
    if not logo_url:
        return False

    try:
        # The context manager releases the streamed connection in every case.
        with requests.get(logo_url, stream=True, timeout=10) as response:
            if response.status_code != 200:
                return False

            target_dir = Path(output_dir)
            target_dir.mkdir(parents=True, exist_ok=True)

            ext = _guess_extension(response.headers.get('content-type', ''), logo_url)
            # Path arithmetic instead of os.path.join: `os` is not imported in this cell.
            filepath = target_dir / f"{domain.replace('.', '_')}.{ext}"

            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
            return True
    except Exception as e:
        # Best effort: one bad domain must not stop the batch, but say what went wrong.
        print(f"Domain Failed: {domain} ({e})")
    return False

def process_domains(input_file="domains.txt", output_dir="logos"):
    """Download a logo for every domain listed in ``input_file``.

    Args:
        input_file: Text file with one domain per line; blank lines are ignored.
        output_dir: Directory handed to ``download_logo`` for the image files.

    Progress and the per-domain outcome are printed; nothing is returned.
    """
    with open(input_file, 'r') as f:
        # Skip blank lines so an empty "domain" is never sent to the downloader.
        domains = [line.strip() for line in f if line.strip()]

    for domain in domains:
        print(f"Processing {domain}...")
        if download_logo(domain, output_dir):
            print(f"Downloaded logo for {domain}")
        else:
            print(f"Failed to download logo for {domain}")

# Make sure the output directory exists, then fetch a logo for every listed domain.
if __name__ == '__main__':
    Path('logos').mkdir(exist_ok=True)
    process_domains(input_file='domains.txt', output_dir='logos')

ORB + Color Preprocess

In [33]:
import cv2
import numpy as np
import os
from sklearn.cluster import AgglomerativeClustering
from collections import defaultdict
from joblib import Parallel, delayed

def load_images_from_folder(folder):
    """Load every readable image in ``folder`` as a 200x200, 3-channel RGB array.

    Files that OpenCV cannot decode are skipped. 16-bit images are reduced to
    8 bits, the alpha channel of 4-channel images is dropped, and single-channel
    (grayscale) images are expanded to three channels.

    Args:
        folder: Directory containing the image files.

    Returns:
        ``(filenames, images)``: two parallel lists holding the file names and
        the corresponding image arrays.
    """
    filenames = []
    images = []
    # os.listdir returns entries in arbitrary order; sorting keeps the image
    # order (and therefore everything derived from it) reproducible.
    for filename in sorted(os.listdir(folder)):
        img_path = os.path.join(folder, filename)
        img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED)
        if img is None:
            continue

        if img.dtype == np.uint16:  # 16-bit to 8-bit
            img = (img // 256).astype(np.uint8)

        if img.ndim == 2:
            # Grayscale files decode to a 2-D array; BGR2RGB would raise on them.
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
        elif img.shape[2] == 4:
            # Drop the alpha channel and reorder to RGB in one step.
            img = cv2.cvtColor(img, cv2.COLOR_BGRA2RGB)
        elif img.shape[2] == 3:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        else:
            print(f"Skipping {filename}: unsupported channel count {img.shape[2]}")
            continue

        img = cv2.resize(img, (200, 200))
        filenames.append(filename)
        images.append(img)
    return filenames, images

def extract_features(img):
    """Compute a colour descriptor and a keypoint-layout descriptor for an RGB image.

    Args:
        img: RGB image array (the 50-pixel grid cells below match a 200x200 input).

    Returns:
        ``(color_vector, layout_vector)``: the flattened 90x128 hue/saturation
        histogram (normalised in place with ``cv2.normalize``) and the flattened
        4x4 grid holding the fraction of ORB keypoints that fall in each cell.
    """
    # Colour: 2-D histogram over the hue and saturation channels.
    hsv_img = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
    hs_hist = cv2.calcHist([hsv_img], [0, 1], None, [90, 128], [0, 180, 0, 256])
    cv2.normalize(hs_hist, hs_hist)

    # Shape: where the (at most 50) ORB keypoints sit in the image.
    grayscale = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    detector = cv2.ORB_create(nfeatures=50)
    keypoints = detector.detect(grayscale, None)

    grid = np.zeros((4, 4), dtype=np.float32)
    if keypoints:
        for point in keypoints:
            col_px, row_px = point.pt
            # 50-pixel cells; indices are clamped to the last cell.
            grid_row = min(int(row_px // 50), 3)
            grid_col = min(int(col_px // 50), 3)
            grid[grid_row, grid_col] += 1
        grid /= len(keypoints)  # counts -> fractions

    return hs_hist.flatten(), grid.flatten()

def compute_similarity(args):
    """Turn the feature similarity of two images into a distance.

    Args:
        args: Tuple ``(i, j, color_feats, shape_feats)`` where ``i`` and ``j``
            index into the two feature sequences.

    Returns:
        ``(i, j, distance)`` with
        ``distance = 1 - (0.6 * color_correlation + 0.4 * shape_correlation)``.
    """
    first, second, color_feats, shape_feats = args
    method = cv2.HISTCMP_CORREL
    color_corr = cv2.compareHist(color_feats[first], color_feats[second], method)
    shape_corr = cv2.compareHist(shape_feats[first], shape_feats[second], method)
    # Weighted blend: colour counts for 60%, keypoint layout for 40%.
    blended = 0.6 * color_corr + 0.4 * shape_corr
    return (first, second, 1 - blended)

def get_cluster_description(cluster_id, members):
    """Return a one-line, human-readable summary of a cluster.

    Args:
        cluster_id: Identifier of the cluster (not used in the text).
        members: Sequence of the cluster's members; only its length matters.
    """
    member_total = len(members)
    if member_total != 1:
        return f"Group of {member_total} similar logos (color weight: 60%, shape weight: 40%)"
    return "Unique item with no close matches"

def main(folder='logos2', output_file="ORB+color_preprocess_cluster.json", distance_threshold=0.35):
    """Cluster the logo images in ``folder`` and write the result as JSON.

    Steps: load the images, extract colour/shape features in parallel, build
    the pairwise distance matrix with ``compute_similarity``, run
    average-linkage agglomerative clustering on it, and dump the clusters
    together with the run parameters to ``output_file``.

    Args:
        folder: Directory containing the logo images.
        output_file: Path of the JSON report to write.
        distance_threshold: Distance above which clusters are not merged.
    """
    # Local import: the import block of this cell does not bring json into
    # scope, so relying on a later cell would fail on a fresh kernel.
    import json

    filenames, images = load_images_from_folder(folder)
    n = len(filenames)
    if n == 0:
        # zip(*features) below cannot be unpacked for an empty image list.
        print(f"No readable images found in '{folder}'; nothing to cluster.")
        return

    features = Parallel(n_jobs=-1)(delayed(extract_features)(img) for img in images)  # feature extraction
    color_feats, shape_feats = zip(*features)

    distance_matrix = np.zeros((n, n))
    if n > 1:
        # Every unordered pair is compared once; the matrix is filled symmetrically.
        pairs = [(i, j, color_feats, shape_feats) for i in range(n) for j in range(i + 1, n)]
        results = Parallel(n_jobs=-1)(delayed(compute_similarity)(pair) for pair in pairs)
        for i, j, dist in results:
            distance_matrix[i, j] = dist
            distance_matrix[j, i] = dist

        clusterer = AgglomerativeClustering(
            metric='precomputed',
            linkage='average',
            distance_threshold=distance_threshold,
            n_clusters=None
        )
        clusters = clusterer.fit_predict(distance_matrix)
    else:
        # The clusterer needs at least two samples; a lone image is its own cluster.
        clusters = np.zeros(1, dtype=int)

    cluster_groups = defaultdict(list)  # cluster id -> member file names
    for idx, cluster_id in enumerate(clusters):
        cluster_groups[int(cluster_id)].append(filenames[idx])

    output_data = {
        "metadata": {
            "algorithm": "AgglomerativeClustering",
            "parameters": {
                "linkage_method": "average",
                "distance_threshold": distance_threshold,
                "feature_weights": {
                    "color_histogram": 0.6,
                    "shape_distribution": 0.4
                },
                "total_clusters": len(cluster_groups),
                "total_images": n
            }
        },
        "clusters": []
    }

    for cluster_id, members in cluster_groups.items():
        output_data["clusters"].append({
            "cluster_id": cluster_id,
            "member_count": len(members),
            "members": members,
            "characteristics": {
                "description": get_cluster_description(cluster_id, members),
                "average_similarity": "N/A",  # Could be calculated from distance_matrix
                "unique_features": ["color", "shape"]  # Based on our feature weights
            }
        })

    # ensure_ascii=False can emit non-ASCII characters, so fix the file encoding.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=4, ensure_ascii=False)

    print(f"Clustering completed. Results saved to {output_file}")

# Entry point: run the clustering pipeline.
if __name__ == '__main__':
    main()

Computing pairwise similarities...
Clustering...
Clustering completed. Results saved to cluster.json


JSON viewer (without histogram)

In [40]:
import json
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from textwrap import wrap
import math

def _pack_member_lines(members, max_chars):
    """Greedily pack "• name" items into lines of at most ``max_chars`` characters.

    Returns a list of ``(text, item_count)`` tuples so the caller knows how
    many members each rendered line accounts for. An item longer than
    ``max_chars`` gets a line of its own rather than being split.
    """
    lines = []
    current, count = "", 0
    for member in members:
        item = f"• {member}"
        if count and len(current) + 1 + len(item) > max_chars:
            lines.append((current, count))
            current, count = item, 1
        else:
            current = f"{current} {item}" if count else item
            count += 1
    if count:
        lines.append((current, count))
    return lines


def visualize_clusters(json_path, output_filename, image_size=(3840, 4000), columns=2):
    """Render the clusters stored in a JSON report as a grid of text boxes.

    Each cluster gets one box with a four-line header (id, description, member
    count, features) followed by as many member names as fit; the remainder is
    summarised as "+ N more items...".

    Args:
        json_path: JSON file with a top-level ``"clusters"`` list whose entries
            provide ``cluster_id``, ``member_count``, ``members`` and
            ``characteristics`` (``description``, ``unique_features``).
        output_filename: Path of the image file to write.
        image_size: ``(width, height)`` of the canvas in pixels. The height is
            increased automatically when the boxes would otherwise be too
            small to hold the header.
        columns: Number of box columns.
    """
    with open(json_path, encoding='utf-8') as f:
        data = json.load(f)

    clusters = data['clusters']
    if not clusters:
        # Zero rows would lead to a division by zero in the layout below.
        print("No clusters to visualize")
        return

    DPI = 100
    FONT_SIZE = 14  # points
    BOX_PADDING = 20
    TEXT_PADDING = 10
    LINE_HEIGHT = FONT_SIZE * 1.5
    HEADER_LINES = 4
    # Font sizes are in points (1/72 inch) while the layout is in pixels.
    # 0.6 em per character is an estimate for the monospace font - TODO confirm.
    CHAR_WIDTH = 0.6 * FONT_SIZE * DPI / 72

    width, height = image_size
    rows = math.ceil(len(clusters) / columns)
    box_width = (width - BOX_PADDING * (columns + 1)) // columns
    box_height = (height - BOX_PADDING * (rows + 1)) // rows

    # Header + blank line + one member line + overflow note must always fit.
    min_box_height = math.ceil(2 * TEXT_PADDING + (HEADER_LINES + 3) * LINE_HEIGHT)
    if box_height < min_box_height:
        # Too many rows for the requested height: grow the canvas instead of
        # drawing boxes with a tiny or negative height. NOTE(review): very tall
        # canvases may exceed the renderer's size limit - raise `columns` then.
        box_height = min_box_height
        height = rows * (box_height + BOX_PADDING) + BOX_PADDING

    fig = plt.figure(figsize=(width / DPI, height / DPI), dpi=DPI)
    # Axes that fill the whole figure, so one data unit equals one pixel.
    ax = fig.add_axes([0, 0, 1, 1])
    ax.set_xlim(0, width)
    ax.set_ylim(height, 0)  # origin at the top-left corner
    ax.axis('off')

    chars_per_line = max(int((box_width - 2 * TEXT_PADDING) / CHAR_WIDTH), 1)

    for idx, cluster in enumerate(clusters):
        row, col = divmod(idx, columns)
        x = BOX_PADDING + col * (box_width + BOX_PADDING)
        y = BOX_PADDING + row * (box_height + BOX_PADDING)
        ax.add_patch(Rectangle(
            (x, y), box_width, box_height,
            linewidth=1, edgecolor='#333333', facecolor='none'
        ))

        header_text = [
            f"Cluster ID: {cluster['cluster_id']}",
            f"Description: {cluster['characteristics']['description']}",
            f"Members: {cluster['member_count']} items",
            f"Features: {', '.join(cluster['characteristics']['unique_features'])}"
        ]
        text_y = y + TEXT_PADDING
        for line in header_text:
            ax.text(
                x + TEXT_PADDING, text_y, line,
                fontsize=FONT_SIZE, ha='left', va='top',
                fontfamily='monospace', color='#333333',
                linespacing=1.2
            )
            text_y += LINE_HEIGHT

        text_y += LINE_HEIGHT  # blank line between header and member list

        # Whole lines still available inside the box below the blank line.
        max_lines = int((y + box_height - TEXT_PADDING - text_y) // LINE_HEIGHT)
        packed = _pack_member_lines(cluster['members'], chars_per_line)
        if len(packed) > max_lines:
            # Reserve the last line for the note and count hidden members, not lines.
            kept = packed[:max(max_lines - 1, 0)]
            shown = sum(item_count for _, item_count in kept)
            member_lines = [text for text, _ in kept] + [
                f"+ {len(cluster['members']) - shown} more items..."
            ]
        else:
            member_lines = [text for text, _ in packed]

        for line in member_lines:
            ax.text(
                x + TEXT_PADDING, text_y, line,
                fontsize=FONT_SIZE, ha='left', va='top',
                fontfamily='monospace', color='#666666',
                linespacing=1.2
            )
            text_y += LINE_HEIGHT

    fig.savefig(output_filename, bbox_inches='tight', pad_inches=0.1, dpi=DPI)
    plt.close(fig)
    print(f"Saved")

# Render the ORB + colour clustering report as a PNG overview.
visualize_clusters(
    json_path="ORB+color_preprocess_cluster.json",
    output_filename="ORB+color_preprocess_agglomerative_clustering.png",
)

Saved
