In [22]:
import hashlib
import csv
import os
import imagehash
from PIL import Image
def file_checksum(filepath, algo="sha256"):
    """Return the hexadecimal digest of the file at ``filepath``.

    The file is opened in binary mode and consumed in 4096-byte pieces, so
    the whole file is never held in memory at once.

    Args:
        filepath: Path of the file to hash.
        algo: Algorithm name passed to ``hashlib.new`` (default ``"sha256"``).

    Returns:
        The digest as a hexadecimal string.
    """
    digest = hashlib.new(algo)
    with open(filepath, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:  # an empty bytes object marks end of file
                break
            digest.update(block)
    return digest.hexdigest()
def image_visual_hash(image_path):
    """Return the average hash of an image as a string.

    Args:
        image_path: Path of the image file to hash.

    Returns:
        ``str(imagehash.average_hash(...))`` for the image, or ``None`` when
        the image cannot be opened or hashed (the error is printed, not raised).
    """
    try:
        # Open the image in a context manager so the underlying file handle
        # is released deterministically, even if hashing fails part-way.
        with Image.open(image_path) as img:
            return str(imagehash.average_hash(img))
    except Exception as e:
        # Best-effort by design: one unreadable image must not abort a batch.
        print(f"Error hashing {image_path}: {e}")
        return None
def _find_image_file(base_dir, filename, extensions):
    """Return the first existing ``base_dir/filename + ext`` path, or None."""
    for ext in extensions:
        candidate = os.path.join(base_dir, filename + ext)
        if os.path.isfile(candidate):
            return candidate
    return None


def extend_csv_with_checksum(csv_path, output_csv, filename_column="filename", base_dir=".", extensions=(".jpg", ".png")):
    """Copy a CSV, adding ``checksum`` and ``visual_hash`` columns per row.

    For every row the image is looked up as ``base_dir/<name><ext>``, trying
    each entry of ``extensions`` in order. When a file is found, its checksum
    (``file_checksum``) and visual hash (``image_visual_hash``) are stored;
    otherwise both columns are set to the string ``"File not found"``.

    Args:
        csv_path: Input CSV, read as UTF-8 with a header row.
        output_csv: Destination CSV; overwritten if it exists.
        filename_column: Column holding the image name without extension.
        base_dir: Directory that contains the image files.
        extensions: File extensions to try, in priority order.

    Raises:
        KeyError: If a row has no ``filename_column`` entry.
    """
    with open(csv_path, newline="", encoding="utf-8") as infile:
        reader = csv.DictReader(infile)
        rows = list(reader)
        # ``fieldnames`` is None for a completely empty file; treat that as
        # "no columns" instead of failing on ``None + list``.
        fieldnames = list(reader.fieldnames or [])

    # Do not duplicate header columns when the input was already extended.
    for column in ("checksum", "visual_hash"):
        if column not in fieldnames:
            fieldnames.append(column)

    for row in rows:
        file_path = _find_image_file(base_dir, row[filename_column], extensions)
        if file_path is None:
            row["checksum"] = "File not found"
            row["visual_hash"] = "File not found"
        else:
            row["checksum"] = file_checksum(file_path)
            # May be None if hashing failed; DictWriter writes that as "".
            row["visual_hash"] = image_visual_hash(file_path)

    with open(output_csv, "w", newline="", encoding="utf-8") as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

# Example usage:
# extend_csv_with_checksum("filenames.csv", "checksums.csv", base_dir="./data")

In [18]:
# Add checksum and visual-hash columns to the public dataset CSV.
# NOTE(review): base_dir is an absolute path into one user's kagglehub cache,
# so this cell only runs on that machine - consider a configurable DATA_DIR.
extend_csv_with_checksum(
    csv_path="../../data/public_dataset.csv",
    output_csv="../../data/public_dataset_with_checksum_image.csv",
    filename_column="Image_id",
    base_dir=r"C:\Users\stani\.cache\kagglehub\datasets\stanislavlevendeev\haz-mat-signs\versions\7\images",
)

In [23]:
import requests
from bs4 import BeautifulSoup
import csv
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time

def get_image_urls(url, base_url="https://www.sinanlitanker.com.tr/", timeout=30):
    """Collect image URLs from the static HTML of a page.

    Every ``<img>`` tag with a non-empty ``src`` contributes one URL:

    * an absolute ``src`` containing ``"thumb"`` is rewritten by replacing
      every occurrence of ``"thumb"`` with ``"original"``;
    * any other ``src`` is resolved against ``base_url``.

    Args:
        url: Page to fetch.
        base_url: Base used to resolve the remaining ``src`` values.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        List of image URLs in document order.
    """
    # Without a timeout a stalled server would hang the cell forever.
    resp = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(resp.content, "html.parser")
    img_urls = []
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src:
            continue
        if src.startswith("http") and "thumb" in src:
            img_urls.append(src.replace("thumb", "original"))
        else:
            # The original code called the builtin ``next()`` with no
            # arguments here, which raises TypeError and aborted the scrape on
            # the first non-thumbnail image. The call has been removed.
            img_urls.append(requests.compat.urljoin(base_url, src))
    return img_urls
def get_image_urls_dynamic(url, wait_time=10):
    """Collect full-size image URLs from a JavaScript-rendered page.

    Loads ``url`` in headless Chrome, sleeps ``wait_time`` seconds so that
    scripts can populate the page, then inspects every ``<img>`` element.
    Only absolute ``src`` values containing ``"thumb"`` are kept, with every
    occurrence of ``"thumb"`` replaced by ``"original"``.

    Args:
        url: Page to open.
        wait_time: Seconds to sleep after navigation before reading the DOM.

    Returns:
        List of rewritten image URLs in DOM order.
    """
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        time.sleep(wait_time)  # Wait for JS to load content; adjust as needed

        img_urls = []
        for img in driver.find_elements(By.TAG_NAME, "img"):
            src = img.get_attribute("src")
            if src and src.startswith("http") and "thumb" in src:
                img_urls.append(src.replace("thumb", "original"))
        return img_urls
    finally:
        # Always shut the browser down, even when navigation or a DOM lookup
        # raises; otherwise every failed run leaks a Chrome process.
        driver.quit()
# Running number prefixed to every downloaded file name so that images sharing
# a basename (e.g. ".../original.jpg") do not overwrite each other.
# Re-running this cell resets it to 0.
counter = 0
def download_image(url, dest_folder="../../data/downloaded_images", timeout=30):
    """Download ``url`` into ``dest_folder`` and return the local file path.

    The file is named ``<counter><basename>``, where the basename is taken
    from the URL with any query string removed and ``counter`` is the
    module-level download counter (incremented on every call whose request
    was sent, including non-200 responses).

    Args:
        url: Image URL to fetch.
        dest_folder: Target directory; created if missing.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Path of the written file, or ``None`` if the response status is not 200.
    """
    global counter
    os.makedirs(dest_folder, exist_ok=True)
    # A streamed response holds its connection until it is closed; the context
    # manager releases it on every path (success, non-200, write error).
    with requests.get(url, stream=True, timeout=timeout) as resp:
        counter += 1
        filename = os.path.join(dest_folder, str(counter)+os.path.basename(url.split("?")[0]))
        if resp.status_code != 200:
            return None
        with open(filename, "wb") as f:
            for chunk in resp.iter_content(1024):
                f.write(chunk)
    return filename

def image_visual_hash(image_path):
    """Return the average hash of an image as a string.

    NOTE(review): this re-defines the function of the same name from the
    checksum cell above and silently shadows it; it also relies on that cell
    for the ``imagehash`` and ``Image`` imports. Keep a single definition.

    Args:
        image_path: Path of the image file to hash.

    Returns:
        ``str(imagehash.average_hash(...))`` for the image, or ``None`` when
        the image cannot be opened or hashed (the error is printed, not raised).
    """
    try:
        # Open the image in a context manager so the underlying file handle
        # is released deterministically, even if hashing fails part-way.
        with Image.open(image_path) as img:
            return str(imagehash.average_hash(img))
    except Exception as e:
        # Best-effort by design: one unreadable image must not abort a batch.
        print(f"Error hashing {image_path}: {e}")
        return None

def update_csv_links_with_direct_images(page_url, csv_path, output_csv, checksum_column="checksum", link_column="link", use_visual_hash=True, images_dir=r"C:\Users\stani\.cache\kagglehub\datasets\stanislavlevendeev\haz-mat-signs\versions\7\images"):
    """Fill a CSV's link column with direct image URLs scraped from a page.

    Steps:
      1. Scrape ``page_url`` with ``get_image_urls_dynamic``, download each
         image and index its URL by file checksum and, optionally, by
         visual hash.
      2. For every CSV row, set ``link_column`` to the URL whose checksum
         equals the row's ``checksum_column``. If there is none and
         ``use_visual_hash`` is true, hash the row's local image (looked up in
         ``images_dir`` as ``<name>.jpg`` then ``<name>.png``) and match on
         the visual hash instead.
      3. Write all rows to ``output_csv``.

    Args:
        page_url: Page whose images are scraped.
        csv_path: Input CSV, read as UTF-8 with a header row.
        output_csv: Destination CSV; overwritten if it exists.
        checksum_column: Column holding each row's file checksum.
        link_column: Column to receive the matched URL. It is appended to the
            header when the input CSV does not have it.
        use_visual_hash: Fall back to visual-hash matching when the checksum
            does not match.
        images_dir: Directory with the local dataset images. The default is
            the machine-specific location the notebook was written against;
            pass your own directory on any other machine.
    """
    # 1. Scrape and download images, compute checksums and visual hashes
    checksum_to_url = {}
    visualhash_to_url = {}
    for img_url in get_image_urls_dynamic(page_url):
        print(f"Processing: {img_url}")
        local_path = download_image(img_url)
        if not local_path:
            continue
        checksum_to_url[file_checksum(local_path)] = img_url
        if use_visual_hash:
            vhash = image_visual_hash(local_path)
            if vhash:
                visualhash_to_url[vhash] = img_url

    # 2. Read the original CSV and update links if checksum or visual hash matches
    with open(csv_path, newline="", encoding="utf-8") as infile:
        reader = csv.DictReader(infile)
        rows = list(reader)
        # ``fieldnames`` is None for an empty file; DictWriter needs a list.
        fieldnames = list(reader.fieldnames or [])

    # DictWriter raises ValueError for row keys missing from the header, so
    # make sure the link column is part of it before rows are written.
    if link_column not in fieldnames:
        fieldnames.append(link_column)

    for row in rows:
        checksum = row.get(checksum_column)
        # Try file checksum match first
        if checksum in checksum_to_url:
            row[link_column] = checksum_to_url[checksum]
            continue
        if not use_visual_hash:
            continue

        # If not found, try visual hash match against the local image file
        matched_url = None
        filename = row.get("filename") or row.get("Image_id") or row.get("image_name")
        if filename:
            for ext in (".jpg", ".png"):
                local_file = os.path.join(images_dir, filename + ext)
                if os.path.isfile(local_file):
                    vhash = image_visual_hash(local_file)
                    if vhash and vhash in visualhash_to_url:
                        matched_url = visualhash_to_url[vhash]
                        break
        if matched_url is not None:
            row[link_column] = matched_url
        else:
            # No match: keep the existing link (empty string if the row had none).
            row[link_column] = row.get(link_column, "")

    # 3. Write updated CSV
    with open(output_csv, "w", newline="", encoding="utf-8") as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

In [27]:

# Example usage: scrape the Roboflow browse page and write matched direct image
# URLs into the "Link" column of the dataset CSV.
# NOTE(review): the input "..._updated.csv" is not produced by any cell shown
# here - confirm where it comes from before a Restart & Run All.
update_csv_links_with_direct_images(
    "https://universe.roboflow.com/masters-thesis-yolo-model/adr-tankers-detection/browse?queryText=&pageSize=200&startingIndex=0&browseQuery=true",
    "../../data/public_dataset_with_checksum_image_updated.csv",
    "../../data/public_dataset_with_checksum_image_updated_2.csv",
    checksum_column="checksum",
    link_column="Link",
)

Processing: https://source.roboflow.com/XZtYbRZ7qB4ilZN5GngD/OMoG5MiCkD1bz8uQMrnp/original.jpg
Processing: https://source.roboflow.com/XZtYbRZ7qB4ilZN5GngD/qAHNGmcYp0m5ezvxcPQH/original.jpg
Processing: https://source.roboflow.com/XZtYbRZ7qB4ilZN5GngD/08qq4lFVxmJQZ6hCVQD5/original.jpg
Processing: https://source.roboflow.com/XZtYbRZ7qB4ilZN5GngD/eNI1H2fEPbbBGOXIiloc/original.jpg
Processing: https://source.roboflow.com/XZtYbRZ7qB4ilZN5GngD/zTmAIyMHfbthnlCLaZW4/original.jpg
Processing: https://source.roboflow.com/XZtYbRZ7qB4ilZN5GngD/Zb55M2bOHl4PJvMc1Po6/original.jpg
Processing: https://source.roboflow.com/XZtYbRZ7qB4ilZN5GngD/UXdTQWbpFW64mCH9UiTX/original.jpg
Processing: https://source.roboflow.com/XZtYbRZ7qB4ilZN5GngD/Wsq4bG2JIUsEXlVoMooe/original.jpg
Processing: https://source.roboflow.com/XZtYbRZ7qB4ilZN5GngD/rNgCMy0iMoVGTfe4r476/original.jpg
Processing: https://source.roboflow.com/XZtYbRZ7qB4ilZN5GngD/ulwkTKFF3vgmCzLfaXR4/original.jpg
Processing: https://source.roboflow.com/XZtYbRZ7qB