In [35]:
import os
import csv
import requests
from bs4 import BeautifulSoup
import urllib
import shutil

def get_domain_logo_urls(domain):
    # Construct the search URL
    search_query = f"{domain} logo"
    search_url = f"https://www.google.com/search?q={search_query}&tbm=isch"

    # Send a request to Google Images
    response = requests.get(search_url)
    
    # Parse the HTML response
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all image results
    img_tags = soup.find_all('img')
    
    # Initialize a list to store image URLs
    img_urls = []
    
    # Iterate over the first three image tags
    for img_tag in img_tags[1:4]:
        # Get the image URL
        img_url = img_tag['src']
        # If it's a relative URL, prepend the Google Images domain
        if not img_url.startswith('http'):
            img_url = f"https://www.google.com{img_url}"
        img_urls.append(img_url)
    
    return img_urls

def download_images(urls, directory, prefix):

    # Create the directory if it doesn't exist
    os.makedirs(directory, exist_ok=True)
    
    # Download and save each image
    for i, url in enumerate(urls):
        filename = f"{prefix}_{i+1}.jpg"
        urllib.request.urlretrieve(url, os.path.join(directory, filename))

def delete_directory_if_exists(directory):
    if os.path.exists(directory):
        shutil.rmtree(directory)

# Read domains from whitelist.csv
with open('LogoList.csv', 'r') as file:
    delete_directory_if_exists("OrgLogos")
    reader = csv.reader(file)
    for row in reader:
        domain = row[0].strip()  # Assuming domain is in the first column
        logo_urls = get_domain_logo_urls(domain)
        if logo_urls:
            download_images(logo_urls, "OrgLogos", domain)


In [36]:
import requests
from bs4 import BeautifulSoup
import os
import shutil
from urllib.parse import urljoin, urlparse
from PIL import Image
from io import BytesIO

def delete_directory_if_exists(directory):
    if os.path.exists(directory):
        shutil.rmtree(directory)

def download_images_from_url(url, save_dir):
    # Create the directory to save the images
    os.makedirs(save_dir, exist_ok=True)

    # Start a session to persist headers and cookies
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    })

    # Fetch the HTML content
    response = session.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all image tags
    img_tags = soup.find_all('img')

    # Extract image URLs and download them
    for img in img_tags:
        img_url = img.get('src')
        
        if img_url:
            # If the URL is relative, make it absolute
            img_url = urljoin(url, img_url)
            
            try:
                # Verify if URL is correctly formed
                parsed_url = urlparse(img_url)
                if not parsed_url.scheme or not parsed_url.netloc:
                    raise ValueError(f"Invalid URL: {img_url}")
                
                # Extract image name
                img_name = os.path.join(save_dir, os.path.basename(parsed_url.path))
                
                # Log URL being downloaded
                print(f"Attempting to download: {img_url}")
                
                # Download the image
                img_response = session.get(img_url, allow_redirects=True)
                if img_response.status_code == 200:
                    with open(img_name, 'wb') as f:
                        f.write(img_response.content)
                    
                    # Save the image as PNG if there is data
                    image = Image.open(BytesIO(img_response.content))
                    if image:
                        png_name = os.path.splitext(img_name)[0] + '.png'
                        image.save(png_name)
                        print(f"Saved as PNG: {png_name}")
                        
                    print(f"Downloaded: {img_name}")
                else:
                    print(f"Failed to download {img_url}: HTTP Status {img_response.status_code}")
            except Exception as e:
                print(f"Failed to download {img_url}: {e}")

    print(f"Image downloading process completed for {url}.")

# URLs to download images from
#url1 = actual 
#url2 = fake
# realurl = 'https://www.google.co.in/'
# fakeurl = 'https://channelhub.info/t6df29443a8b6c4014s85a942b4o5263fbbc.html'
realurl="https://scigovn.in"
# Directories to save the images
save_dir1 = "url_images"


# Delete directories if they already exist
delete_directory_if_exists(save_dir1)

# Download images from both URLs
download_images_from_url(realurl, save_dir1)

Attempting to download: https://scigovn.in/static/picture/search_icon.svg
Failed to download https://scigovn.in/static/picture/search_icon.svg: cannot identify image file <_io.BytesIO object at 0x0000022E31771C10>
Attempting to download: https://scigovn.in/static/picture/social_media.svg
Failed to download https://scigovn.in/static/picture/social_media.svg: cannot identify image file <_io.BytesIO object at 0x0000022E33982CF0>
Attempting to download: https://scigovn.in/static/picture/youtube_icon.svg
Failed to download https://scigovn.in/static/picture/youtube_icon.svg: cannot identify image file <_io.BytesIO object at 0x0000022E34DDA2A0>
Attempting to download: https://scigovn.in/static/picture/sitemap_icon.svg
Failed to download https://scigovn.in/static/picture/sitemap_icon.svg: cannot identify image file <_io.BytesIO object at 0x0000022E34764540>
Attempting to download: https://scigovn.in/static/picture/accessibility_icon.svg
Failed to download https://scigovn.in/static/picture/acce

In [37]:
import cv2
import numpy as np
import os
import subprocess

def load_images_from_directory(directory):
    images = []
    for filename in os.listdir(directory):
        if filename.endswith(".jpg") or filename.endswith(".png"):
            image_path = os.path.join(directory, filename)
            image = cv2.imread(image_path)
            if image is not None:
                images.append((filename, image))
    return images

logo_images = load_images_from_directory("OrgLogos")

url_images = load_images_from_directory("url_images")

def calculate_image_similarity(image1_path, image2_path):
    try:
        # Use ImageMagick's compare command to calculate similarity
        result = subprocess.run(['magick', 'compare', '-metric', 'RMSE', image1_path, image2_path, 'null:'], capture_output=True, text=True)
        output = result.stderr.strip() if result.stderr else result.stdout.strip()
        
        if "Error:" in output:
            print(output)
            similarity_score = 0.0
            normalized_similarity_score = 0.0
        else:
            similarity_score = float(output.split()[0])
            normalized_similarity_score = float(output.split()[1][1:-1])  # Extracting normalized RMSE
        
    except Exception as e:
        print(f"Error calculating image similarity: {e}")
        similarity_score = 0.0
        normalized_similarity_score = 0.0
    
    return similarity_score, normalized_similarity_score

# Compare each image in url_images with each logo image
similarities = []
for url_image_name, url_image in url_images:
    for logo_name, logo_image in logo_images:
        image1_path = os.path.join("url_images", url_image_name)
        image2_path = os.path.join("OrgLogos", logo_name)
        similarity_score, normalized_similarity_score = calculate_image_similarity(image1_path, image2_path)
        similarities.append((url_image_name, logo_name, similarity_score, normalized_similarity_score))

# Find the two images with the highest similarity
top_similarities = sorted(similarities, key=lambda x: x[3])[:10]

# Print the results
for url_image_name, logo_name, similarity_score, normalized_similarity_score in top_similarities:
    print(f"Image: {url_image_name}, Logo: {logo_name}, Similarity Score: {similarity_score}, Normalized Similarity: {normalized_similarity_score}")


Image: NALSA.png, Logo: Unique Identification Authority of India_1.jpg, Similarity Score: 7283.29, Normalized Similarity: 0.111136
Image: NALSA.png, Logo: Supreme Court of India_1.jpg, Similarity Score: 7931.75, Normalized Similarity: 0.121031
Image: NALSA.png, Logo: Ministry of Health and Family Welfare_3.jpg, Similarity Score: 8744.13, Normalized Similarity: 0.133427
Image: NALSA.png, Logo: Tata Power Company Limited_2.jpg, Similarity Score: 9064.25, Normalized Similarity: 0.138312
Image: NALSA.png, Logo: Indian Space Research Organisation (ISRO)_3.jpg, Similarity Score: 9090.12, Normalized Similarity: 0.138706
Image: SCLSC.jpg, Logo: Unique Identification Authority of India_1.jpg, Similarity Score: 9133.04, Normalized Similarity: 0.139361
Image: SCLSC.png, Logo: Unique Identification Authority of India_1.jpg, Similarity Score: 9133.04, Normalized Similarity: 0.139361
Image: NALSA.png, Logo: Indian Space Research Organisation (ISRO)_2.jpg, Similarity Score: 9294.87, Normalized Simila