In [3]:
import requests
from bs4 import BeautifulSoup
import os
import shutil
from urllib.parse import urljoin, urlparse
from PIL import Image
from io import BytesIO

def delete_directory_if_exists(directory):
    if os.path.exists(directory):
        shutil.rmtree(directory)

def download_images_from_url(url, save_dir):
    # Create the directory to save the images
    os.makedirs(save_dir, exist_ok=True)

    # Start a session to persist headers and cookies
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    })

    # Fetch the HTML content
    response = session.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all image tags
    img_tags = soup.find_all('img')

    # Extract image URLs and download them
    for img in img_tags:
        img_url = img.get('src')
        
        if img_url:
            # If the URL is relative, make it absolute
            img_url = urljoin(url, img_url)
            
            try:
                # Verify if URL is correctly formed
                parsed_url = urlparse(img_url)
                if not parsed_url.scheme or not parsed_url.netloc:
                    raise ValueError(f"Invalid URL: {img_url}")
                
                # Extract image name
                img_name = os.path.join(save_dir, os.path.basename(parsed_url.path))
                
                # Log URL being downloaded
                print(f"Attempting to download: {img_url}")
                
                # Download the image
                img_response = session.get(img_url, allow_redirects=True)
                if img_response.status_code == 200:
                    with open(img_name, 'wb') as f:
                        f.write(img_response.content)
                    
                    # Save the image as PNG if there is data
                    image = Image.open(BytesIO(img_response.content))
                    if image:
                        png_name = os.path.splitext(img_name)[0] + '.png'
                        image.save(png_name)
                        print(f"Saved as PNG: {png_name}")
                        
                    print(f"Downloaded: {img_name}")
                else:
                    print(f"Failed to download {img_url}: HTTP Status {img_response.status_code}")
            except Exception as e:
                print(f"Failed to download {img_url}: {e}")

    print(f"Image downloading process completed for {url}.")

# URLs to download images from
#url1 = actual 
#url2 = fake
url1 = 'https://login.yahoo.com/?.intl=us&done=https%3A%2F%2Fwww.yahoo.com%2F&add=1'
url2 = 'https://pub-bec2426b4a3f4b5aa714d43c72068106.r2.dev/login.htm'

# Directories to save the images
save_dir1 = "url_1_images"
save_dir2 = "url_2_images"

# Delete directories if they already exist
delete_directory_if_exists(save_dir1)
delete_directory_if_exists(save_dir2)

# Download images from both URLs
download_images_from_url(url1, save_dir1)
download_images_from_url(url2, save_dir2)

Attempting to download: https://s.yimg.com/rz/p/yahoo_frontpage_en-US_s_f_p_bestfit_frontpage_2x.png
Saved as PNG: url_1_images\yahoo_frontpage_en-US_s_f_p_bestfit_frontpage_2x.png
Downloaded: url_1_images\yahoo_frontpage_en-US_s_f_p_bestfit_frontpage_2x.png
Attempting to download: https://s.yimg.com/rz/p/yahoo_frontpage_en-US_s_f_w_bestfit_frontpage_2x.png
Saved as PNG: url_1_images\yahoo_frontpage_en-US_s_f_w_bestfit_frontpage_2x.png
Downloaded: url_1_images\yahoo_frontpage_en-US_s_f_w_bestfit_frontpage_2x.png
Attempting to download: https://s.yimg.com/rz/p/yahoo_frontpage_en-US_s_f_p_bestfit_frontpage_2x.png
Saved as PNG: url_1_images\yahoo_frontpage_en-US_s_f_p_bestfit_frontpage_2x.png
Downloaded: url_1_images\yahoo_frontpage_en-US_s_f_p_bestfit_frontpage_2x.png
Attempting to download: https://s.yimg.com/rz/p/yahoo_frontpage_en-US_s_f_w_bestfit_frontpage_2x.png
Saved as PNG: url_1_images\yahoo_frontpage_en-US_s_f_w_bestfit_frontpage_2x.png
Downloaded: url_1_images\yahoo_frontpage_e

In [4]:
import os
from PIL import Image
import imagehash

# Directories containing the images
dir1 = "url_1_images"
dir2 = "url_2_images"

# Get list of image files in both directories
images_dir1 = [os.path.join(dir1, f) for f in os.listdir(dir1) if os.path.isfile(os.path.join(dir1, f))]
images_dir2 = [os.path.join(dir2, f) for f in os.listdir(dir2) if os.path.isfile(os.path.join(dir2, f))]

# Function to calculate the similarity between two images
def calculate_similarity(image1, image2):
    hash1 = imagehash.phash(image1)
    hash2 = imagehash.phash(image2)
    return hash1 - hash2  # Hamming distance

# Compare each image in dir2 with all images in dir1
results = {}
for img2_path in images_dir2:
    try:
        img2 = Image.open(img2_path)
    except Exception as e:
        print(f"Error opening image {img2_path}: {e}")
        continue
    
    best_match = None
    lowest_distance = float('inf')
    
    for img1_path in images_dir1:
        try:
            img1 = Image.open(img1_path)
        except Exception as e:
            print(f"Error opening image {img1_path}: {e}")
            continue
        
        distance = calculate_similarity(img1, img2)
        if distance < lowest_distance:
            lowest_distance = distance
            best_match = img1_path
    
    if best_match:
        results[img2_path] = (best_match, lowest_distance)

# Print results
for img2_path, (best_match, lowest_distance) in results.items():
    print(f"Image {os.path.basename(img2_path)} is most similar to {os.path.basename(best_match)} with a similarity score (Hamming distance) of {lowest_distance}")

Image yahoo_frontpage_en-US_s_f_p_bestfit_frontpage_2x.png is most similar to yahoo_frontpage_en-US_s_f_p_bestfit_frontpage_2x.png with a similarity score (Hamming distance) of 0
Image yahoo_frontpage_en-US_s_f_w_bestfit_frontpage_2x.png is most similar to yahoo_frontpage_en-US_s_f_w_bestfit_frontpage_2x.png with a similarity score (Hamming distance) of 0




In [24]:
# whitelist image download
# import requests
# from bs4 import BeautifulSoup
# import os
# import shutil
# from urllib.parse import urljoin, urlparse

# def delete_directory_if_exists(directory):
#     if os.path.exists(directory):
#         shutil.rmtree(directory)

# def download_images_from_url(url, save_dir):
#     # Create the directory to save the images
#     os.makedirs(save_dir, exist_ok=True)

#     # Start a session to persist headers and cookies
#     session = requests.Session()
#     session.headers.update({
#         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
#     })

#     # Fetch the HTML content
#     response = session.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')

#     # Find all image tags
#     img_tags = soup.find_all('img')

#     # Extract image URLs and download them
#     for img in img_tags:
#         img_url = img.get('src')
        
#         if img_url:
#             # If the URL is relative, make it absolute
#             img_url = urljoin(url, img_url)
            
#             try:
#                 # Verify if URL is correctly formed
#                 parsed_url = urlparse(img_url)
#                 if not parsed_url.scheme or not parsed_url.netloc:
#                     raise ValueError(f"Invalid URL: {img_url}")
                
#                 # Extract image name
#                 img_name = os.path.join(save_dir, os.path.basename(parsed_url.path))
                
#                 # Log URL being downloaded
#                 print(f"Attempting to download: {img_url}")
                
#                 # Download the image
#                 img_response = session.get(img_url, allow_redirects=True)
#                 if img_response.status_code == 200:
#                     with open(img_name, 'wb') as f:
#                         f.write(img_response.content)
#                     print(f"Downloaded: {img_name}")
#                 else:
#                     print(f"Failed to download {img_url}: HTTP Status {img_response.status_code}")
#             except Exception as e:
#                 print(f"Failed to download {img_url}: {e}")

#     print(f"Image downloading process completed for {url}.")

# # URLs to download images from
# urls = {
#     "crsorgi-govi.com": "https://crsorgi-govi.com/web/index.php/auth/index",
#     "www.icicibank.com": "https://www.icicibank.com",
#     "www.hdfcbank.com": "https://www.hdfcbank.com",
#     "www.rbi.org.in": "https://www.rbi.org.in",
#     "www.airtel.in": "https://www.airtel.in",
#     "msme.gov.in": "https://msme.gov.in",
#     "state.bihar.gov.in": "https://state.bihar.gov.in",
#     "uidai.gov.in": "https://uidai.gov.in",
#     "www.isro.gov.in": "https://www.isro.gov.in",
#     "www.tatapower.com": "https://www.tatapower.com",
#     "www.cowin.gov.in": "https://www.cowin.gov.in",
#     "www.mohfw.gov.in": "https://www.mohfw.gov.in",
#     "parivahan.gov.in": "https://parivahan.gov.in",
#     "nciipc.gov.in": "https://nciipc.gov.in",
#     "registration.ind.in": "https://registration.ind.in",
#     "pmkusum.mnre.gov.in": "https://pmkusum.mnre.gov.in",
#     "indianrailways.gov.in": "https://indianrailways.gov.in",
#     "www.irctc.co.in": "https://www.irctc.co.in",
#     "www.onlinesbi.sbi":"https://www.onlinesbi.sbi"
# }

# # Download images from all URLs
# for domain, url in urls.items():
#     save_dir = f"{domain}_images"
#     delete_directory_if_exists(save_dir)
#     download_images_from_url(url, save_dir)
