In [10]:
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
import shutil

# Function to initialize Firefox driver
def initialize_driver():
    options = Options()
    options.headless = True  # Set headless mode
    driver = Firefox(options=options)
    driver.set_window_size(1920, 1080)
    return driver

# Function to take screenshot of a URL
def take_screenshot(driver, url, file_name):
    driver.get(url)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
    driver.save_screenshot(file_name)

def main():
    # Relative paths
    whitelist_file_path = r'C:\Users\Intern\Desktop\Tanny Manny\detect-phishing-domains-main\detect-phishing-domain-script-screenshot\whitelist.csv'
    screenshot_folder = r'C:\Users\Intern\Desktop\Tanny Manny\detect-phishing-domains-main\detect-phishing-domain-script-screenshot\src_ss'
    
    # If screenshot folder exists, delete it and create a new one
    if os.path.exists(screenshot_folder):
        shutil.rmtree(screenshot_folder)  # Delete existing folder
    os.makedirs(screenshot_folder)  # Recreate the folder

    if os.path.exists(whitelist_file_path):
        driver = initialize_driver()
        with open(whitelist_file_path, "r") as whitelist_file:
            for i, line in enumerate(whitelist_file):
                if i == 0:
                    continue  # Skip the header line
                domain = line.strip()
                # Generate file name based on domain
                file_name = os.path.join(screenshot_folder, domain.replace(".", "_") + ".png")
                # Take screenshot
                take_screenshot(driver, "http://" + domain, file_name)
                print(f"screenshot taken for {domain}")
        driver.quit()
    else:
        print("whitelist.csv file does not exist.")

    

if __name__ == '__main__':
    main()


screenshot taken for www.icicibank.com
screenshot taken for www.hdfcbank.com
screenshot taken for www.rbi.org.in
screenshot taken for www.airtel.in
screenshot taken for msme.gov.in
screenshot taken for state.bihar.gov.in
screenshot taken for www.isro.gov.in
screenshot taken for www.tatapower.com
screenshot taken for www.cowin.gov.in
screenshot taken for www.mohfw.gov.in
screenshot taken for nciipc.gov.in
screenshot taken for registration.ind.in
screenshot taken for pmkusum.mnre.gov.in
screenshot taken for indianrailways.gov.in
screenshot taken for www.irctc.co.in
screenshot taken for yonobusiness.sbi
screenshot taken for www.onlinesbi.sbi
screenshot taken for www.sbicard.com
screenshot taken for www.sbisecurities.in
screenshot taken for retail.onlinesbi.sbiretaillogin.htm
screenshot taken for crsorgi.gov.in


In [4]:
import os
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import subprocess
from PIL import Image
import imagehash

# Function to initialize Firefox driver
def initialize_driver():
    options = Options()
    options.headless = True  # Set headless mode
    driver = Firefox(options=options)
    driver.set_window_size(1920, 1080)
    return driver

# Function to take screenshot of a URL
def take_screenshot(driver, url, filepath):
    driver.get(url)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
    driver.save_screenshot(filepath)  # Save the screenshot to the specified file path

# Function to calculate image similarity
def calculate_image_similarity(image1_path, image2_path):
    try:
        result = subprocess.run(['magick', 'compare', '-metric', 'RMSE', image1_path, image2_path, 'null:'], capture_output=True, text=True)
        output = result.stderr.strip() if result.stderr else result.stdout.strip()
        
        if "Error:" in output:
            print(output)
            similarity_score = 0.0
            normalized_similarity_score = 0.0
        else:
            similarity_score = float(output.split()[0])
            normalized_similarity_score = float(output.split()[1][1:-1])  # Extracting normalized RMSE
        
    except Exception as e:
        print(f"Error calculating image similarity: {e}")
        similarity_score = 0.0
        normalized_similarity_score = 0.0
    
    return similarity_score, normalized_similarity_score


def main():
    # Paths
    blacklist_file_path = r'C:\Users\Intern\Desktop\Tanny Manny\detect-phishing-domains-main\detect-phishing-domain-script-screenshot\blacklist.csv'
    screenshot_folder = r'C:\Users\Intern\Desktop\Tanny Manny\detect-phishing-domains-main\detect-phishing-domain-script-screenshot\src_ss'
    screenshot_filepath = os.path.join(screenshot_folder, "screenshot.png")

    if os.path.exists(blacklist_file_path):
        driver = initialize_driver()

        # Iterate over URLs in blacklist.csv
        with open(blacklist_file_path, "r") as blacklist_file:
            for line in blacklist_file:
                domain = line.strip()

                # Take screenshot of the site and save it as "screenshot.png"
                take_screenshot(driver, "http://" + domain, screenshot_filepath)
                
                min_similarity_scores = []  # List to store top 5 similarity scores
                min_similarity_images = []  # List to store top 5 similar images

                # Iterate over files in src_ss folder
                for existing_image in os.listdir(screenshot_folder):
                    existing_image_path = os.path.join(screenshot_folder, existing_image)
                    # Skip comparison for "screenshot.png"
                    if existing_image == "screenshot.png":
                        continue
                    image_similarity, normalized_image_similarity = calculate_image_similarity(existing_image_path, screenshot_filepath)
                    # Check if the current similarity score is higher than any of the top 5
                    for i, score in enumerate(min_similarity_scores):
                        if normalized_image_similarity < score:
                            min_similarity_scores.insert(i, normalized_image_similarity)
                            min_similarity_images.insert(i, existing_image)
                            # Keep only the top 5 scores and images
                            min_similarity_scores = min_similarity_scores[:5]
                            min_similarity_images = min_similarity_images[:5]
                            break
                    else:
                        # If the score wasn't inserted, append it if there are less than 5 top scores
                        if len(min_similarity_scores) < 5:
                            min_similarity_scores.append(normalized_image_similarity)
                            min_similarity_images.append(existing_image)

                # Print the top 5 similarity scores and their corresponding images
                print(f"The top 5 similar sites for {domain}:")
                for score, image in zip(min_similarity_scores, min_similarity_images):
                    print(f"  Similarity Score: {score}, Image: {image}")

        # Delete the blacklist screenshot after comparison
        if os.path.exists(screenshot_filepath):
            os.remove(screenshot_filepath)
            print("Screenshot 'screenshot.png' deleted.")
        else:
            print("Screenshot 'screenshot.png' does not exist.")

        driver.quit()
    else:
        print("blacklist.csv file does not exist.")

if __name__ == '__main__':
    main()


The top 5 similar sites for crsgovorgi.in:
  Similarity Score: 0.173169, Image: registration_ind_in.png
  Similarity Score: 0.182902, Image: www_onlinesbi_sbi.png
  Similarity Score: 0.223716, Image: www_mohfw_gov_in.png
  Similarity Score: 0.233644, Image: crsorgi_gov_in.png
  Similarity Score: 0.250102, Image: www_airtel_in.png
Screenshot 'screenshot.png' deleted.


In [1]:
#to compare 2 sites
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException
import time
import base64
from PIL import Image
import imagehash
from sklearn.feature_extraction.text import TfidfVectorizer
import subprocess
import os

# Initially vectorise using tfidr and then check similarity using cosine similarity
def calculate_text_similarity(paragraph1, paragraph2):
    if not paragraph1 or not paragraph2:
        return 0.0
    
    sentences = [paragraph1, paragraph2]
    vectorizer = TfidfVectorizer()
    
    try:
        tfidf_matrix = vectorizer.fit_transform(sentences)
        cosine_sim = (tfidf_matrix * tfidf_matrix.T).A
        similarity_percentage = cosine_sim[0, 1] * 100
    except ValueError as e:
        print(f"Error calculating similarity: {e}")
        similarity_percentage = 0.0
    
    return similarity_percentage


def calculate_image_hash(image_path):
    # Calculate the perceptual hash (phash) of the image
    hash_value = imagehash.phash(Image.open(image_path))
    return hash_value

def take_full_page_screenshot(url, file_name, retries=3):
    options = Options()
    options.headless = True

    # Check if the file already exists and delete it
    if os.path.exists(file_name):
        os.remove(file_name)

    while retries > 0:
        try:
            driver = Firefox(options=options)
            driver.set_window_size(1920, 1080)
            driver.get(url)
            
            # Use WebDriverWait to wait until the page is fully loaded
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, 'body'))
            )
            
            # Get the dimensions of the page
            total_width = driver.execute_script("return document.body.scrollWidth")
            total_height = driver.execute_script("return document.body.scrollHeight")
            driver.set_window_size(total_width, total_height)
            time.sleep(2)  # Adjust the sleep time if necessary
            
            # Take the screenshot
            screenshot_base64 = driver.get_screenshot_as_base64()
            with open(file_name, "wb") as f:
                f.write(base64.b64decode(screenshot_base64))
            
            driver.quit()
            return

        except WebDriverException as e:
            print(f"Error occurred: {e}")
            retries -= 1
            driver.quit()
            if retries == 0:
                print("Max retries reached. Unable to take screenshot.")
                raise

def compare_text(text1, text2):
    return calculate_text_similarity(text1, text2)

def calculate_image_similarity(image1_path, image2_path):
    try:
        result = subprocess.run(['magick', 'compare', '-metric', 'RMSE', image1_path, image2_path, 'null:'], capture_output=True, text=True)
        output = result.stderr.strip() if result.stderr else result.stdout.strip()
        
        if "Error:" in output:
            print(output)
            similarity_score = 0.0
            normalized_similarity_score = 0.0
        else:
            similarity_score = float(output.split()[0])
            normalized_similarity_score = float(output.split()[1][1:-1])  # Extracting normalized RMSE
        
    except Exception as e:
        print(f"Error calculating image similarity: {e}")
        similarity_score = 0.0
        normalized_similarity_score = 0.0
    
    return similarity_score, normalized_similarity_score
# Check for Domain based analysis.
def main():
    target_url = 'https://crsorgi.gov.in'
    newly_registered_url = 'https://crsgovorgi.in'
    # Paths to the screenshots
    screenshot1_path = r'screenshot1.png'
    screenshot2_path = r'screenshot2.png'


    take_full_page_screenshot(target_url, screenshot1_path)
    take_full_page_screenshot(newly_registered_url, screenshot2_path)

    # Calculate perceptual hashes of the screenshots
    hash1 = calculate_image_hash(screenshot1_path)
    hash2 = calculate_image_hash(screenshot2_path)

    # Compare image similarity
    image_similarity, normalized_image_similarity = calculate_image_similarity(screenshot1_path, screenshot2_path)
    print(f'Image Similarity Score (RMSE): {image_similarity:.2f}')
    print(f'Image Similarity Score (Normalized RMSE): {normalized_image_similarity:.2f}')


if __name__ == '__main__':
    main()


Image Similarity Score (RMSE): 15884.00
Image Similarity Score (Normalized RMSE): 0.24
