In [4]:
import numpy as np
import pandas as pd
from difflib import SequenceMatcher

def isSimilar(sitename, similarity_threshold, whitelist_path="./domain-names-tmp.csv", max_results=10):
    """Print the whitelisted domains that most closely resemble ``sitename``.

    The whitelist is a CSV file with a ``Domain`` column. If ``sitename`` is
    itself listed, a "<sitename> is present" line is printed and nothing else
    happens. Otherwise every whitelisted domain is scored against ``sitename``
    with ``difflib.SequenceMatcher(...).ratio()`` and the best matches whose
    score is at least ``similarity_threshold`` are printed, highest first.

    Parameters
    ----------
    sitename : str
        Domain name to look up.
    similarity_threshold : float
        Minimum ratio (inclusive) a whitelisted domain must reach to be printed.
    whitelist_path : str, optional
        Path of the whitelist CSV. Defaults to the file the notebook ships with.
    max_results : int, optional
        Maximum number of rows to print. Defaults to 10.

    Returns
    -------
    None
        Results are printed, one ``[domain ratio]`` row per line.
    """
    # Read whitelist. A blank "Domain" cell is loaded as NaN (a float), which
    # would make SequenceMatcher raise TypeError, so such rows are dropped.
    whitelist = pd.read_csv(whitelist_path)
    domains = whitelist["Domain"].dropna().astype(str).str.strip().tolist()

    # If legit, exit; if not, continue
    if sitename in domains:
        print(sitename + " is present")
        return

    # 2-D object array: column 0 = domain name, column 1 = similarity ratio.
    # Kept as a NumPy object array so each printed row keeps its
    # "['name' ratio]" format.
    scored = np.zeros((len(domains), 2), dtype='object')
    for i, domain in enumerate(domains):
        scored[i, 0] = domain
        scored[i, 1] = SequenceMatcher(None, domain, sitename).ratio()

    # Filter by threshold, then sort by similarity (stable, so ties keep
    # whitelist order).
    matches = [row for row in scored if row[1] >= similarity_threshold]
    matches.sort(key=lambda row: row[1], reverse=True)

    # Print the best matches that meet the threshold
    for row in matches[:max_results]:
        print(row)

# Example usage
isSimilar("sbi.co.in", 0.5) # Replace 0.5 with your desired threshold


['s-bi.co.in' 0.9473684210526315]
['bgmi.co.in' 0.8421052631578947]
['lsip.co.in' 0.8421052631578947]
['sgoi.co.in' 0.8421052631578947]
['simr.co.in' 0.8421052631578947]
['slbs.co.in' 0.8421052631578947]
['subh.co.in' 0.8421052631578947]
['yusi.co.in' 0.8421052631578947]
['abchi.co.in' 0.8]
['bachi.co.in' 0.8]


In [5]:
import tldextract

def isSimilar(sitename, similarity_threshold, whitelist_path="./domain-names-tmp.csv", max_results=10):
    """Print whitelisted look-alikes of ``sitename``, favouring shared name labels.

    Whitelisted domains whose ``tldextract`` domain label contains the domain
    label of ``sitename`` are ranked first. If they yield fewer than
    ``max_results`` rows, the list is padded with the best-scoring remaining
    domains. In both groups a domain must reach ``similarity_threshold`` on
    ``difflib.SequenceMatcher(...).ratio()`` and rows are ordered by that
    ratio, highest first.

    Parameters
    ----------
    sitename : str
        Domain name to look up.
    similarity_threshold : float
        Minimum ratio (inclusive) a whitelisted domain must reach to be printed.
    whitelist_path : str, optional
        Path of the whitelist CSV (needs a ``Domain`` column).
    max_results : int, optional
        Maximum number of rows to print. Defaults to 10.

    Returns
    -------
    None
        Results are printed, one ``[domain ratio]`` row per line.
    """
    # Extract the main part of the domain
    main_part = tldextract.extract(sitename).domain

    # Read whitelist; blank "Domain" cells (NaN) are dropped because they
    # cannot be compared as strings.
    whitelist = pd.read_csv(whitelist_path)
    domains = whitelist["Domain"].dropna().astype(str).str.strip().tolist()

    # Split the whitelist once: domains containing the main part vs. the rest.
    # Keeping the two groups disjoint guarantees the padding step below can
    # never repeat a domain that is already in the preferred results.
    preferred, others = [], []
    for domain in domains:
        bucket = preferred if main_part in tldextract.extract(domain).domain else others
        bucket.append(domain)

    def rank(candidates):
        # Score candidates against sitename, keep those meeting the threshold,
        # best first. Rows stay NumPy object arrays so they print as
        # "['name' ratio]".
        scored = np.zeros((len(candidates), 2), dtype='object')
        for i, domain in enumerate(candidates):
            scored[i, 0] = domain
            scored[i, 1] = SequenceMatcher(None, domain, sitename).ratio()
        kept = [row for row in scored if row[1] >= similarity_threshold]
        kept.sort(key=lambda row: row[1], reverse=True)
        return kept

    filtered_results = rank(preferred)

    # If too few results, pad with the best of the remaining domains. The rest
    # of the whitelist is scored only when padding is actually needed.
    if len(filtered_results) < max_results:
        additional_results = rank(others)
        filtered_results.extend(additional_results[:max_results - len(filtered_results)])

    # Print the websites that meet the threshold
    for row in filtered_results[:max_results]:
        print(row)

# Example usage: print whitelist entries resembling "sbi.co.in"
isSimilar("sbi.co.in", 0.5)  # Replace 0.5 with your desired threshold


['sbi-card.in' 0.7]
['sbiifsccode.co.in' 0.6923076923076923]
['5sbiz.com' 0.6666666666666666]
['sbicards.org.in' 0.6666666666666666]
['zsbiy.com' 0.6666666666666666]
['icdsbih.in' 0.631578947368421]
['ksbiju.com' 0.631578947368421]
['pasbiz.com' 0.631578947368421]
['sbi823.com' 0.631578947368421]
['zj-sbi.com' 0.631578947368421]


In [6]:
import numpy as np
import pandas as pd
from difflib import SequenceMatcher

def isSimilar(sitename, file_path="domain-names-tmp.txt", similarity_threshold=0.57):
    """Report whether ``sitename`` is whitelisted and, if not, print look-alikes.

    The whitelist is a plain-text file with one domain per line. If
    ``sitename`` is listed, "<sitename> is present" is printed and the
    function returns. Otherwise "absent" is printed, followed by every
    whitelisted domain whose ``difflib.SequenceMatcher(...).ratio()`` against
    ``sitename`` is strictly greater than ``similarity_threshold``, highest
    ratio first.

    Parameters
    ----------
    sitename : str
        Domain name to look up.
    file_path : str, optional
        Path of the whitelist text file.
    similarity_threshold : float, optional
        Exclusive lower bound on the ratio of printed rows. Defaults to 0.57.

    Returns
    -------
    None
        Results are printed, one ``[domain, ratio]`` list per line.
    """
    # Read the whitelist, one domain per line; blank lines carry no domain.
    with open(file_path, "r") as file:
        stripped_lines = [line.strip() for line in file]
    domains = [domain for domain in stripped_lines if domain]

    # If legit, stop here. The membership test must run against the plain
    # domain strings (not [name, score] pairs), and it returns instead of
    # calling exit() so the notebook kernel keeps running.
    if sitename in domains:
        print(sitename + " is present")
        return
    print("absent")

    # Score every whitelisted domain against the input name.
    scored = [[domain, SequenceMatcher(None, domain, sitename).ratio()] for domain in domains]

    # Keep close matches, then order by similarity (stable sort, so ties keep
    # file order).
    close_matches = [row for row in scored if row[1] > similarity_threshold]
    close_matches.sort(key=lambda row: row[1], reverse=True)

    for row in close_matches:
        print(row)

# Demo: look up two sample site names; each call prints its own result lines
# under the heading printed just before it.
print("the similar website to irctc can be:")
isSimilar("irctc.gov.in")
print()  # blank line between the two reports
print("the similar website to icici can be:")
isSimilar("icicibank.co.in")

the similar website to irctc can be:
absent
['ircc.co.in', 0.8181818181818182]
['irct.co.in', 0.8181818181818182]
['icnt.co.in', 0.7272727272727273]
['directspaces.co.in', 0.6666666666666666]
['drtalc.co.in', 0.6666666666666666]
['fitgro.in', 0.6666666666666666]
['fitgro.in', 0.6666666666666666]
['rtf.co.in', 0.6666666666666666]
['circleinc.org.in', 0.6428571428571429]
['firstcheck.co.in', 0.6428571428571429]
['pcbcircuit.co.in', 0.6428571428571429]
['aircubs.co.in', 0.64]
['amirati.co.in', 0.64]
['girlygrove.in', 0.64]
['thirty3.co.in', 0.64]
['tictic.online', 0.64]
['bacc.co.in', 0.6363636363636364]
['brec.co.in', 0.6363636363636364]
['cultcon.in', 0.6363636363636364]
['direct.ink', 0.6363636363636364]
['hsrt.co.in', 0.6363636363636364]
['ibts.co.in', 0.6363636363636364]
['iraz.co.in', 0.6363636363636364]
['irect.info', 0.6363636363636364]
['iten.co.in', 0.6363636363636364]
['itim.co.in', 0.6363636363636364]
['recreto.in', 0.6363636363636364]
['rtha.co.in', 0.6363636363636364]
['simr

In [7]:
import requests
import favicon
from PIL import Image
from io import BytesIO
from skimage.metrics import structural_similarity as ssim
import numpy as np

# Compare the favicons of two sites: locate each favicon, download both,
# convert them to grayscale and report their structural similarity (SSIM).

# Seconds to wait for each favicon download; without a timeout, requests can
# block forever on an unresponsive host.
REQUEST_TIMEOUT_SECONDS = 10

try:
    print("Fetching favicons from the first URL...")
    icons1 = favicon.get('https://crsorgi-govi.com/web/index.php/auth/index')
    if not icons1:
        raise ValueError("No favicons found for the first URL")
    icon1 = icons1[0]
    print(f"Favicon URL 1: {icon1.url}")

    print("Fetching favicons from the second URL...")
    icons2 = favicon.get('https://crsorgisgovt.com/web/index.php/auth/login.php')
    if not icons2:
        raise ValueError("No favicons found for the second URL")
    icon2 = icons2[0]
    print(f"Favicon URL 2: {icon2.url}")

    print("Downloading first favicon...")
    response1 = requests.get(icon1.url, timeout=REQUEST_TIMEOUT_SECONDS)
    response1.raise_for_status()  # Check if the request was successful
    favicon_data1 = response1.content  # Store the favicon data in memory

    print("First favicon downloaded successfully")

    print("Downloading second favicon...")
    response2 = requests.get(icon2.url, timeout=REQUEST_TIMEOUT_SECONDS)
    response2.raise_for_status()  # Check if the request was successful
    favicon_data2 = response2.content  # Store the favicon data in memory

    print("Second favicon downloaded successfully")

    # Load images using PIL and convert to 8-bit grayscale
    image1 = Image.open(BytesIO(favicon_data1)).convert('L')
    image2 = Image.open(BytesIO(favicon_data2)).convert('L')

    # Bring the second image to the first one's dimensions by resampling it.
    # (np.resize must not be used for this: it only repeats/truncates the
    # flattened pixel buffer, which scrambles the picture.)
    if image2.size != image1.size:
        image2 = image2.resize(image1.size)

    # Convert images to numpy arrays
    image1_np = np.array(image1)
    image2_np = np.array(image2)

    # Calculate SSIM between the two images
    similarity_index = ssim(image1_np, image2_np)

    print(f"The similarity index between the two favicons is: {similarity_index * 100:.2f}%")

except Exception as e:
    # Report any lookup, download, decoding or comparison failure instead of
    # aborting the notebook run.
    print(f"An error occurred: {e}")

Fetching favicons from the first URL...
Favicon URL 1: https://crsorgi-govi.com/web/images/favicon.ico
Fetching favicons from the second URL...
Favicon URL 2: https://crsorgisgovt.com/web/images/favicon.ico
Downloading first favicon...
First favicon downloaded successfully
Downloading second favicon...
Second favicon downloaded successfully
The similarity index between the two favicons is: 100.00%


In [8]:
import base64
import requests
import zipfile
import os
from datetime import datetime

def download_and_extract_csv(start_date, timeout=30):
    """Download and unpack the whoisds newly-registered-domains archive for a day.

    The archive name ``YYYY-MM-DD.zip`` is base64-encoded into the download
    URL. On HTTP 200 the response body is saved under that name in the current
    directory and extracted there; any other status is reported with a
    message and nothing is written.

    Parameters
    ----------
    start_date : datetime.datetime or datetime.date
        Day whose archive should be fetched (formatted with ``strftime``).
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up, so an
        unresponsive host cannot hang the notebook. Defaults to 30.

    Returns
    -------
    None
        Progress and failures are reported via ``print``.
    """
    str_start_date = start_date.strftime("%Y-%m-%d")
    print("Trying to download " + str_start_date + " data")
    date_zip = str_start_date + ".zip"

    # The site addresses each archive by the base64 encoding of its file name.
    encoded_name = base64.b64encode(date_zip.encode("utf-8")).decode("utf-8")
    url = "https://www.whoisds.com/whois-database/newly-registered-domains/" + encoded_name + "/nrd"

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to download data. Status code: {response.status_code}")
        return

    zip_filename = date_zip
    with open(zip_filename, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded {zip_filename}")

    # Extract into the current working directory.
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall()
    print(f"Extracted {zip_filename}")

# Entry point: runs when this code is executed as the main module and fetches
# the archive for 29 May 2024.
if __name__ == "__main__":
    start_date = datetime(2024, 5, 29)
    download_and_extract_csv(start_date)

Trying to download 2024-05-29 data
Downloaded 2024-05-29.zip
Extracted 2024-05-29.zip
