In [2]:
import requests


def get_url(url: str, timeout: float = 10.0) -> str | None:
    """Fetch ``url`` and return the response body as text.

    Redirects are followed. The HTTP status code is not checked, so the body
    of an error page is returned like any other body. The raw bytes are
    decoded with the default codec and undecodable bytes are dropped instead
    of raising.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests`` can wait indefinitely, so a single
            unresponsive host would stall every caller that loops over sites.

    Returns:
        The decoded body, or ``None`` if anything went wrong (the error is
        printed, not raised).
    """
    try:
        response = requests.get(url, allow_redirects=True, timeout=timeout)
        return response.content.decode(errors="ignore")
    except Exception as e:
        # Deliberately best-effort: callers treat None as "no content".
        print(f"Error fetching content from {url}: {e}")
        return None


# Demo: fetch the Google homepage and print the decoded HTML (None on failure).
print(get_url("https://www.google.com"))

<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="en-IN"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"><title>Google</title><script nonce="6Gbpg0RCfAOU7etUGt1dUg">(function(){var _g={kEI:'zGF_ZsTkO_eq4-EP8am00AQ',kEXPI:'0,3700270,679,432,5,23,38,5,448460,90133,2872,2891,11754,61296,16105,18161,45421,43893,56384,2,16737,23024,6699,41946,54824,2913,2,2,1,23827,10960,23351,22435,9779,62657,33565,39614,3030,15816,1804,7734,25,13252,14258,13448,40382,4565,11412,5211785,742,148,622,5991809,2839759,16,527,242,3,18,3,1,51,1,46,27998141,43886,3,318,4,1281,3,2124363,23029351,4117,2752,1294,10336,2708,8028,8639,22904,5121,24916,11954,10511,2370,6407,2863,10983,10474,2478,2211,7982,201,5928,14786,24576,2299,1840,1950,7321,814,7766,3821,3006,1792,1954,155,2,2482,13503,6451,1285,6598,2,2539,680,4,285,539,3092,207,121,2668,549,4,3004,273,3110,3377,800,4,3

In [3]:
from bs4 import BeautifulSoup


def extract_content(response: str) -> str | None:
    if not response:
        return None
    soup = BeautifulSoup(response, 'html.parser')
    text_content = ' '.join([element.get_text() for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'a'])])
    return text_content


# Demo: print the text extracted from the Google sign-in page.
print(extract_content(get_url("https://accounts.google.com")))

Sign in Use your Google Account  Forgot email? Forgot email? Not your computer? Use a private browsing window to sign in. Learn more about using Guest mode Learn more about using Guest mode Create account Help Privacy Terms


In [4]:
import tldextract


def get_main_domain(url: str) -> str:
    """Return the ``domain.suffix`` part of ``url`` with any subdomain removed.

    Args:
        url: URL or bare host name to analyse.

    Returns:
        ``"<domain>.<suffix>"`` (e.g. ``"google.com"`` for
        ``"https://www.google.com"``). When ``tldextract`` finds no public
        suffix (IP addresses, ``localhost`` and similar), only the domain part
        is returned, so the result never ends in a dangling ``"."``.
    """
    ext = tldextract.extract(url)
    if not ext.suffix:
        # No recognised suffix: formatting "domain." would produce a key that
        # can never match a stored domain.
        return ext.domain
    return f"{ext.domain}.{ext.suffix}"


# Demo: print the domain.suffix part of a URL that carries a `www` subdomain.
print(get_main_domain("https://www.google.com"))

google.com


In [5]:
def classify_features(url: str, content: str):
    """Bundle a page's URL, main domain and extracted text into one record.

    Args:
        url: Address the content was taken from.
        content: Text associated with that address.

    Returns:
        A dict with the keys ``"url"``, ``"domain"`` (computed with
        ``get_main_domain``) and ``"content"``, in that order.
    """
    return dict(url=url, domain=get_main_domain(url), content=content)


# Demo: build and print the feature record for the Google homepage.
print(classify_features("https://www.google.com", extract_content(get_url("https://www.google.com"))))

{'url': 'https://www.google.com', 'domain': 'google.com', 'content': 'Images Maps Play YouTube News Gmail Drive More » Web History Settings Sign in Advanced search हिन्दी বাংলা తెలుగు मराठी தமிழ் ગુજરાતી ಕನ್ನಡ മലയാളം ਪੰਜਾਬੀ Advertising Business Solutions About Google Google.co.in © 2024 - Privacy - Terms Privacy Terms'}


In [6]:
import time


def get_features_from_site(real_sites: list[str]) -> list[dict[str, str]]:
    """Download each site and build a feature record for those that yield text.

    For every URL a progress line is printed: the URL, ``Done`` or ``Failed``,
    and the elapsed time in seconds.

    Args:
        real_sites: URLs to fetch.

    Returns:
        One feature dict (see ``classify_features``) per site whose extracted
        content was non-empty, in input order. Sites that could not be
        fetched or produced no text are skipped.
    """
    real_sites_features = []
    for site in real_sites:
        # Flush so the prefix is visible while the (possibly slow) fetch runs.
        print(f"Getting feature from {site}...", end=' ', flush=True)
        # perf_counter is monotonic; wall-clock time can jump mid-measurement.
        started = time.perf_counter()
        content = extract_content(get_url(site))

        if content:
            real_sites_features.append(classify_features(site, content))
            print("Done", end=' ')
        else:
            print("Failed", end=' ')

        elapsed = round(time.perf_counter() - started, 4)
        print(f"[{elapsed} secs]")

    return real_sites_features


# Demo: collect features for two sites (progress and timing are printed per site),
# then print the resulting list of records.
print(get_features_from_site(["https://www.google.com", "https://www.facebook.com"]))

Getting feature from https://www.google.com... Done [0.4465 secs]
Getting feature from https://www.facebook.com... Done [0.54 secs]
[{'url': 'https://www.google.com', 'domain': 'google.com', 'content': 'Images Maps Play YouTube News Gmail Drive More » Web History Settings Sign in Advanced search हिन्दी বাংলা తెలుగు मराठी தமிழ் ગુજરાતી ಕನ್ನಡ മലയാളം ਪੰਜਾਬੀ Advertising Business Solutions About Google Google.co.in © 2024 - Privacy - Terms Privacy Terms'}, {'url': 'https://www.facebook.com', 'domain': 'facebook.com', 'content': 'Facebook আপনাকে লোকেদের সাথে সংযুক্ত হতে ও নানা বিষয় শেয়ার করতে সাহায্য করে।  পাসওয়ার্ড ভুলে গেছেন? নতুন অ্যাকাউন্ট তৈরি করুন একটি পেজ তৈরি করুন हिन्दी اردو नेपाली ଓଡ଼ିଆ English (UK) Español Português (Brasil) Français (France) Deutsch Italiano  সাইন আপ করুন লগ ইন করুন Messenger Facebook Lite ভিডিও স্থান গেমগুলি Marketplace Meta Pay Meta স্টোর Meta Quest Meta AI Instagram Threads অনুদান সংগ্রহ পরিষেবা ভোটিং তথ্য কেন্দ্র প্রাইভেসি পলিসি প্রাইভেসি সেন্টার গ্রুপ সম্

In [7]:
import pandas
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib


def add_features_to_model(features: list[dict[str, int | str]]):
    try:
        existing_data = pandas.read_pickle('../data/real_sites_features_probability.pkl')
        print("Existing data loaded successfully.")
    except FileNotFoundError:
        print("File not found. Initializing empty DataFrame.")
        existing_data = pandas.DataFrame()

    new_data = pandas.DataFrame(features)
    combined_data = pandas.concat([existing_data, new_data], ignore_index=True)

    print(f"Combined data shape: {combined_data.shape}")
    print(f"Combined data head: {combined_data.head()}")

    vectorizer = TfidfVectorizer(stop_words='english')
    real_sites_tfidf = vectorizer.fit_transform(combined_data['content'])

    print(f"TF-IDF shape: {real_sites_tfidf.shape}")

    joblib.dump(vectorizer, '../data/tfidf_vectorizer_real_site_probability.pkl')
    combined_data.to_pickle('../data/real_sites_features_probability.pkl')
    print("Data Updated Successfully")


# Demo: scrape two sites and merge their records into the dataset stored under ../data/.
add_features_to_model(get_features_from_site(["https://www.google.com", "https://www.facebook.com"]))

Getting feature from https://www.google.com... Done [0.4377 secs]
Getting feature from https://www.facebook.com... Done [0.5528 secs]
Existing data loaded successfully.
Combined data shape: (4, 3)
Combined data head:                         url        domain  \
0    https://www.google.com    google.com   
1  https://www.facebook.com  facebook.com   
2    https://www.google.com    google.com   
3  https://www.facebook.com  facebook.com   

                                             content  
0  Images Maps Play YouTube News Gmail Drive More...  
1  Facebook আপনাকে লোকেদের সাথে সংযুক্ত হতে ও নান...  
2  Images Maps Play YouTube News Gmail Drive More...  
3  Facebook আপনাকে লোকেদের সাথে সংযুক্ত হতে ও নান...  
TF-IDF shape: (4, 79)
Data Updated Successfully


In [8]:
from sklearn.metrics.pairwise import cosine_similarity


def check_similarity(url: str, content: str, threshold: float = 0.7) -> dict[str, str | float]:
    """Judge whether a page imitates one of the stored real sites.

    The persisted TF-IDF vectorizer and real-site DataFrame are loaded from
    ``../data/``. If the URL's main domain is already among the stored
    domains the page is reported as real straight away. Otherwise the page
    text is compared with every stored site by cosine similarity of TF-IDF
    vectors and the closest stored site is reported.

    Args:
        url: Address of the page being checked.
        content: Text extracted from that page.
        threshold: Similarity above which the page counts as a copy of the
            closest stored site.

    Returns:
        A dict with ``"status"``, ``"real_domain"``, ``"real_url"`` and
        ``"probability"``. ``status`` is ``"real"`` when the domain is known
        (probability ``1.0``), ``"fake"`` when the best similarity exceeds
        ``threshold``, and ``"uncertain"`` otherwise. For the last two,
        ``probability`` is the best similarity and the ``real_*`` fields
        describe the closest stored site.
    """
    vectorizer = joblib.load('../data/tfidf_vectorizer_real_site_probability.pkl')
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that were written by this notebook.
    real_sites_df = pandas.read_pickle('../data/real_sites_features_probability.pkl')

    visiting_domain = get_main_domain(url)

    # If the visiting domain is already in the real sites, it is 100% real
    if visiting_domain in real_sites_df['domain'].values:
        return {"status": "real", "real_domain": visiting_domain, "real_url": url,
                "probability": 1.0}

    # The loaded vectorizer is already fitted; reuse it instead of refitting.
    real_sites_tfidf = vectorizer.transform(real_sites_df['content'])
    tfidf_features = vectorizer.transform([content])

    print(f"TF-IDF features shape: {tfidf_features.shape}")

    similarities = cosine_similarity(tfidf_features, real_sites_tfidf).flatten()
    max_similarity_index = similarities.argmax()
    # Plain float keeps the returned dict free of numpy scalar types.
    max_similarity = float(similarities[max_similarity_index])

    print(f"Similarities: {similarities}")
    print(f"Max similarity index: {max_similarity_index}")
    print(f"Max similarity: {max_similarity}")

    real_site = real_sites_df.iloc[max_similarity_index]
    real_site_domain = real_site["domain"]
    real_site_url = real_site["url"]

    print(f"Real site URL: {real_site_url}")
    print(f"Provided URL: {url}")

    # The visiting domain is not among the stored domains here (see the early
    # return), so a close match always belongs to a different domain.
    status = "fake" if max_similarity > threshold else "uncertain"
    return {"status": status, "real_domain": real_site_domain, "real_url": real_site_url,
            "probability": max_similarity}
    
# Demo: classify the Google homepage against the stored real-site data.
print(check_similarity("https://www.google.com", extract_content(get_url("https://www.google.com"))))

{'status': 'real', 'real_domain': 'google.com', 'real_url': 'https://www.google.com', 'probability': 1.0}


In [9]:
def check_similarity_using_url(url) -> dict[str, str | None]:
    """Fetch ``url``, extract its text and run the similarity check on it.

    Args:
        url: Address of the page to check.

    Returns:
        The result of ``check_similarity`` when text could be extracted.
        Otherwise a message is printed and an error dict is returned with
        ``"status": "error"``, the other result fields set to ``None`` and an
        ``"error"`` message.
    """
    page_text = extract_content(get_url(url))

    if not page_text:
        print(f"Could not fetch content from {url}")
        failure = {"status": "error", "real_domain": None, "real_url": None, "probability": None,
                   "error": "Could not fetch content"}
        return failure

    return check_similarity(url, page_text)
    
# Demo: run the check starting from just a URL (the page is fetched first).
print(check_similarity_using_url("https://www.google.com"))

{'status': 'real', 'real_domain': 'google.com', 'real_url': 'https://www.google.com', 'probability': 1.0}


In [10]:
def check_url_list(url: list[str]):
    """Check every URL in ``url`` and print one result line per URL.

    Args:
        url: URLs to check, processed in order.
    """
    for candidate in url:
        outcome = check_similarity_using_url(candidate)
        print(f"Result for {candidate}: {outcome}")
        
# Demo: check several URLs in one go; each result is printed on its own line.
check_url_list(["https://www.google.com", "https://www.facebook.com"])

Result for https://www.google.com: {'status': 'real', 'real_domain': 'google.com', 'real_url': 'https://www.google.com', 'probability': 1.0}
Result for https://www.facebook.com: {'status': 'real', 'real_domain': 'facebook.com', 'real_url': 'https://www.facebook.com', 'probability': 1.0}
