<a href="https://colab.research.google.com/github/Pranesh-VM/Phishing_website_detection_CIP/blob/main/CIP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers torch scikit-learn networkx nltk


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [6]:
import re
import networkx as nx
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from transformers import DistilBertTokenizer, DistilBertModel

In [2]:
def tokenize_url(url):
    """Split a URL into lowercase tokens.

    The URL is lowercased, every ``http://`` / ``https://`` occurrence and
    every ``www.`` label prefix is removed, and the remainder is split on the
    delimiters ``. / - _ ? = &``.

    Args:
        url: The URL string to tokenize.

    Returns:
        A list of non-empty token strings, in order of appearance.
    """
    url = url.lower()
    # The lookbehind stops "www." from matching in the middle of a longer
    # alphanumeric label: "awww.example.com" must keep the token "awww"
    # instead of being mangled into "aexample".
    url = re.sub(r"https?://|(?<![^\W_])www\.", "", url)
    tokens = re.split(r'[\./\-_?=&]', url)  # Split by common delimiters
    return [token for token in tokens if token]  # Drop empty tokens


In [3]:
def extract_text_rank_keywords(tokens, window_size=3, top_k=5):
    """Rank tokens with TextRank (PageRank over a co-occurrence graph).

    Each token is a node. Two tokens are linked when they appear within the
    same sliding window, and the edge weight counts how often that happens.

    Args:
        tokens: Sequence of token strings (e.g. from ``tokenize_url``).
        window_size: Number of consecutive tokens that form one window;
            a token is linked to the ``window_size - 1`` tokens after it.
        top_k: Maximum number of keywords to return.

    Returns:
        Up to ``top_k`` tokens ordered by descending PageRank score.
        An empty input yields an empty list.
    """
    graph = nx.Graph()
    # Register every token as a node up front so that a single-token input
    # (which produces no edges) is still ranked instead of being dropped.
    graph.add_nodes_from(tokens)
    for i in range(len(tokens)):
        for j in range(i + 1, min(i + window_size, len(tokens))):
            if graph.has_edge(tokens[i], tokens[j]):
                graph[tokens[i]][tokens[j]]["weight"] += 1  # Seen again: strengthen the link
            else:
                graph.add_edge(tokens[i], tokens[j], weight=1)  # First co-occurrence

    if graph.number_of_nodes() == 0:
        return []

    scores = nx.pagerank(graph, weight="weight")  # Compute TextRank
    ranked_keywords = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [word for word, _ in ranked_keywords[:top_k]]

In [4]:
def extract_statistical_features(url):
    """Compute simple lexical statistics for a URL.

    Args:
        url: The URL string to analyse.

    Returns:
        A dict with the keys:
            url_length: Number of characters in ``url``.
            domain_length: Length of the authority part (everything between
                the leading scheme / ``www.`` and the first ``/``, ``?`` or
                ``#``).
            num_special_chars: Count of ``- _ ? = & .`` characters in ``url``.
            num_subdomains: Count of ``.`` characters in the whole URL.
            has_ip_address: 1 if the host is a dotted-quad number, else 0.
    """
    # Strip only a *leading* scheme and "www." (case-insensitively); removing
    # "www." anywhere would corrupt hosts such as "awww.example.com".
    stripped = re.sub(r"^(?:https?://)?(?:www\.)?", "", url, flags=re.IGNORECASE)
    # The authority ends at the first path, query or fragment delimiter.
    domain = re.split(r"[/?#]", stripped, maxsplit=1)[0]
    # Drop optional "user@" and ":port" so only the bare host is tested for an IP.
    host = domain.rsplit("@", 1)[-1].split(":", 1)[0]
    return {
        "url_length": len(url),
        "domain_length": len(domain),
        "num_special_chars": len(re.findall(r'[-_?=&.]', url)),
        # NOTE(review): this counts every dot in the URL (path and file
        # extension included), not just host labels. Kept unchanged so
        # existing feature values stay comparable.
        "num_subdomains": url.count("."),
        # Match against the host rather than the raw URL: the raw URL usually
        # starts with "http://", so a start-anchored match could never succeed.
        "has_ip_address": 1 if re.fullmatch(r"\d+\.\d+\.\d+\.\d+", host) else 0,
    }


In [7]:
# The tokenizer and the encoder must come from the same pretrained checkpoint,
# so the checkpoint name is spelled out once and shared by both loaders.
MODEL_NAME = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
bert_model = DistilBertModel.from_pretrained(MODEL_NAME)

def get_bert_embedding(tokens):
    """Embed a token list as one vector using the module-level DistilBERT model.

    The tokens are joined with spaces into a single string, encoded with the
    module-level ``tokenizer`` (truncated to at most 30 positions) and run
    through ``bert_model`` without gradient tracking. The last hidden states
    are averaged over the sequence dimension.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A NumPy array holding the mean-pooled last hidden state.
    """
    sentence = " ".join(tokens)
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=30,
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)  # Average over the sequence positions
    return pooled.squeeze().numpy()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [8]:
# Demo: run the full feature-extraction pipeline on one sample URL.
url = "https://secure-login.paypal.com/verify_account.html"

# Token-level view of the URL; the keyword and embedding steps consume it.
tokens = tokenize_url(url)

# Three independent feature families derived from the URL and its tokens.
stat_features = extract_statistical_features(url)
top_keywords = extract_text_rank_keywords(tokens)
bert_features = get_bert_embedding(tokens)

# Report each result on its own line, as "<label> <value>".
for label, value in (
    ("Tokens:", tokens),
    ("Statistical Features:", stat_features),
    ("Top Ranked Keywords (TextRank):", top_keywords),
    ("BERT Feature Vector Shape:", bert_features.shape),
):
    print(label, value)


Tokens: ['secure', 'login', 'paypal', 'com', 'verify', 'account', 'html']
Statistical Features: {'url_length': 51, 'domain_length': 23, 'num_special_chars': 5, 'num_subdomains': 3, 'has_ip_address': 0}
Top Ranked Keywords (TextRank): ['paypal', 'verify', 'com', 'login', 'account']
BERT Feature Vector Shape: (768,)
