In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import numpy as np
from urllib.parse import urlparse
import re
import requests
from bs4 import BeautifulSoup

In [3]:
df_train = pd.read_parquet("Training.parquet")
df_test = pd.read_parquet("Testing.parquet")

In [4]:
funky_train = df_train[["url", "status"]]
funky_test = df_test[["url", "status"]]

1) Training Data 

In [5]:
#funky_train = funky_train[["url", "status"]]
funky_train

Unnamed: 0,url,status
0,https://www.todayshomeowner.com/how-to-make-ho...,legitimate
1,http://thapthan.ac.th/information/confirmation...,phishing
2,http://app.dialoginsight.com/T/OFC4/L2S/3888/B...,phishing
3,https://www.bedslide.com,legitimate
4,https://tabs.ultimate-guitar.com/s/sex_pistols...,legitimate
...,...,...
7653,https://snip.ly/www.netflix.com-signIn-account...,phishing
7654,http://webchat.freenode.net/,legitimate
7655,http://mr-statucki.com/wp-content/uploads/2009...,phishing
7656,https://www.computerhope.com/jargon/c/cdrom.htm,legitimate


In [6]:

#MAIN FUNCTION DEPENDANCIES
# def count_hyperlinks(url):
#     try:
#         response = requests.get(url)
#         if response.status_code == 200:
#             soup = BeautifulSoup(response.content, 'html.parser')
#             return len(soup.find_all('a'))
#     except Exception as e:
#         print(f"Error fetching URL {url}: {e}")
#     return 0

In [7]:
#building the function
#all binary classifications are 1=true/yes 0=false/no

def extract_features(funky_train):

    funky_train['url_length'] = funky_train['url'].apply(lambda x: len(x)) #grabs URL length

    funky_train['host_length'] = funky_train['url'].apply(lambda x: len(urlparse(x).hostname) if urlparse(x).hostname else 0) #grabs host url length
    
    #counting features of url
    funky_train['u_dots'] = funky_train['url'].apply(lambda x: x.count('.'))
    funky_train['u_hyphens'] = funky_train['url'].apply(lambda x: x.count('-'))
    funky_train['u_at'] = funky_train['url'].apply(lambda x: x.count('@'))
    funky_train['u_and'] = funky_train['url'].apply(lambda x: x.count('&'))
    funky_train['u_at'] = funky_train['url'].apply(lambda x: x.count('@'))
    funky_train['u_underscore'] = funky_train['url'].apply(lambda x: x.count('_'))
    funky_train['u_tilde'] = funky_train['url'].apply(lambda x: x.count('~'))
    funky_train['u_percent'] = funky_train['url'].apply(lambda x: x.count('%'))
    funky_train['u_slash'] = funky_train['url'].apply(lambda x: x.count('/'))
    funky_train['u_star'] = funky_train['url'].apply(lambda x: x.count('*'))
    funky_train['u_colon'] = funky_train['url'].apply(lambda x: x.count(':'))
    funky_train['u_comma'] = funky_train['url'].apply(lambda x: x.count(','))
    funky_train['u_semicolon'] = funky_train['url'].apply(lambda x: x.count(';'))
    funky_train['u_dollar'] = funky_train['url'].apply(lambda x: x.count('$'))
    funky_train['u_space'] = funky_train['url'].apply(lambda x: x.count(' '))
    funky_train['u_www'] = funky_train['url'].apply(lambda x: x.count('www.'))
    funky_train['u_com'] = funky_train['url'].apply(lambda x: x.count('.com'))
    funky_train['u_http'] = funky_train['url'].apply(lambda x: x.count('http:'))

    funky_train['number_count'] = funky_train['url'].apply(lambda x: sum(c.isdigit() for c in x))
    funky_train['numbers_to_length_ratio'] = (funky_train['number_count'] / funky_train['url_length']).round(2)

    funky_train['u_port'] = funky_train['url'].apply(lambda x: 1 if urlparse(x).port is not None else 0) #port in url binary

    # funky_train['tld_in_path'] = funky_train.apply(lambda row: urlparse(row['url']).netloc.split('.')[-1] in urlparse(row['url']).path, axis=1)
    # funky_train['tld_in_subdomain'] = funky_train.apply(lambda row: urlparse(row['url']).netloc.split('.')[-1] in urlparse(row['url']).hostname.split('.')[0] if urlparse(row['url']).hostname else '', axis=1)
    
    #see if tld is present in path or subdomain
    funky_train['tld_in_path'] = funky_train.apply(lambda row: 1 if urlparse(row['url']).netloc.split('.')[-1] in urlparse(row['url']).path else 0, axis=1)
    funky_train['tld_in_subdomain'] = funky_train.apply(lambda row: 1 if urlparse(row['url']).netloc.split('.')[-1] in urlparse(row['url']).hostname.split('.')[0] else 0 if urlparse(row['url']).hostname else 0, axis=1)
    
    #funky_train['nb_hyperlinks'] = funky_train['url'].apply(count_hyperlinks)

    funky_train['longest_word_host'] = funky_train['url'].apply(lambda x: max(len(word) for word in urlparse(x).hostname.split('.')))
    funky_train['longest_word_raw'] = funky_train['url'].apply(lambda x: max(len(word) for word in urlparse(x).netloc.split('.')))
    funky_train['shortest_word_host'] = funky_train['url'].apply(lambda x: min(len(word) for word in urlparse(x).hostname.split('.')))
    funky_train['shortest_word_raw'] = funky_train['url'].apply(lambda x: min(len(word) for word in urlparse(x).netloc.split('.')))
    
    #average word length in the host
    funky_train['avg_word_host'] = funky_train['url'].apply(lambda x: sum(len(word) for word in urlparse(x).hostname.split('.')) / len(urlparse(x).hostname.split('.'))).round(2)

    #punycode
    funky_train['is_punycode'] = funky_train['url'].apply(lambda x: 1 if 'xn--' in urlparse(x).netloc else 0)



    return funky_train

In [8]:
training = extract_features(funky_train)
training

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  funky_train['url_length'] = funky_train['url'].apply(lambda x: len(x)) #grabs URL length
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  funky_train['host_length'] = funky_train['url'].apply(lambda x: len(urlparse(x).hostname) if urlparse(x).hostname else 0) #grabs host url length
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retu

Unnamed: 0,url,status,url_length,host_length,u_dots,u_hyphens,u_at,u_and,u_underscore,u_tilde,...,numbers_to_length_ratio,u_port,tld_in_path,tld_in_subdomain,longest_word_host,longest_word_raw,shortest_word_host,shortest_word_raw,avg_word_host,is_punycode
0,https://www.todayshomeowner.com/how-to-make-ho...,legitimate,82,23,2,7,0,0,0,0,...,0.00,0,0,0,15,15,3,3,7.00,0
1,http://thapthan.ac.th/information/confirmation...,phishing,93,14,2,0,0,0,0,0,...,0.26,0,0,1,8,8,2,2,4.00,0
2,http://app.dialoginsight.com/T/OFC4/L2S/3888/B...,phishing,121,21,3,0,0,0,0,0,...,0.37,0,0,0,13,13,3,3,6.33,0
3,https://www.bedslide.com,legitimate,24,16,2,0,0,0,0,0,...,0.00,0,0,0,8,8,3,3,4.67,0
4,https://tabs.ultimate-guitar.com/s/sex_pistols...,legitimate,73,24,3,1,0,0,5,0,...,0.00,0,0,0,15,15,3,3,7.33,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7653,https://snip.ly/www.netflix.com-signIn-account...,phishing,63,7,3,2,0,0,0,0,...,0.03,0,0,0,4,4,2,2,3.00,0
7654,http://webchat.freenode.net/,legitimate,28,20,2,0,0,0,0,0,...,0.00,0,0,0,8,8,3,3,6.00,0
7655,http://mr-statucki.com/wp-content/uploads/2009...,phishing,67,15,2,2,0,0,0,0,...,0.06,0,0,0,11,11,3,3,7.00,0
7656,https://www.computerhope.com/jargon/c/cdrom.htm,legitimate,47,20,3,0,0,0,0,0,...,0.00,0,0,0,12,12,3,3,6.00,0


In [9]:
clean_training = training.copy()
clean_training['status'] = clean_training['status'].replace({"legitimate":1, "phishing":0})
clean_training = clean_training.drop(columns=['url'])

In [10]:
clean_training

Unnamed: 0,status,url_length,host_length,u_dots,u_hyphens,u_at,u_and,u_underscore,u_tilde,u_percent,...,numbers_to_length_ratio,u_port,tld_in_path,tld_in_subdomain,longest_word_host,longest_word_raw,shortest_word_host,shortest_word_raw,avg_word_host,is_punycode
0,1,82,23,2,7,0,0,0,0,0,...,0.00,0,0,0,15,15,3,3,7.00,0
1,0,93,14,2,0,0,0,0,0,0,...,0.26,0,0,1,8,8,2,2,4.00,0
2,0,121,21,3,0,0,0,0,0,0,...,0.37,0,0,0,13,13,3,3,6.33,0
3,1,24,16,2,0,0,0,0,0,0,...,0.00,0,0,0,8,8,3,3,4.67,0
4,1,73,24,3,1,0,0,5,0,0,...,0.00,0,0,0,15,15,3,3,7.33,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7653,0,63,7,3,2,0,0,0,0,0,...,0.03,0,0,0,4,4,2,2,3.00,0
7654,1,28,20,2,0,0,0,0,0,0,...,0.00,0,0,0,8,8,3,3,6.00,0
7655,0,67,15,2,2,0,0,0,0,0,...,0.06,0,0,0,11,11,3,3,7.00,0
7656,1,47,20,3,0,0,0,0,0,0,...,0.00,0,0,0,12,12,3,3,6.00,0


2) Testing data

In [11]:
#funky_test = funky_test[["url", "status"]]
funky_test

Unnamed: 0,url,status
0,https://clubedemilhagem.com/home.php,phishing
1,http://www.medicalnewstoday.com/articles/18893...,legitimate
2,https://en.wikipedia.org/wiki/NBC_Nightly_News,legitimate
3,http://secure.web894.com/customer_center/custo...,phishing
4,https://en.wikipedia.org/wiki/Transaction_proc...,legitimate
...,...,...
3767,http://www.sublimefrequencies.com/,legitimate
3768,http://koei.wikia.com/wiki/Dynasty_Warriors:_U...,legitimate
3769,https://www.motorzona.ru/,legitimate
3770,https://login.microsoftonline.com/aa687de1-52b...,legitimate


In [12]:
#building the function
#all binary classifications are 1=true/yes 0=false/no

def extract_features(funky_test):

    funky_test['url_length'] = funky_test['url'].apply(lambda x: len(x)) #grabs URL length

    funky_test['host_length'] = funky_test['url'].apply(lambda x: len(urlparse(x).hostname) if urlparse(x).hostname else 0) #grabs host url length
    
    #counting features of url
    funky_test['u_dots'] = funky_test['url'].apply(lambda x: x.count('.'))
    funky_test['u_hyphens'] = funky_test['url'].apply(lambda x: x.count('-'))
    funky_test['u_at'] = funky_test['url'].apply(lambda x: x.count('@'))
    funky_test['u_and'] = funky_test['url'].apply(lambda x: x.count('&'))
    funky_test['u_at'] = funky_test['url'].apply(lambda x: x.count('@'))
    funky_test['u_underscore'] = funky_test['url'].apply(lambda x: x.count('_'))
    funky_test['u_tilde'] = funky_test['url'].apply(lambda x: x.count('~'))
    funky_test['u_percent'] = funky_test['url'].apply(lambda x: x.count('%'))
    funky_test['u_slash'] = funky_test['url'].apply(lambda x: x.count('/'))
    funky_test['u_star'] = funky_test['url'].apply(lambda x: x.count('*'))
    funky_test['u_colon'] = funky_test['url'].apply(lambda x: x.count(':'))
    funky_test['u_comma'] = funky_test['url'].apply(lambda x: x.count(','))
    funky_test['u_semicolon'] = funky_test['url'].apply(lambda x: x.count(';'))
    funky_test['u_dollar'] = funky_test['url'].apply(lambda x: x.count('$'))
    funky_test['u_space'] = funky_test['url'].apply(lambda x: x.count(' '))
    funky_test['u_www'] = funky_test['url'].apply(lambda x: x.count('www.'))
    funky_test['u_com'] = funky_test['url'].apply(lambda x: x.count('.com'))
    funky_test['u_http'] = funky_test['url'].apply(lambda x: x.count('http:'))

    funky_test['number_count'] = funky_test['url'].apply(lambda x: sum(c.isdigit() for c in x))
    funky_test['numbers_to_length_ratio'] = (funky_test['number_count'] / funky_test['url_length']).round(2)

    funky_test['u_port'] = funky_test['url'].apply(lambda x: 1 if urlparse(x).port is not None else 0) #port in url binary

    # funky_test['tld_in_path'] = funky_test.apply(lambda row: urlparse(row['url']).netloc.split('.')[-1] in urlparse(row['url']).path, axis=1)
    # funky_test['tld_in_subdomain'] = funky_test.apply(lambda row: urlparse(row['url']).netloc.split('.')[-1] in urlparse(row['url']).hostname.split('.')[0] if urlparse(row['url']).hostname else '', axis=1)
    
    #see if tld is present in path or subdomain (tld is .com, .net, etc)
    funky_test['tld_in_path'] = funky_test.apply(lambda row: 1 if urlparse(row['url']).netloc.split('.')[-1] in urlparse(row['url']).path else 0, axis=1)
    funky_test['tld_in_subdomain'] = funky_test.apply(lambda row: 1 if urlparse(row['url']).netloc.split('.')[-1] in urlparse(row['url']).hostname.split('.')[0] else 0 if urlparse(row['url']).hostname else 0, axis=1)
    
    #funky_test['nb_hyperlinks'] = funky_test['url'].apply(count_hyperlinks)

    funky_test['longest_word_host'] = funky_test['url'].apply(lambda x: max(len(word) for word in urlparse(x).hostname.split('.')))
    funky_test['longest_word_raw'] = funky_test['url'].apply(lambda x: max(len(word) for word in urlparse(x).netloc.split('.')))
    funky_test['shortest_word_host'] = funky_test['url'].apply(lambda x: min(len(word) for word in urlparse(x).hostname.split('.')))
    funky_test['shortest_word_raw'] = funky_test['url'].apply(lambda x: min(len(word) for word in urlparse(x).netloc.split('.')))
    
    #average word length in the host
    funky_test['avg_word_host'] = funky_test['url'].apply(lambda x: sum(len(word) for word in urlparse(x).hostname.split('.')) / len(urlparse(x).hostname.split('.'))).round(2)

    #punycode
    funky_test['is_punycode'] = funky_test['url'].apply(lambda x: 1 if 'xn--' in urlparse(x).netloc else 0)



    return funky_test

In [13]:
testing = extract_features(funky_test)
testing

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  funky_test['url_length'] = funky_test['url'].apply(lambda x: len(x)) #grabs URL length
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  funky_test['host_length'] = funky_test['url'].apply(lambda x: len(urlparse(x).hostname) if urlparse(x).hostname else 0) #grabs host url length
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

Unnamed: 0,url,status,url_length,host_length,u_dots,u_hyphens,u_at,u_and,u_underscore,u_tilde,...,numbers_to_length_ratio,u_port,tld_in_path,tld_in_subdomain,longest_word_host,longest_word_raw,shortest_word_host,shortest_word_raw,avg_word_host,is_punycode
0,https://clubedemilhagem.com/home.php,phishing,36,19,2,0,0,0,0,0,...,0.00,0,0,0,15,15,3,3,9.00,0
1,http://www.medicalnewstoday.com/articles/18893...,legitimate,51,24,3,0,0,0,0,0,...,0.12,0,0,0,16,16,3,3,7.33,0
2,https://en.wikipedia.org/wiki/NBC_Nightly_News,legitimate,46,16,2,0,0,0,2,0,...,0.00,0,0,0,9,9,2,2,4.67,0
3,http://secure.web894.com/customer_center/custo...,phishing,185,17,2,1,0,2,2,0,...,0.25,0,0,0,6,6,3,3,5.00,0
4,https://en.wikipedia.org/wiki/Transaction_proc...,legitimate,52,16,2,0,0,0,1,0,...,0.00,0,0,0,9,9,2,2,4.67,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,http://www.sublimefrequencies.com/,legitimate,34,26,2,0,0,0,0,0,...,0.00,0,0,0,18,18,3,3,8.00,0
3768,http://koei.wikia.com/wiki/Dynasty_Warriors:_U...,legitimate,54,14,2,0,0,0,2,0,...,0.00,0,0,0,5,5,3,3,4.00,0
3769,https://www.motorzona.ru/,legitimate,25,16,2,0,0,0,0,0,...,0.00,0,0,0,9,9,2,2,4.67,0
3770,https://login.microsoftonline.com/aa687de1-52b...,legitimate,550,25,5,24,0,9,7,0,...,0.35,0,0,0,15,15,3,3,7.67,0


In [14]:
clean_testing = testing.copy()
clean_testing['status'] = clean_testing['status'].replace({"legitimate":1, "phishing":0})
clean_testing = clean_testing.drop(columns=['url'])

In [15]:
clean_testing

Unnamed: 0,status,url_length,host_length,u_dots,u_hyphens,u_at,u_and,u_underscore,u_tilde,u_percent,...,numbers_to_length_ratio,u_port,tld_in_path,tld_in_subdomain,longest_word_host,longest_word_raw,shortest_word_host,shortest_word_raw,avg_word_host,is_punycode
0,0,36,19,2,0,0,0,0,0,0,...,0.00,0,0,0,15,15,3,3,9.00,0
1,1,51,24,3,0,0,0,0,0,0,...,0.12,0,0,0,16,16,3,3,7.33,0
2,1,46,16,2,0,0,0,2,0,0,...,0.00,0,0,0,9,9,2,2,4.67,0
3,0,185,17,2,1,0,2,2,0,0,...,0.25,0,0,0,6,6,3,3,5.00,0
4,1,52,16,2,0,0,0,1,0,0,...,0.00,0,0,0,9,9,2,2,4.67,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,1,34,26,2,0,0,0,0,0,0,...,0.00,0,0,0,18,18,3,3,8.00,0
3768,1,54,14,2,0,0,0,2,0,0,...,0.00,0,0,0,5,5,3,3,4.00,0
3769,1,25,16,2,0,0,0,0,0,0,...,0.00,0,0,0,9,9,2,2,4.67,0
3770,1,550,25,5,24,0,9,7,0,5,...,0.35,0,0,0,15,15,3,3,7.67,0


3) Building the model

In [16]:
y = clean_training["status"].values
X = clean_training.drop(columns="status").values

yy = clean_testing["status"].values
XX = clean_testing.drop(columns="status").values

y_train = y
X_train = X

y_test = yy
X_test = XX

scaler = StandardScaler()

X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [17]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  80
hidden_nodes_layer2 = 30

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [18]:
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [19]:
fit_model = nn.fit(X_train_scaled,y_train,epochs=100)

Epoch 1/100
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7059 - loss: 0.5398
Epoch 2/100
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8169 - loss: 0.3859
Epoch 3/100
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8314 - loss: 0.3615
Epoch 4/100
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8456 - loss: 0.3387
Epoch 5/100
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8498 - loss: 0.3400
Epoch 6/100
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8554 - loss: 0.3254
Epoch 7/100
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8648 - loss: 0.3111
Epoch 8/100
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8654 - loss: 0.3071
Epoch 9/100
[1m240/240[0m [32

In [20]:
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

118/118 - 0s - 3ms/step - accuracy: 0.8741 - loss: 0.3559
Loss: 0.3558601140975952, Accuracy: 0.8740721344947815


4) Inputting new URLs

In [109]:
new_url_user = "https://chatgpt.com/c/18c7e5ab-d898-46a4-ad94-b14aba163f15" #user stuff goes here

new_url_base = [new_url_user]

funky_new = pd.DataFrame()
funky_new['url'] = new_url_base

funky_new

Unnamed: 0,url
0,https://chatgpt.com/c/18c7e5ab-d898-46a4-ad94-...


In [110]:
#building the function
#all binary classifications are 1=true/yes 0=false/no

def extract_features(funky_new):

    funky_new['url_length'] = funky_new['url'].apply(lambda x: len(x)) #grabs URL length

    funky_new['host_length'] = funky_new['url'].apply(lambda x: len(urlparse(x).hostname) if urlparse(x).hostname else 0) #grabs host url length
    
    #counting features of url
    funky_new['u_dots'] = funky_new['url'].apply(lambda x: x.count('.'))
    funky_new['u_hyphens'] = funky_new['url'].apply(lambda x: x.count('-'))
    funky_new['u_at'] = funky_new['url'].apply(lambda x: x.count('@'))
    funky_new['u_and'] = funky_new['url'].apply(lambda x: x.count('&'))
    funky_new['u_at'] = funky_new['url'].apply(lambda x: x.count('@'))
    funky_new['u_underscore'] = funky_new['url'].apply(lambda x: x.count('_'))
    funky_new['u_tilde'] = funky_new['url'].apply(lambda x: x.count('~'))
    funky_new['u_percent'] = funky_new['url'].apply(lambda x: x.count('%'))
    funky_new['u_slash'] = funky_new['url'].apply(lambda x: x.count('/'))
    funky_new['u_star'] = funky_new['url'].apply(lambda x: x.count('*'))
    funky_new['u_colon'] = funky_new['url'].apply(lambda x: x.count(':'))
    funky_new['u_comma'] = funky_new['url'].apply(lambda x: x.count(','))
    funky_new['u_semicolon'] = funky_new['url'].apply(lambda x: x.count(';'))
    funky_new['u_dollar'] = funky_new['url'].apply(lambda x: x.count('$'))
    funky_new['u_space'] = funky_new['url'].apply(lambda x: x.count(' '))
    funky_new['u_www'] = funky_new['url'].apply(lambda x: x.count('www.'))
    funky_new['u_com'] = funky_new['url'].apply(lambda x: x.count('.com'))
    funky_new['u_http'] = funky_new['url'].apply(lambda x: x.count('http:'))

    funky_new['number_count'] = funky_new['url'].apply(lambda x: sum(c.isdigit() for c in x))
    funky_new['numbers_to_length_ratio'] = (funky_new['number_count'] / funky_new['url_length']).round(2)

    funky_new['u_port'] = funky_new['url'].apply(lambda x: 1 if urlparse(x).port is not None else 0) #port in url binary

    # funky_new['tld_in_path'] = funky_new.apply(lambda row: urlparse(row['url']).netloc.split('.')[-1] in urlparse(row['url']).path, axis=1)
    # funky_new['tld_in_subdomain'] = funky_new.apply(lambda row: urlparse(row['url']).netloc.split('.')[-1] in urlparse(row['url']).hostname.split('.')[0] if urlparse(row['url']).hostname else '', axis=1)
    
    #see if tld is present in path or subdomain
    funky_new['tld_in_path'] = funky_new.apply(lambda row: 1 if urlparse(row['url']).netloc.split('.')[-1] in urlparse(row['url']).path else 0, axis=1)
    funky_new['tld_in_subdomain'] = funky_new.apply(lambda row: 1 if urlparse(row['url']).netloc.split('.')[-1] in urlparse(row['url']).hostname.split('.')[0] else 0 if urlparse(row['url']).hostname else 0, axis=1)
    
    #funky_new['nb_hyperlinks'] = funky_new['url'].apply(count_hyperlinks)

    funky_new['longest_word_host'] = funky_new['url'].apply(lambda x: max(len(word) for word in urlparse(x).hostname.split('.')))
    funky_new['longest_word_raw'] = funky_new['url'].apply(lambda x: max(len(word) for word in urlparse(x).netloc.split('.')))
    funky_new['shortest_word_host'] = funky_new['url'].apply(lambda x: min(len(word) for word in urlparse(x).hostname.split('.')))
    funky_new['shortest_word_raw'] = funky_new['url'].apply(lambda x: min(len(word) for word in urlparse(x).netloc.split('.')))
    
    #average word length in the host
    funky_new['avg_word_host'] = funky_new['url'].apply(lambda x: sum(len(word) for word in urlparse(x).hostname.split('.')) / len(urlparse(x).hostname.split('.'))).round(2)

    #punycode
    funky_new['is_punycode'] = funky_new['url'].apply(lambda x: 1 if 'xn--' in urlparse(x).netloc else 0)



    return funky_new

In [111]:
new_url = extract_features(funky_new)
clean_url = new_url.drop(columns=['url'])
clean_url


Unnamed: 0,url_length,host_length,u_dots,u_hyphens,u_at,u_and,u_underscore,u_tilde,u_percent,u_slash,...,numbers_to_length_ratio,u_port,tld_in_path,tld_in_subdomain,longest_word_host,longest_word_raw,shortest_word_host,shortest_word_raw,avg_word_host,is_punycode
0,58,11,1,4,0,0,0,0,0,4,...,0.33,0,0,0,7,7,3,3,5.0,0


In [115]:
def predict_url(new_url):
    
    #clean_url = new_url.drop(columns=['url'])
    # Make predictions
    predictions = nn.predict(clean_url)
    print(f"Predictions: {predictions}")

    # Interpret the predictions (assuming binary classification with a threshold of 0.5)
    is_phishing = predictions[0][0] < 0.5  # if probability < 0.5, it's phishing (0), otherwise legitimate (1)
    return 0 if is_phishing else 1


In [116]:
# Use the prediction function
#new_url_user = "https://downdetector.com/status/openai/"

result = predict_url(clean_url)
print(f"The URL '{new_url_user}' is {'legitimate' if result == 1 else 'phishing'}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
Predictions: [[0.]]
The URL 'https://chatgpt.com/c/18c7e5ab-d898-46a4-ad94-b14aba163f15' is phishing


In [113]:
#new_url = [put new url here]

#new_features = extract_features(new_url)  # extract features from new URLs

#clean_url = new_url.drop(columns=['url'])

#predictions = nn.predict(clean_url)  # make predictions

# for url, prediction in zip(clean_url, predictions):
#      if prediction == 1:
#          print(f"{url}: Safe")
#      else:
#          print(f"{url}: Phishing")

url_length: Phishing


In [118]:
def extract_features(urls):
    funky_new = pd.DataFrame()
    funky_new['url'] = urls
    
    funky_new['url_length'] = funky_new['url'].apply(lambda x: len(x))
    funky_new['host_length'] = funky_new['url'].apply(lambda x: len(urlparse(x).hostname) if urlparse(x).hostname else 0)
    funky_new['u_dots'] = funky_new['url'].apply(lambda x: x.count('.'))
    funky_new['u_hyphens'] = funky_new['url'].apply(lambda x: x.count('-'))
    funky_new['u_at'] = funky_new['url'].apply(lambda x: x.count('@'))
    funky_new['u_and'] = funky_new['url'].apply(lambda x: x.count('&'))
    funky_new['u_underscore'] = funky_new['url'].apply(lambda x: x.count('_'))
    funky_new['u_tilde'] = funky_new['url'].apply(lambda x: x.count('~'))
    funky_new['u_percent'] = funky_new['url'].apply(lambda x: x.count('%'))
    funky_new['u_slash'] = funky_new['url'].apply(lambda x: x.count('/'))
    funky_new['u_star'] = funky_new['url'].apply(lambda x: x.count('*'))
    funky_new['u_colon'] = funky_new['url'].apply(lambda x: x.count(':'))
    funky_new['u_comma'] = funky_new['url'].apply(lambda x: x.count(','))
    funky_new['u_semicolon'] = funky_new['url'].apply(lambda x: x.count(';'))
    funky_new['u_dollar'] = funky_new['url'].apply(lambda x: x.count('$'))
    funky_new['u_space'] = funky_new['url'].apply(lambda x: x.count(' '))
    funky_new['u_www'] = funky_new['url'].apply(lambda x: x.count('www.'))
    funky_new['u_com'] = funky_new['url'].apply(lambda x: x.count('.com'))
    funky_new['u_http'] = funky_new['url'].apply(lambda x: x.count('http:'))
    funky_new['number_count'] = funky_new['url'].apply(lambda x: sum(c.isdigit() for c in x))
    funky_new['numbers_to_length_ratio'] = (funky_new['number_count'] / funky_new['url_length']).round(2)
    funky_new['u_port'] = funky_new['url'].apply(lambda x: 1 if urlparse(x).port is not None else 0)
    funky_new['tld_in_path'] = funky_new.apply(lambda row: 1 if urlparse(row['url']).netloc.split('.')[-1] in urlparse(row['url']).path else 0, axis=1)
    funky_new['tld_in_subdomain'] = funky_new.apply(lambda row: 1 if urlparse(row['url']).netloc.split('.')[-1] in urlparse(row['url']).hostname.split('.')[0] else 0 if urlparse(row['url']).hostname else 0, axis=1)
    funky_new['longest_word_host'] = funky_new['url'].apply(lambda x: max(len(word) for word in urlparse(x).hostname.split('.')))
    funky_new['longest_word_raw'] = funky_new['url'].apply(lambda x: max(len(word) for word in urlparse(x).netloc.split('.')))
    funky_new['shortest_word_host'] = funky_new['url'].apply(lambda x: min(len(word) for word in urlparse(x).hostname.split('.')))
    funky_new['shortest_word_raw'] = funky_new['url'].apply(lambda x: min(len(word) for word in urlparse(x).netloc.split('.')))
    funky_new['avg_word_host'] = funky_new['url'].apply(lambda x: sum(len(word) for word in urlparse(x).hostname.split('.')) / len(urlparse(x).hostname.split('.'))).round(2)
    funky_new['is_punycode'] = funky_new['url'].apply(lambda x: 1 if 'xn--' in urlparse(x).netloc else 0)
    
    return funky_new

# Load your trained model
#model = load_model('path_to_your_model.h5')

# Prediction function
def predict_url(url):
    # Prepare the data
    urls = [url]
    
    # Extract features
    features_df = extract_features(urls)
    
    # Drop the original url column as it is not used in prediction
    clean_url = features_df.drop(columns=['url'])
    
    # Print the shape of the input data
    print(f"Shape of the input data: {clean_url.shape}")
    print(f"Input data: {clean_url}")
    
    # Make predictions
    predictions = nn.predict(clean_url)
    
    # Print out predictions for debugging
    print(f"Predictions: {predictions}")
    
    # Interpret the predictions (assuming binary classification with a threshold of 0.5)
    is_phishing = predictions[0] < 0.5  # if probability < 0.5, it's phishing (0), otherwise legitimate (1)
    return 0 if is_phishing else 1

# Example usage
new_url_user = "https://downdetector.com/status/openai/"
result = predict_url(new_url_user)
print(f"The URL '{new_url_user}' is {'phishing' if result == 0 else 'legitimate'}")

Shape of the input data: (1, 30)
Input data:    url_length  host_length  u_dots  u_hyphens  u_at  u_and  u_underscore  \
0          39           16       1          0     0      0             0   

   u_tilde  u_percent  u_slash  ...  numbers_to_length_ratio  u_port  \
0        0          0        5  ...                      0.0       0   

   tld_in_path  tld_in_subdomain  longest_word_host  longest_word_raw  \
0            0                 0                 12                12   

   shortest_word_host  shortest_word_raw  avg_word_host  is_punycode  
0                   3                  3            7.5            0  

[1 rows x 30 columns]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Predictions: [[0.]]
The URL 'https://downdetector.com/status/openai/' is phishing


In [120]:
nn.summary()