# Import Libraries

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tldextract
from urllib.parse import urlparse
import re
from sklearn.model_selection import train_test_split


# Load dataset

In [5]:
df = pd.read_csv("combined_data.csv")

In [6]:
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,url,Type,label
0,0,0,https://docs.google.com/presentation/d/e/2PACX...,phishing,1
1,1,1,https://btttelecommunniccatiion.weeblysite.com/,phishing,1
2,2,2,https://kq0hgp.webwave.dev/,phishing,1
3,3,3,https://brittishtele1bt-69836.getresponsesite....,phishing,1
4,4,4,https://bt-internet-105056.weeblysite.com/,phishing,1
...,...,...,...,...,...
504977,504977,345732,https://koreantaekwondo.tripod.com/,legitimate,0
504978,504978,345733,https://taekwondo.wisebytes.net/,legitimate,0
504979,504979,345734,https://www.steveconway.net/,legitimate,0
504980,504980,345735,https://www.tkd.net/,legitimate,0


In [7]:
df = df[['url','Type','label']]

In [8]:
df['label'].value_counts()

label
0    345737
1    159245
Name: count, dtype: int64

In [9]:
# Find domain of URLs by using tldextract
df['domain'] = df['url'].apply(lambda x: tldextract.extract(x).registered_domain)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['domain'] = df['url'].apply(lambda x: tldextract.extract(x).registered_domain)


In [10]:
df['domain'].value_counts()

domain
wikipedia.org                 12895
youtube.com                    8627
facebook.com                   8280
blogspot.com                   7655
google.com                     6001
                              ...  
centraleurope.org                 1
centralamericanairways.com        1
central71.com                     1
centerstage.net                   1
taekwondobible.com                1
Name: count, Length: 139679, dtype: int64

In [11]:
# Find unique domains 
unique_domains = df['domain'].unique()

In [12]:
len(unique_domains)

139679

In [13]:
# train test split on unique domain
train_domains, test_domains = train_test_split(unique_domains,test_size=0.2, random_state=42)

In [14]:
len(train_domains)

111743

In [15]:
len(test_domains)

27936

In [16]:
train_df_with_label = df[df['domain'].isin(train_domains)]
test_df_with_label  = df[df['domain'].isin(test_domains)]

In [17]:
train_df_with_label

Unnamed: 0,url,Type,label,domain
1,https://btttelecommunniccatiion.weeblysite.com/,phishing,1,weeblysite.com
2,https://kq0hgp.webwave.dev/,phishing,1,webwave.dev
3,https://brittishtele1bt-69836.getresponsesite....,phishing,1,getresponsesite.com
4,https://bt-internet-105056.weeblysite.com/,phishing,1,weeblysite.com
5,https://teleej.weebly.com/,phishing,1,weebly.com
...,...,...,...,...
504977,https://koreantaekwondo.tripod.com/,legitimate,0,tripod.com
504978,https://taekwondo.wisebytes.net/,legitimate,0,wisebytes.net
504979,https://www.steveconway.net/,legitimate,0,steveconway.net
504980,https://www.tkd.net/,legitimate,0,tkd.net


In [18]:
train_df_with_label.shape

(393623, 4)

In [19]:
test_df_with_label.shape

(111359, 4)

# Function to extract features from URLs

In [20]:
def extract_features(url):
    features = {}
    if not url.startswith('http'):
        url = 'http://' + url

    try:
        parsed_url = urlparse(url)
        domain_parts = tldextract.extract(url)
        domain_name = domain_parts.domain + '.' + domain_parts.suffix
        subdomain = domain_parts.subdomain
        
        features['url_length'] = len(url)
        features['hostname_length'] = len(parsed_url.netloc)
        features['path_length'] = len(parsed_url.path)
        features['count_dot'] = url.count('.')
        features['count_dash'] = url.count('-')
        features['count_underscore'] = url.count('_')
        features['count_slash'] = url.count('/')
        features['count_question'] = url.count('?')
        features['count_equals'] = url.count('=')
        features['count_at'] = url.count('@')
        features['count_ampersand'] = url.count('&')
        features['num_subdomains'] = subdomain.count('.') + 1 if subdomain else 0
        features['uses_https'] = 1 if parsed_url.scheme == 'https' else 0
        features['domain_length'] = len(domain_name)
        features['domain_has_digits'] = 1 if any(char.isdigit() for char in domain_name) else 0
        features['domain_has_non_ascii'] = 1 if any(ord(char) > 127 for char in domain_name) else 0

    except Exception as e:
         print(f"Error parsing the URL {url}: {e}")
    return features

In [21]:
train_df = train_df_with_label['url'].apply(extract_features)
test_df = test_df_with_label['url'].apply(extract_features)

In [22]:
train_df= pd.DataFrame(train_df.tolist())

In [23]:
train_df

Unnamed: 0,url_length,hostname_length,path_length,count_dot,count_dash,count_underscore,count_slash,count_question,count_equals,count_at,count_ampersand,num_subdomains,uses_https,domain_length,domain_has_digits,domain_has_non_ascii
0,47,38,1,2,0,0,3,0,0,0,0,1,1,14,0,0
1,27,18,1,2,0,0,3,0,0,0,0,1,1,11,0,0
2,50,41,1,2,1,0,3,0,0,0,0,1,1,19,0,0
3,42,33,1,2,2,0,3,0,0,0,0,1,1,14,0,0
4,26,17,1,2,0,0,3,0,0,0,0,1,1,10,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393618,35,26,1,2,0,0,3,0,0,0,0,1,1,10,0,0
393619,32,23,1,2,0,0,3,0,0,0,0,1,1,13,0,0
393620,28,19,1,2,0,0,3,0,0,0,0,1,1,15,0,0
393621,20,11,1,2,0,0,3,0,0,0,0,1,1,7,0,0


In [26]:
train_df.to_csv('train_df.csv')

In [24]:
test_df= pd.DataFrame(test_df.tolist())

In [25]:
test_df

Unnamed: 0,url_length,hostname_length,path_length,count_dot,count_dash,count_underscore,count_slash,count_question,count_equals,count_at,count_ampersand,num_subdomains,uses_https,domain_length,domain_has_digits,domain_has_non_ascii
0,178,15,108,3,3,1,7,1,4,0,3,1,1,10,0,0
1,178,15,108,3,2,1,7,1,4,0,3,1,1,10,0,0
2,178,15,108,3,3,2,7,1,4,0,3,1,1,10,0,0
3,178,15,108,3,1,1,7,1,4,0,3,1,1,10,0,0
4,167,15,108,2,3,1,7,1,3,0,2,1,1,10,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111354,29,20,1,2,0,0,3,0,0,0,0,1,1,16,0,0
111355,28,19,1,2,0,0,3,0,0,0,0,1,1,15,0,0
111356,26,17,1,2,0,0,3,0,0,0,0,1,1,13,0,0
111357,35,13,14,3,0,1,3,0,0,0,0,1,1,9,0,0


In [None]:
test_df.to_csv('test_df.csv')