Import the necessary Libraries

In [48]:
import pandas as pd
import numpy as np

Exploring Kaggle Dataset

In [49]:
# load the training dataset
train = pd.read_parquet("Training.parquet")
train.shape

(7658, 89)

In [50]:
# load the test dataset
test = pd.read_parquet("Testing.parquet")
test.shape

(3772, 89)

Combine the train and test datasets

In [51]:
combined_data = pd.concat([train, test], axis=0, ignore_index=True)
combined_data.shape

(11430, 89)

#### check the columns of the combined dataframe

In [52]:
combined_data.columns

Index(['url', 'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens',
       'nb_at', 'nb_qm', 'nb_and', 'nb_or', 'nb_eq', 'nb_underscore',
       'nb_tilde', 'nb_percent', 'nb_slash', 'nb_star', 'nb_colon', 'nb_comma',
       'nb_semicolumn', 'nb_dollar', 'nb_space', 'nb_www', 'nb_com',
       'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url',
       'ratio_digits_host', 'punycode', 'port', 'tld_in_path',
       'tld_in_subdomain', 'abnormal_subdomain', 'nb_subdomains',
       'prefix_suffix', 'random_domain', 'shortening_service',
       'path_extension', 'nb_redirection', 'nb_external_redirection',
       'length_words_raw', 'char_repeat', 'shortest_words_raw',
       'shortest_word_host', 'shortest_word_path', 'longest_words_raw',
       'longest_word_host', 'longest_word_path', 'avg_words_raw',
       'avg_word_host', 'avg_word_path', 'phish_hints', 'domain_in_brand',
       'brand_in_subdomain', 'brand_in_path', 'suspecious_tld',
       'statistical_report', 

- The dataset has 89 different columns
- the status column is the target column

#### create a new dataframe that has only the url and status columns

In [53]:
# Create a new DataFrame with only the 'url' and 'status' columns
url_status_df = combined_data[['url', 'status']]
url_status_df.head()

Unnamed: 0,url,status
0,https://www.todayshomeowner.com/how-to-make-ho...,legitimate
1,http://thapthan.ac.th/information/confirmation...,phishing
2,http://app.dialoginsight.com/T/OFC4/L2S/3888/B...,phishing
3,https://www.bedslide.com,legitimate
4,https://tabs.ultimate-guitar.com/s/sex_pistols...,legitimate


#### convert status column to numerical

In [54]:
# Create a copy to avoid the SettingWithCopyWarning
url_status_df = url_status_df.copy()

# Map the 'status' column to numerical values
url_status_df['status'] = url_status_df['status'].map({'legitimate': 0, 'phishing': 1})
url_status_df.head()


Unnamed: 0,url,status
0,https://www.todayshomeowner.com/how-to-make-ho...,0
1,http://thapthan.ac.th/information/confirmation...,1
2,http://app.dialoginsight.com/T/OFC4/L2S/3888/B...,1
3,https://www.bedslide.com,0
4,https://tabs.ultimate-guitar.com/s/sex_pistols...,0


#### check for duplicates

In [55]:
# Count the number of duplicate URLs
duplicate_count = url_status_df['url'].duplicated().sum()
print(f"Number of duplicate URLs: {duplicate_count}")

Number of duplicate URLs: 1


- The url_status_df has one observation duplicated.
- The duplicated row will be dropped after combining with the UCI Irvine dataset

#### inspect the unique values in the status column 

In [56]:
url_status_df['status'].unique()

array([0, 1], dtype=int64)

#### Check the value counts for both the legitimate and the legitimate and the phishing urls

In [57]:
url_status_df.status.value_counts()

1    5715
0    5715
Name: status, dtype: int64

- The dataset is balanced.
- It has 5715 legitimate urls and 5715 phishing urls

## Exploring the UCI Irvine Dataset

#### Loading the dataset

In [58]:
dataset2= pd.read_csv("PhiUSIIL_Phishing_URL_Dataset.csv")
dataset2.head()

Unnamed: 0,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,...,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
0,521848.txt,https://www.southbankmosaics.com,31,www.southbankmosaics.com,24,0,com,100.0,1.0,0.522907,...,0,0,1,34,20,28,119,0,124,1
1,31372.txt,https://www.uni-mainz.de,23,www.uni-mainz.de,16,0,de,100.0,0.666667,0.03265,...,0,0,1,50,9,8,39,0,217,1
2,597387.txt,https://www.voicefmradio.co.uk,29,www.voicefmradio.co.uk,22,0,uk,100.0,0.866667,0.028555,...,0,0,1,10,2,7,42,2,5,1
3,554095.txt,https://www.sfnmjournal.com,26,www.sfnmjournal.com,19,0,com,100.0,1.0,0.522907,...,1,1,1,3,27,15,22,1,31,1
4,151578.txt,https://www.rewildingargentina.org,33,www.rewildingargentina.org,26,0,org,100.0,1.0,0.079963,...,1,0,1,244,15,34,72,1,85,1


#### 

#### check the columns of dataset 2

In [59]:
dataset2.columns

Index(['FILENAME', 'URL', 'URLLength', 'Domain', 'DomainLength', 'IsDomainIP',
       'TLD', 'URLSimilarityIndex', 'CharContinuationRate',
       'TLDLegitimateProb', 'URLCharProb', 'TLDLength', 'NoOfSubDomain',
       'HasObfuscation', 'NoOfObfuscatedChar', 'ObfuscationRatio',
       'NoOfLettersInURL', 'LetterRatioInURL', 'NoOfDegitsInURL',
       'DegitRatioInURL', 'NoOfEqualsInURL', 'NoOfQMarkInURL',
       'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL',
       'SpacialCharRatioInURL', 'IsHTTPS', 'LineOfCode', 'LargestLineLength',
       'HasTitle', 'Title', 'DomainTitleMatchScore', 'URLTitleMatchScore',
       'HasFavicon', 'Robots', 'IsResponsive', 'NoOfURLRedirect',
       'NoOfSelfRedirect', 'HasDescription', 'NoOfPopup', 'NoOfiFrame',
       'HasExternalFormSubmit', 'HasSocialNet', 'HasSubmitButton',
       'HasHiddenFields', 'HasPasswordField', 'Bank', 'Pay', 'Crypto',
       'HasCopyrightInfo', 'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef',
       'NoOfEmptyRef', 'NoOf

#### create a new dataframe that has only the URL and the target variable

In [60]:
# Create a new DataFrame with only the 'url' and 'status' columns
url_label_df = dataset2[['URL', 'label']]
url_label_df.head()

Unnamed: 0,URL,label
0,https://www.southbankmosaics.com,1
1,https://www.uni-mainz.de,1
2,https://www.voicefmradio.co.uk,1
3,https://www.sfnmjournal.com,1
4,https://www.rewildingargentina.org,1


#### check the shape of the url_label_df

In [61]:
url_label_df.shape

(235795, 2)

- The dataset has 235,795 observations with the target column to determine whether the url is a legitimate or phishing url

#### check for duplicates

In [62]:
# Count the number of duplicate URLs
duplicate_count = url_label_df['URL'].duplicated().sum()
print(f"Number of duplicate URLs: {duplicate_count}")

Number of duplicate URLs: 425


- The dataset has 425 duplicates
- The duplicates will be dropped after combining the two datasets

### Merging both datasets

Here the kaggle dataset and the UCI Irvine dataset are marged before extracting the features for the combined dataset.

The datasets will be merged as follows:
1. Standardize Column Names: Rename similar columns in each dataset to a common name.
2. Concatenate: Use pd.concat to combine the datasets.

In [63]:
# Rename columns in url_label_df for consistency
url_label_df = url_label_df.rename(columns={'URL': 'url', 'label': 'status'})

# Concatenate the two DataFrames
combined_df = pd.concat([url_status_df, url_label_df], ignore_index=True)

combined_df.head()

Unnamed: 0,url,status
0,https://www.todayshomeowner.com/how-to-make-ho...,0
1,http://thapthan.ac.th/information/confirmation...,1
2,http://app.dialoginsight.com/T/OFC4/L2S/3888/B...,1
3,https://www.bedslide.com,0
4,https://tabs.ultimate-guitar.com/s/sex_pistols...,0


#### check the shape of the combined dataframe

In [64]:
combined_df.shape

(247225, 2)

Checking for null values

In [65]:
combined_df.isna().sum()

url       0
status    0
dtype: int64

## Preprocessing

#### Checking whether there are duplicates in the combined dataframe

In [66]:
# Check for duplicates in the 'url' column
duplicates = combined_df.duplicated(subset='url', keep=False)  # keep=False marks all duplicates as True

# Display all rows with duplicate URLs
duplicate_urls = combined_df[duplicates]
print("Duplicate URLs:")
print(duplicate_urls)

Duplicate URLs:
                                                      url  status
261     https://app.box.com/s/x6agocx9zvj049azirk4aw3x...       1
303                         http://vxdse.myfreesites.net/       1
339                     http://repl-mess.myfreesites.net/       1
389                     http://site9423773.92.webydo.com/       1
485     http://www.imcreator.com/viewer/vbid-fa0f29d5-...       1
...                                                   ...     ...
246025                 https://outlook-web-fb782.web.app/       0
246710                    https://orange789.yolasite.com/       0
246848                http://uph0ldlgin.mystrikingly.com/       0
246931  https://objectstorage.ap-singapore-1.oracleclo...       0
247168    https://yellow-river-189b.lhziiz35.workers.dev/       0

[937 rows x 2 columns]


#### Count the Number of Duplicate URLs

In [67]:
# Count the number of duplicate URLs
duplicate_count = combined_df['url'].duplicated().sum()
print(f"Number of duplicate URLs: {duplicate_count}")

Number of duplicate URLs: 470


- The dataset has 420 duplicate URLS
- We therefore drop the duplicate URLS before extracting the features

#### Drop Duplicates

In [68]:
# Remove duplicates, keeping the first occurrence
combined_df_no_duplicates = combined_df.drop_duplicates(subset='url', keep='first')

print("DataFrame after removing duplicates:")
print(combined_df_no_duplicates)


DataFrame after removing duplicates:
                                                      url  status
0       https://www.todayshomeowner.com/how-to-make-ho...       0
1       http://thapthan.ac.th/information/confirmation...       1
2       http://app.dialoginsight.com/T/OFC4/L2S/3888/B...       1
3                                https://www.bedslide.com       0
4       https://tabs.ultimate-guitar.com/s/sex_pistols...       0
...                                                   ...     ...
247220                     https://www.skincareliving.com       1
247221                      https://www.winchester.gov.uk       1
247222                    https://www.nononsensedesign.be       1
247223  https://patient-cell-40f5.updatedlogmylogin.wo...       0
247224                 https://www.alternativefinland.com       1

[246755 rows x 2 columns]


check the shape of the new dataframe

In [69]:
combined_df_no_duplicates.shape

(246755, 2)

- The new dataframe has 246,755 unique urls

Check missing values in the dataset

In [70]:
combined_df_no_duplicates.isna().sum()

url       0
status    0
dtype: int64

## Feature Extraction 

### Extracting and Calculating Features from URLs

In [71]:
import re
import numpy as np
from urllib.parse import urlparse

# Function to extract various features from URL
def extract_detailed_url_features(url):
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    path = parsed_url.path

    # Calculating features
    features = {
        'URLLength': len(url),
        'Domain': domain,
        'DomainLength': len(domain),
        'IsDomainIP': int(re.match(r'^\d{1,3}(\.\d{1,3}){3}$', domain) is not None),  # Check if domain is an IP
        'TLD': domain.split('.')[-1] if '.' in domain else '',
        'NoOfSubDomain': len(domain.split('.')) - 2 if '.' in domain else 0,
        'NoOfLettersInURL': sum(c.isalpha() for c in url),
        'LetterRatioInURL': sum(c.isalpha() for c in url) / len(url) if len(url) > 0 else 0,
        'NoOfDigitsInURL': sum(c.isdigit() for c in url),
        'DigitRatioInURL': sum(c.isdigit() for c in url) / len(url) if len(url) > 0 else 0,
        'NoOfEqualsInURL': url.count('='),
        'NoOfQMarkInURL': url.count('?'),
        'NoOfAmpersandInURL': url.count('&'),
        'NoOfOtherSpecialCharsInURL': len(re.findall(r'[!@#$%^*()_+|~=`{}\[\]:";\'<>?,./]', url)),
        'SpecialCharRatioInURL': len(re.findall(r'[!@#$%^*()_+|~=`{}\[\]:";\'<>?,./]', url)) / len(url) if len(url) > 0 else 0,
        'IsHTTPS': int(url.startswith('https')),
        'NoOfURLRedirect': url.count('//') - 1,  # Counts '//' occurrences after "http://"
        'NoOfPopup': int('popup' in url),  # Basic presence check for 'popup' keyword
        'NoOfiFrame': int('iframe' in url),  # Basic presence check for 'iframe' keyword
        'HasSocialNet': int(any(net in url for net in ['facebook', 'twitter', 'linkedin', 'instagram', 'youtube'])),
    }
    
    # Features based on word analysis in URL
    words_raw = re.split(r'\W+', url)  # Split URL by non-alphanumeric characters
    word_lengths = [len(word) for word in words_raw if word]
    
    features.update({
        'length_words_raw': len(words_raw),
        'char_repeat': max([url.count(char) for char in set(url)]),  # Max count of any character
        'shortest_words_raw': min(word_lengths) if word_lengths else 0,
        'longest_words_raw': max(word_lengths) if word_lengths else 0,
        'avg_words_raw': np.mean(word_lengths) if word_lengths else 0,
    })
    
    return features

# Apply the function to extract features for each URL
url_features_df = combined_df_no_duplicates['url'].apply(extract_detailed_url_features).apply(pd.Series)

# Concatenate the extracted features with combined_df_no_duplicates
combined_df_no_duplicates = pd.concat([combined_df_no_duplicates.reset_index(drop=True), url_features_df], axis=1)


In [72]:
combined_df_no_duplicates.head()

Unnamed: 0,url,status,URLLength,Domain,DomainLength,IsDomainIP,TLD,NoOfSubDomain,NoOfLettersInURL,LetterRatioInURL,...,IsHTTPS,NoOfURLRedirect,NoOfPopup,NoOfiFrame,HasSocialNet,length_words_raw,char_repeat,shortest_words_raw,longest_words_raw,avg_words_raw
0,https://www.todayshomeowner.com/how-to-make-ho...,0.0,82.0,www.todayshomeowner.com,23.0,0.0,com,1.0,68.0,0.829268,...,1.0,0.0,0.0,0.0,0.0,13.0,9.0,2.0,15.0,5.666667
1,http://thapthan.ac.th/information/confirmation...,1.0,93.0,thapthan.ac.th,14.0,0.0,th,1.0,59.0,0.634409,...,0.0,0.0,0.0,0.0,0.0,9.0,8.0,2.0,32.0,9.222222
2,http://app.dialoginsight.com/T/OFC4/L2S/3888/B...,1.0,121.0,app.dialoginsight.com,21.0,0.0,com,1.0,55.0,0.454545,...,0.0,0.0,0.0,0.0,0.0,20.0,17.0,1.0,13.0,5.0
3,https://www.bedslide.com,0.0,24.0,www.bedslide.com,16.0,0.0,com,1.0,19.0,0.791667,...,1.0,0.0,0.0,0.0,0.0,4.0,3.0,3.0,8.0,4.75
4,https://tabs.ultimate-guitar.com/s/sex_pistols...,0.0,73.0,tabs.ultimate-guitar.com,24.0,0.0,com,1.0,58.0,0.794521,...,1.0,0.0,0.0,0.0,0.0,9.0,9.0,1.0,22.0,7.0


#### Check for null values

In [73]:
combined_df_no_duplicates.isna().sum()

url                           467
status                        467
URLLength                     467
Domain                        467
DomainLength                  467
IsDomainIP                    467
TLD                           467
NoOfSubDomain                 467
NoOfLettersInURL              467
LetterRatioInURL              467
NoOfDigitsInURL               467
DigitRatioInURL               467
NoOfEqualsInURL               467
NoOfQMarkInURL                467
NoOfAmpersandInURL            467
NoOfOtherSpecialCharsInURL    467
SpecialCharRatioInURL         467
IsHTTPS                       467
NoOfURLRedirect               467
NoOfPopup                     467
NoOfiFrame                    467
HasSocialNet                  467
length_words_raw              467
char_repeat                   467
shortest_words_raw            467
longest_words_raw             467
avg_words_raw                 467
dtype: int64

#### drop rows with null values

In [74]:
cleaned_df_no_duplicates = combined_df_no_duplicates.dropna()
cleaned_df_no_duplicates.isna().sum()

url                           0
status                        0
URLLength                     0
Domain                        0
DomainLength                  0
IsDomainIP                    0
TLD                           0
NoOfSubDomain                 0
NoOfLettersInURL              0
LetterRatioInURL              0
NoOfDigitsInURL               0
DigitRatioInURL               0
NoOfEqualsInURL               0
NoOfQMarkInURL                0
NoOfAmpersandInURL            0
NoOfOtherSpecialCharsInURL    0
SpecialCharRatioInURL         0
IsHTTPS                       0
NoOfURLRedirect               0
NoOfPopup                     0
NoOfiFrame                    0
HasSocialNet                  0
length_words_raw              0
char_repeat                   0
shortest_words_raw            0
longest_words_raw             0
avg_words_raw                 0
dtype: int64

#### Basic URL and Domain Metrics:

- `URLLength:` Length of the entire URL.
- Domain: The domain portion of the URL.
- DomainLength: Length of the domain.
- IsDomainIP: Binary feature indicating if the domain is an IP address.
- TLD: The top-level domain (e.g., .com, .org).
- NoOfSubDomain: Number of subdomains in the domain.

#### Character Counts and Ratios:
- NoOfLettersInURL and LetterRatioInURL: Count and ratio of alphabetic characters.
- NoOfDigitsInURL and DigitRatioInURL: Count and ratio of digits.
- NoOfEqualsInURL, NoOfQMarkInURL, NoOfAmpersandInURL: Counts of specific characters.
- NoOfOtherSpecialCharsInURL and SpecialCharRatioInURL: Counts and ratio of special characters.
### Security and Redirect Indicators:

- IsHTTPS: Indicates if the URL uses HTTPS.
- NoOfURLRedirect: Counts // occurrences, often indicative of redirections.
- NoOfPopup and NoOfiFrame: Presence of keywords popup and iframe.
#### Word Analysis:

- length_words_raw: Number of words split by non-alphanumeric characters.
- char_repeat: The maximum count of any single character.
- shortest_words_raw, longest_words_raw, avg_words_raw: Statistics of word lengths in the URL.

### Scrapping web content

To extract features like domain_age, dns_record, google_index, page_rank, HasFavicon, HasTitle, and Title, we'll need to parse web content and, in some cases, interact with third-party services. Here’s how to approach each feature:

#### Extractin Domain Age

In [75]:
! pip install whois



In [76]:
import whois
from datetime import datetime

def get_domain_age(domain):
    try:
        domain_info = whois.whois(domain)
        creation_date = domain_info.creation_date
        if isinstance(creation_date, list):
            creation_date = creation_date[0]  # Handle cases where multiple creation dates are returned
        domain_age = (datetime.now() - creation_date).days / 365  # Age in years
        return domain_age
    except:
        return None  # Return None if the WHOIS lookup fails

# Apply to your DataFrame
cleaned_df_no_duplicates['domain_age'] = cleaned_df_no_duplicates['Domain'].apply(get_domain_age)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df_no_duplicates['domain_age'] = cleaned_df_no_duplicates['Domain'].apply(get_domain_age)


Check columns

In [77]:
cleaned_df_no_duplicates.head()

Unnamed: 0,url,status,URLLength,Domain,DomainLength,IsDomainIP,TLD,NoOfSubDomain,NoOfLettersInURL,LetterRatioInURL,...,NoOfURLRedirect,NoOfPopup,NoOfiFrame,HasSocialNet,length_words_raw,char_repeat,shortest_words_raw,longest_words_raw,avg_words_raw,domain_age
0,https://www.todayshomeowner.com/how-to-make-ho...,0.0,82.0,www.todayshomeowner.com,23.0,0.0,com,1.0,68.0,0.829268,...,0.0,0.0,0.0,0.0,13.0,9.0,2.0,15.0,5.666667,
1,http://thapthan.ac.th/information/confirmation...,1.0,93.0,thapthan.ac.th,14.0,0.0,th,1.0,59.0,0.634409,...,0.0,0.0,0.0,0.0,9.0,8.0,2.0,32.0,9.222222,
2,http://app.dialoginsight.com/T/OFC4/L2S/3888/B...,1.0,121.0,app.dialoginsight.com,21.0,0.0,com,1.0,55.0,0.454545,...,0.0,0.0,0.0,0.0,20.0,17.0,1.0,13.0,5.0,
3,https://www.bedslide.com,0.0,24.0,www.bedslide.com,16.0,0.0,com,1.0,19.0,0.791667,...,0.0,0.0,0.0,0.0,4.0,3.0,3.0,8.0,4.75,
4,https://tabs.ultimate-guitar.com/s/sex_pistols...,0.0,73.0,tabs.ultimate-guitar.com,24.0,0.0,com,1.0,58.0,0.794521,...,0.0,0.0,0.0,0.0,9.0,9.0,1.0,22.0,7.0,


In [78]:
cleaned_df_no_duplicates['domain_age'].unique()

array([None], dtype=object)

#### Checking DNS Record

In [79]:
"""import socket

def check_dns_record(domain):
    try:
        socket.gethostbyname(domain)
        return 1  # DNS record exists
    except socket.gaierror:
        return 0  # No DNS record

# Apply to your DataFrame
cleaned_df_no_duplicates['dns_record'] = cleaned_df_no_duplicates['Domain'].apply(check_dns_record)
"""

"import socket\n\ndef check_dns_record(domain):\n    try:\n        socket.gethostbyname(domain)\n        return 1  # DNS record exists\n    except socket.gaierror:\n        return 0  # No DNS record\n\n# Apply to your DataFrame\ncleaned_df_no_duplicates['dns_record'] = cleaned_df_no_duplicates['Domain'].apply(check_dns_record)\n"

In [80]:
cleaned_df_no_duplicates.head()

Unnamed: 0,url,status,URLLength,Domain,DomainLength,IsDomainIP,TLD,NoOfSubDomain,NoOfLettersInURL,LetterRatioInURL,...,NoOfURLRedirect,NoOfPopup,NoOfiFrame,HasSocialNet,length_words_raw,char_repeat,shortest_words_raw,longest_words_raw,avg_words_raw,domain_age
0,https://www.todayshomeowner.com/how-to-make-ho...,0.0,82.0,www.todayshomeowner.com,23.0,0.0,com,1.0,68.0,0.829268,...,0.0,0.0,0.0,0.0,13.0,9.0,2.0,15.0,5.666667,
1,http://thapthan.ac.th/information/confirmation...,1.0,93.0,thapthan.ac.th,14.0,0.0,th,1.0,59.0,0.634409,...,0.0,0.0,0.0,0.0,9.0,8.0,2.0,32.0,9.222222,
2,http://app.dialoginsight.com/T/OFC4/L2S/3888/B...,1.0,121.0,app.dialoginsight.com,21.0,0.0,com,1.0,55.0,0.454545,...,0.0,0.0,0.0,0.0,20.0,17.0,1.0,13.0,5.0,
3,https://www.bedslide.com,0.0,24.0,www.bedslide.com,16.0,0.0,com,1.0,19.0,0.791667,...,0.0,0.0,0.0,0.0,4.0,3.0,3.0,8.0,4.75,
4,https://tabs.ultimate-guitar.com/s/sex_pistols...,0.0,73.0,tabs.ultimate-guitar.com,24.0,0.0,com,1.0,58.0,0.794521,...,0.0,0.0,0.0,0.0,9.0,9.0,1.0,22.0,7.0,


HasFavicon, HasTitle, Title:
These require parsing the webpage’s HTML content, which can be done using the requests and BeautifulSoup libraries.

In [81]:
"""
import requests
from bs4 import BeautifulSoup

def get_page_content_features(url):
    features = {'HasTitle': 0, 'Title': None}
    try:
        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Check for favicon
            #features['HasFavicon'] = int(bool(soup.find('link', rel='icon') or soup.find('link', rel='shortcut icon')))
            
            # Check for title
            title_tag = soup.find('title')
            if title_tag:
                features['HasTitle'] = 1
                features['Title'] = title_tag.get_text(strip=True)
    except:
        pass  # Handle exceptions like timeouts or connection issues

    return features

# Apply to DataFrame
page_content_features = combined_df_no_duplicates['url'].apply(get_page_content_features).apply(pd.Series)
cleaned_df_no_duplicates = pd.concat([cleaned_df_no_duplicates, page_content_features], axis=1)

"""


"\nimport requests\nfrom bs4 import BeautifulSoup\n\ndef get_page_content_features(url):\n    features = {'HasTitle': 0, 'Title': None}\n    try:\n        response = requests.get(url, timeout=5)\n        if response.status_code == 200:\n            soup = BeautifulSoup(response.content, 'html.parser')\n            \n            # Check for favicon\n            #features['HasFavicon'] = int(bool(soup.find('link', rel='icon') or soup.find('link', rel='shortcut icon')))\n            \n            # Check for title\n            title_tag = soup.find('title')\n            if title_tag:\n                features['HasTitle'] = 1\n                features['Title'] = title_tag.get_text(strip=True)\n    except:\n        pass  # Handle exceptions like timeouts or connection issues\n\n    return features\n\n# Apply to DataFrame\npage_content_features = combined_df_no_duplicates['url'].apply(get_page_content_features).apply(pd.Series)\ncleaned_df_no_duplicates = pd.concat([cleaned_df_no_duplicates,

#### Create a simple logistic regression model using the basic url and domain metrics features

In [87]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Selecting relevant features and target
features = ['URLLength', 'Domain', 'DomainLength', 'IsDomainIP', 'TLD', 'NoOfSubDomain']
X = cleaned_df_no_duplicates[features]
y = cleaned_df_no_duplicates['status']

# Step 2: Encoding categorical features
# Using ColumnTransformer to apply OneHotEncoder only to 'Domain' and 'TLD' as they are categorical
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Domain', 'TLD'])
    ],
    remainder='passthrough'  # Keep other features as they are
)

# Step 3: Create a pipeline for preprocessing, standardization and logistic regression
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ("standard scaler", StandardScaler(with_mean=False)),
    ('logreg', LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs', random_state=42))  
])

# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Model Training
pipeline.fit(X_train, y_train)

# Step 6: Model Prediction
y_pred = pipeline.predict(X_test)


# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Precision
precision = precision_score(y_test, y_pred)
print("Precision:", precision)

# Recall
recall = recall_score(y_test, y_pred)
print("Recall:", recall)

# F1 Score
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Accuracy: 0.5694100450688213
Precision: 0.5728818546776804
Recall: 0.9684135725706164
F1 Score: 0.7198964633792028

Classification Report:
              precision    recall  f1-score   support

         0.0       0.47      0.04      0.07     21113
         1.0       0.57      0.97      0.72     28145

    accuracy                           0.57     49258
   macro avg       0.52      0.50      0.39     49258
weighted avg       0.53      0.57      0.44     49258

