In [1]:
import pandas as pd
import requests
from datetime import datetime
from tqdm import tqdm
import gc

In [2]:
# Read the phishing URL dataset into a DataFrame.
data0 = pd.read_csv("online-valid.csv")

# Draw a reproducible random sample of 5,000 rows (fixed seed), take an
# independent copy of it and renumber the rows from 0.
phishurl = (
    data0
    .sample(n=5000, random_state=5)
    .copy()
    .reset_index(drop=True)
)

In [3]:
# Read the benign URL list and name its single column 'URLs'.
# NOTE(review): read_csv is called with its default header handling and the
# column is renamed afterwards; if the CSV has no header row, its first URL
# would be consumed as the header - confirm against the file.
data1 = pd.read_csv("Benign_list_big_final.csv")
data1.columns = ['URLs']

# Read the self-collected URL list and give it the same column name.
data2 = pd.read_csv("made_by_self.csv")
data2.columns = ['URLs']

# Reproducible random sample of 5,000 benign URLs (fixed seed).
data3 = data1.sample(n=5000, random_state=5).copy()

# Stack the self-collected URLs on top of the sampled benign ones, then
# shuffle every row (frac=1) with a fixed seed and renumber from 0.
merged_data = pd.concat([data2, data3])
shuffled = merged_data.sample(frac=1, random_state=5)
legiurl = shuffled.reset_index(drop=True)

In [4]:
def n_AtSign(url):
    """Return the number of '@' characters in ``url``."""
    return url.count('@')

def n_Length(url):
    """Return the length of ``url`` as given by ``len``."""
    return len(url)

def n_dots(url):
    """Return the number of '.' characters in ``url``."""
    return url.count('.')

def n_hypens(url):
    """Return the number of '-' characters in ``url``."""
    return url.count('-')

def n_underline(url):
    """Return the number of '_' characters in ``url``."""
    return url.count('_')

def n_slash(url):
    """Return the number of '/' characters in ``url``."""
    return url.count('/')

def n_questionmark(url):
    """Return the number of '?' characters in ``url``."""
    return url.count('?')

def n_equal(url):
    """Return the number of '=' characters in ``url``."""
    return url.count('=')

def n_and(url):
    """Return the number of '&' characters in ``url``."""
    return url.count('&')

def n_exclamation(url):
    """Return the number of '!' characters in ``url``."""
    return url.count('!')

def n_space(url):
    """Return the number of ' ' (space) characters in ``url``."""
    return url.count(' ')

def n_tilde(url):
    """Return the number of '~' characters in ``url``."""
    return url.count('~')

def n_comma(url):
    """Return the number of ',' characters in ``url``."""
    return url.count(',')

def n_plus(url):
    """Return the number of '+' characters in ``url``."""
    return url.count('+')

def n_asterisk(url):
    """Return the number of '*' characters in ``url``."""
    return url.count('*')

def n_hastag(url):
    """Return the number of '#' characters in ``url``."""
    return url.count('#')

def n_dollar(url):
    """Return the number of '$' characters in ``url``."""
    return url.count('$')

def n_percent(url):
    """Return the number of '%' characters in ``url``."""
    return url.count('%')

In [5]:
def n_redirection(url, max_redirects=6):
    """Count the HTTP redirects followed when fetching ``url``, capped at ``max_redirects``.

    Args:
        url: Address requested with an HTTP GET (10 second timeout).
        max_redirects: Upper bound on the returned value. It is also applied
            to the session so that no redirects beyond the cap are followed.

    Returns:
        The number of entries in ``response.history``, never more than
        ``max_redirects``. ``max_redirects`` when requests raises
        ``TooManyRedirects``; 0 when the request fails with any other
        ``requests.RequestException``.
    """
    # The context manager closes the session on every exit path. Creating the
    # session outside the ``try`` also means cleanup can never reference an
    # unbound ``session`` name if construction itself fails.
    with requests.Session() as session:
        # Enforce the cap on the session itself: without this, requests keeps
        # following redirects up to its own default limit even though the
        # result is capped at ``max_redirects`` anyway.
        session.max_redirects = max_redirects
        try:
            response = session.get(url, allow_redirects=True, timeout=10)
        except requests.TooManyRedirects:
            # The redirect chain is longer than the cap.
            return max_redirects
        except requests.RequestException:
            # Any other request failure is reported as "no redirects".
            return 0
    # Check whether the redirect count exceeds max_redirects and clamp it.
    return min(len(response.history), max_redirects)

In [6]:
def extract_features(url):
    """Build the feature list for ``url``.

    Each extractor below is called once with ``url``, in the listed order,
    and the results are returned as a list in that same order.
    """
    extractors = (
        n_Length,
        n_dots,
        n_hypens,
        n_underline,
        n_slash,
        n_questionmark,
        n_equal,
        n_AtSign,
        n_and,
        n_exclamation,
        n_space,
        n_tilde,
        n_comma,
        n_plus,
        n_asterisk,
        n_hastag,
        n_dollar,
        n_percent,
        n_redirection,
    )
    return [extract(url) for extract in extractors]

In [7]:
# Extract the features of every legitimate URL and store one row per URL in a list.
legi_features = []
label0 = 0  # label value appended to every legitimate row
# Iterate over the 'URLs' column itself instead of a hard-coded range(0, 7000),
# so the loop always covers exactly the rows present in legiurl (no KeyError
# when there are fewer rows, no silently skipped rows when there are more).
# tqdm shows the progress.
for i, url in enumerate(tqdm(legiurl['URLs'], desc="Extracting features")):
    features = extract_features(url)
    features.append(label0)  # add label
    legi_features.append(features)
    # Run the garbage collector once every 100 URLs processed
    if i % 100 == 0:
        gc.collect()

Extracting features: 100%|██████████| 7000/7000 [2:10:56<00:00,  1.12s/it]  


In [8]:
# Column names for the feature table: one per extracted feature, in the order
# the values appear in each row, followed by the label column.
feature_names = [
    'n_Length', 'n_dots', 'n_hypens', 'n_underline', 'n_slash',
    'n_questionmark', 'n_equal', 'n_AtSign', 'n_and', 'n_exclamation',
    'n_space', 'n_tilde', 'n_comma', 'n_plus', 'n_asterisk',
    'n_hastag', 'n_dollar', 'n_percent', 'n_redirection', 'label',
]

# Turn the list of feature rows into a DataFrame and preview the first rows.
legitimate = pd.DataFrame(data=legi_features, columns=feature_names)
legitimate.head()

Unnamed: 0,n_Length,n_dots,n_hypens,n_underline,n_slash,n_questionmark,n_equal,n_AtSign,n_and,n_exclamation,n_space,n_tilde,n_comma,n_plus,n_asterisk,n_hastag,n_dollar,n_percent,n_redirection,label
0,91,2,1,1,5,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0
1,96,2,0,0,5,1,6,0,5,0,0,0,0,0,0,0,0,3,1,0
2,75,3,5,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,69,2,2,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,86,1,11,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [9]:
# Write the extracted legitimate-URL features to a CSV file, without the index column.
legitimate.to_csv(path_or_buf='Final_legitimate_data.csv', index=False)

In [10]:
# Extract the features of every phishing URL and store one row per URL in a list.
phish_features = []
label1 = 1  # label value appended to every phishing row

# Iterate over the 'url' column itself instead of a hard-coded range(0, 5000),
# so the loop stays correct if the sample size used to build phishurl changes.
# tqdm shows the progress.
for i, url in enumerate(tqdm(phishurl['url'], desc="Extracting features")):
    features = extract_features(url)
    features.append(label1)  # add label
    phish_features.append(features)
    # Run the garbage collector once every 100 URLs processed
    if i % 100 == 0:
        gc.collect()

Extracting features: 100%|██████████| 5000/5000 [1:57:22<00:00,  1.41s/it]  


In [11]:
# Column names for the feature table: one per extracted feature, in the order
# the values appear in each row, followed by the label column.
feature_names = [
    'n_Length', 'n_dots', 'n_hypens', 'n_underline', 'n_slash',
    'n_questionmark', 'n_equal', 'n_AtSign', 'n_and', 'n_exclamation',
    'n_space', 'n_tilde', 'n_comma', 'n_plus', 'n_asterisk',
    'n_hastag', 'n_dollar', 'n_percent', 'n_redirection', 'label',
]

# Turn the list of feature rows into a DataFrame and preview the first rows.
phishing = pd.DataFrame(data=phish_features, columns=feature_names)
phishing.head()

Unnamed: 0,n_Length,n_dots,n_hypens,n_underline,n_slash,n_questionmark,n_equal,n_AtSign,n_and,n_exclamation,n_space,n_tilde,n_comma,n_plus,n_asterisk,n_hastag,n_dollar,n_percent,n_redirection,label
0,64,2,0,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,123,2,2,0,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,38,2,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,45,1,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,316,3,5,4,7,1,5,0,4,0,0,0,0,0,0,0,0,8,0,1


In [12]:
# Write the extracted phishing-URL features to a CSV file, without the index column.
phishing.to_csv(path_or_buf='phishing_data.csv', index=False)

In [13]:
# Reload both saved feature tables from disk: legitimate first, then phishing.
d1, d2 = (
    pd.read_csv(csv_path)
    for csv_path in ("Final_legitimate_data.csv", "phishing_data.csv")
)

In [14]:
# Stack the two tables (d1 rows first, then d2 rows) into a single DataFrame
# with a fresh 0..n-1 index, then display its (rows, columns) shape.
urldata = pd.concat([d1, d2], ignore_index=True)
urldata.shape

(12000, 20)

In [15]:
# Write the combined table to a CSV file, without the index column.
urldata.to_csv(path_or_buf='combine_test_data.csv', index=False)