In [59]:
import pandas as pd
from urllib.parse import urlparse
import os.path
import re

# Importing Data

In [60]:
# Read the raw URL dataset from disk, then preview its first ten rows.
df = pd.read_csv(filepath_or_buffer="urldata_raw.csv")
df.head(n=10)

Unnamed: 0.1,Unnamed: 0,url,label,result
0,0,https://www.google.com,benign,0
1,1,https://www.youtube.com,benign,0
2,2,https://www.facebook.com,benign,0
3,3,https://www.baidu.com,benign,0
4,4,https://www.wikipedia.org,benign,0
5,5,https://www.reddit.com,benign,0
6,6,https://www.yahoo.com,benign,0
7,7,https://www.google.co.in,benign,0
8,8,https://www.qq.com,benign,0
9,9,https://www.amazon.com,benign,0


In [61]:
# Remove the "Unnamed: 0" column (presumably an index left over from an
# earlier CSV export -- confirm against the data source).
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell idempotent: re-running it once the column is already gone
# no longer raises KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [62]:
# Show the frame after the column drop (pandas elides the middle rows of a long frame).
df

Unnamed: 0,url,label,result
0,https://www.google.com,benign,0
1,https://www.youtube.com,benign,0
2,https://www.facebook.com,benign,0
3,https://www.baidu.com,benign,0
4,https://www.wikipedia.org,benign,0
...,...,...,...
450171,http://ecct-it.com/docmmmnn/aptgd/index.php,malicious,1
450172,http://faboleena.com/js/infortis/jquery/plugin...,malicious,1
450173,http://faboleena.com/js/infortis/jquery/plugin...,malicious,1
450174,http://atualizapj.com/,malicious,1


In [63]:
# Class balance check: number of rows for each distinct label value.
df['label'].value_counts()

label
benign       345738
malicious    104438
Name: count, dtype: int64

# Feature Extraction

## 1. Lengths
- URL length
- Hostname length
- Path length
- First Directory length

## 2. Feature count
- '-' 
- '@' 
- '?'
- '%' 
- '.' 
- '=' 
- 'http' 
- 'https' 
- 'www' 
- Digits
- Letters 
- Directory count
- Case change count

## 3. Binary features
- Usage of IP in URL
- Usage of URL shortening


In [64]:
# Length features

# Character count of the complete URL string
df['url_length'] = df['url'].apply(len)

# Character count of the netloc (network location) component
df['hostname_length'] = df['url'].apply(lambda address: len(urlparse(address).netloc))

# Character count of the path component
df['path_length'] = df['url'].apply(lambda address: len(urlparse(address).path))

# First directory length
def first_directory_length(url: str) -> int:
    """Return the length of the first directory segment of a URL's path.

    The path component is split on '/', and the element at index 1 is
    measured. For an absolute path such as '/docs/index.php' that element
    is 'docs', the text between the first and second slash.

    Args:
        url: URL string to inspect.

    Returns:
        Length of that segment, or 0 when the path contains no '/' at all
        (for example, an empty path).
    """
    segments = urlparse(url).path.split('/')
    # Explicit bounds check instead of a bare `except:`, which would also
    # have silently swallowed unrelated errors (even KeyboardInterrupt).
    return len(segments[1]) if len(segments) > 1 else 0
# The helper takes a single URL argument, so it is handed to apply directly.
df['first_directory_length'] = df['url'].apply(first_directory_length)

In [72]:
# Count features

# Occurrences of each special character in the full URL string.
# `symbol=symbol` binds the current character at lambda-definition time.
for symbol in ('-', '@', '?', '%', '.', '='):
    df[f'count:{symbol}'] = df['url'].apply(lambda url, symbol=symbol: url.count(symbol))

# Number of characters for which str.isnumeric() is true
df['count:digits'] = df['url'].apply(lambda url: sum(1 for ch in url if ch.isnumeric()))

# Number of characters for which str.isalpha() is true
df['count:letters'] = df['url'].apply(lambda url: sum(1 for ch in url if ch.isalpha()))

# Number of '/' separators in the path component
df['count:directories'] = df['url'].apply(lambda url: urlparse(url).path.count('/'))

# Case Change Count
def count_case_change(input_string):
    """Count upper/lower case transitions between successive letters.

    Non-alphabetic characters are skipped and do not reset the tracked case,
    so 'a1B' contains one change. The first letter of the string only
    establishes the starting case and is never counted as a change itself
    (previously a leading uppercase letter was counted, e.g. 'HTTP' gave 1).

    Args:
        input_string: Text to scan, e.g. a URL.

    Returns:
        Number of times a letter's case differs from the previous letter's.
    """
    switch_count = 0
    prev_is_upper = None  # None until the first letter has been seen
    for char in input_string:
        if not char.isalpha():
            continue
        is_upper = char.isupper()
        if prev_is_upper is not None and is_upper != prev_is_upper:
            switch_count += 1
        prev_is_upper = is_upper
    return switch_count
# The helper takes a single URL argument, so it is handed to apply directly.
df['count:casechanges'] = df['url'].apply(count_case_change)

In [73]:
# Show the frame with the engineered length and count columns appended.
df

Unnamed: 0,url,label,result,url_length,hostname_length,path_length,first_directory_length,count:-,count:@,count:?,count:%,count:.,count:=,count:digits,count:letters,count:directories,count:casechanges
0,https://www.google.com,benign,0,22,14,0,0,0,0,0,0,2,0,0,17,0,0
1,https://www.youtube.com,benign,0,23,15,0,0,0,0,0,0,2,0,0,18,0,0
2,https://www.facebook.com,benign,0,24,16,0,0,0,0,0,0,2,0,0,19,0,0
3,https://www.baidu.com,benign,0,21,13,0,0,0,0,0,0,2,0,0,16,0,0
4,https://www.wikipedia.org,benign,0,25,17,0,0,0,0,0,0,2,0,0,20,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450171,http://ecct-it.com/docmmmnn/aptgd/index.php,malicious,1,43,11,25,8,1,0,0,0,2,0,0,34,3,0
450172,http://faboleena.com/js/infortis/jquery/plugin...,malicious,1,159,13,139,2,0,0,0,0,2,1,21,118,12,10
450173,http://faboleena.com/js/infortis/jquery/plugin...,malicious,1,147,13,127,2,0,0,0,0,1,1,20,109,12,10
450174,http://atualizapj.com/,malicious,1,22,14,1,0,0,0,0,0,1,0,0,17,1,0
