In [1]:
import os
import numpy as np
import pandas as pd
from urllib import parse 
import re
# URL validation pattern, applied with ``regex.match`` (so it is anchored at the
# start of the string and, through the trailing ``$``, at the end as well).
# Shape: <protocol>://<host>[:<port>][<path>[?<params>][#<tag>]]
#   protocol    - one or more word characters
#   host        - word characters, dots and hyphens
#   port        - optional run of digits after ':'
#   path        - optional, starts with '/', stops at '?', '#' or a newline
#   params      - optional '&'-separated key[=value] items (first_param / param_more)
#   tag         - optional fragment text after '#'
# The params and tag groups sit inside the optional path group, so a query or
# fragment is only accepted when a path is present.
regex = re.compile(r'(?P<protocol>[\w]+)\:\/\/(?P<host>[\w\.\-]+)(\:(?P<port>\d+))?((?P<path>\/[^\?\#\n]*)(\?(?P<params>(?P<first_param>[^\#\?\&\=\n]+(=[^\#\?\&\=\n]*)*)(\&(?P<param_more>[^\#\?\&\=\n]+(=[^\#\?\&\=\n]*)*))*))?(\#(?P<tag>[^\#\?\n]+)?)?)?$')


# get all data used for training 
class load_data:
    """Load the labelled URL lists and keep only URLs accepted by ``regex``.

    On construction every file in the dataset directory whose lower-cased name
    contains a class name is read, its URLs are cleaned, and URLs that do not
    match the module-level ``regex`` are discarded.  The surviving rows of all
    classes are stored in ``self.all_data`` (columns ``url`` and ``class``).
    The class with index 0 ('benign') is skipped.
    """

    def __init__(self):
        dir_data = './RawData/FinalDataset/URL' #UNB dataset
        class_name = {'benign': 0, 'defacement': 1, 'malware': 2, 'phishing':3, 'spam':4}
        # os.listdir returns entries in arbitrary order; sort so that the row
        # order of the resulting frame is reproducible between runs.
        file_list = sorted(os.listdir(dir_data))
        data = []
        for cname, idx in class_name.items():
            if idx == 0:
                continue
            for file in file_list:
                if cname in file.lower():
                    file_path = os.path.join(dir_data, file)
                    print('==== Class:{:>8s}'.format(cname))
                    df_data = self.prepare_data(file_path, idx)
                    data.append(self.get_regular_exp_url(df_data, regex))
        all_data = pd.concat(data)
        # 'if_match' is only a filtering aid; it is not part of the dataset.
        self.all_data = all_data.drop(['if_match'], axis=1)

    # trim data
    def prepare_data(self, dir_file, class_name):
        """Read a header-less CSV and return its first column as labelled URLs.

        Parameters
        ----------
        dir_file : path or file-like object passed straight to ``pd.read_csv``.
        class_name : label stored in the ``class`` column of every row.

        Returns
        -------
        DataFrame with columns ``url`` and ``class``.  Single quotes are
        removed from each URL; URLs that are empty afterwards, and cells that
        are not strings at all, are represented as NaN.
        """
        col_name = ['url', 'class']
        first_column = pd.read_csv(dir_file, header=None).iloc[:, 0]
        values = []
        for url in first_column:
            if isinstance(url, str):
                # eliminate the unnecessary character
                url = url.replace("'", "")
            else:
                # A blank or non-text cell cannot be a URL; mark it as missing
                # instead of failing on the string operations.
                url = np.nan
            values.append([url, class_name])
        data = pd.DataFrame(values, columns=col_name)
        data = data.replace("", np.nan)
        return data

    # extract URLs only following regular expression
    def get_regular_exp_url(self, df_url, regex):
        """Drop the rows of ``df_url`` whose URL does not match ``regex``.

        Returns a copy of ``df_url`` with an extra ``if_match`` column holding
        the match object of every kept row.  Rows with a missing value in any
        column (including a failed match) are removed, and the number of
        failed matches is printed.
        """
        if_match = df_url['url'].apply(lambda x: self._check_regex(regex, x))
        data = df_url.assign(if_match=if_match)
        print('Delete {} of nan data from {} of raw data'.format(
            data['if_match'].isna().sum(), data.shape[0]))
        data = data.dropna(axis=0)
        return data

    def _check_regex(self, regex, string):
        """Return ``regex.match(string)``, or None when ``string`` is not text."""
        if not isinstance(string, str):
            # Missing URLs are stored as NaN (a float); re would raise TypeError.
            return None
        return regex.match(string)



# original data: run the loader once and keep the filtered frame for later cells
load = load_data()
data_org = load.all_data
print('final data set: {}'.format(len(data_org)))

# number of distinct URLs among the loaded rows
print(len(data_org['url'].unique()))
data_org.head(10)

==== Class:defacement
Delete 114 of nan data from 96457 of raw data
==== Class: malware
Delete 252 of nan data from 11566 of raw data
==== Class:phishing
Delete 782 of nan data from 9965 of raw data
==== Class:    spam
Delete 1436 of nan data from 12000 of raw data
final data set: 127404
117314


Unnamed: 0,url,class
0,http://www.sinduscongoias.com.br/index.html,1
1,http://www.sinduscongoias.com.br/index.php/ins...,1
2,http://www.sinduscongoias.com.br/index.php/ins...,1
3,http://www.sinduscongoias.com.br/index.php/ins...,1
4,http://www.sinduscongoias.com.br/index.php/ins...,1
5,http://www.sinduscongoias.com.br/index.php/ins...,1
6,http://www.sinduscongoias.com.br/index.php/ins...,1
7,http://www.sinduscongoias.com.br/index.php/ins...,1
8,http://www.sinduscongoias.com.br/index.php/ins...,1
9,http://www.sinduscongoias.com.br/index.php/ins...,1


In [8]:
import math
import random

class data_augmentation:
    """Generate synthetic URLs per class by recombining parts of real URLs.

    For every class label in ``data_org`` the URLs are split into components,
    the distinct values of each component are pooled into a "bag", and
    ``n_add`` new URLs are assembled by drawing components at random.  A
    component is included with a probability equal to the fraction of the
    class's URLs that actually have it.  The generated rows of all classes are
    stored in ``self.data_aug`` (columns ``url`` and ``class``); duplicates
    within a class are removed, so fewer than ``n_add`` rows may remain.
    """

    # String sentinel returned by _pick_one when a component is left out.
    _MISSING = 'None'

    def __init__(self, data_org, n_add=1):
        class_labels = data_org['class'].unique()
        data_aug = list()
        for label in class_labels:
            self.data_current = data_org[data_org['class'] == label]
            print('\n==== class {}: {}'.format(label, self.data_current.shape[0]))
            data_frac = self.get_fraction(self.data_current)
            bag, freq = self.get_bag(data_frac)
            data_aug.append(self.augment_data(bag, freq, n_add, label))
        if data_aug:
            self.data_aug = pd.concat(data_aug)
        else:
            # pd.concat refuses an empty list; no classes means no new rows.
            self.data_aug = pd.DataFrame(columns=['url', 'class'])

    def get_fraction(self, df_data):
        """Split every URL of ``df_data`` into its components.

        Returns a DataFrame with columns ``scheme``, ``netloc``, ``path``,
        ``query``, ``fragment`` and ``class``.  Cells equal to "" or "/" are
        replaced by NaN so that they count as "component absent".
        """
        col_name = ['scheme', 'netloc', 'path', 'query', 'fragment', 'class']
        values = list()
        for url, url_class in zip(df_data['url'], df_data['class']):
            if '://' not in url:
                # Without a leading '//' urlsplit would treat the host as path.
                url = '//' + url
            url = url.replace("'", "")
            url_parsed = parse.urlsplit(url)
            values.append([url_parsed.scheme, url_parsed.netloc,
                           url_parsed.path, url_parsed.query,
                           url_parsed.fragment, url_class])
        data = pd.DataFrame(values, columns=col_name)
        data = data.replace("", np.nan)
        data = data.replace("/", np.nan)
        return data

    def get_bag(self, data_frac):
        """Collect the distinct component values and their presence rates.

        Returns ``(bag, freq)``.  ``bag`` maps ``scheme``, ``netloc``, ``path``,
        ``query_key``, ``query_val`` and ``fragment`` to arrays of distinct
        values; missing (NaN) entries are excluded so they can never be drawn.
        ``freq`` maps every column of ``data_frac`` to the fraction of rows in
        which that column is present; each fraction is also printed.
        """
        bag = dict()
        bag['scheme'] = data_frac['scheme'].dropna().unique()
        bag['netloc'] = data_frac['netloc'].dropna().unique()
        bag['path'] = data_frac['path'].dropna().unique()
        query_keys, query_vals = self._get_key_val(data_frac['query'])
        bag['query_key'] = query_keys
        bag['query_val'] = query_vals
        bag['fragment'] = data_frac['fragment'].dropna().unique()

        n_rows = data_frac.shape[0]
        freq = dict()
        for key in data_frac.keys():
            freq_frac = data_frac[key].notna().sum() / n_rows if n_rows else 0.0
            print(key, freq_frac)
            freq[key] = freq_frac
        return bag, freq

    def augment_data(self, bag, freq, n_add, label, n_query=3):
        """Assemble ``n_add`` random URLs from ``bag`` and label them ``label``.

        Parameters
        ----------
        bag, freq : the pair returned by ``get_bag``.
        n_add : number of URLs to draw (before duplicate removal).
        label : value written to the ``class`` column.
        n_query : number of attempts to add a query parameter to each URL.

        Returns
        -------
        DataFrame with columns ``url`` and ``class`` without duplicate URLs.
        """
        data_aug = list()
        for _ in range(n_add):
            scheme = self._pick_one(bag['scheme'], freq['scheme'])
            netloc = self._pick_one(bag['netloc'], freq['netloc'])
            path = self._pick_one(bag['path'], freq['path'])
            query = dict()
            for _attempt in range(n_query):
                query_key = self._pick_one(bag['query_key'], freq['query'])
                if not self._is_missing(query_key):
                    # A value is attached with probability 0.99; otherwise the
                    # key is emitted with an empty value.
                    query_val = self._pick_one(bag['query_val'], 0.99)
                    query[str(query_key)] = query_val
            fragment = self._pick_one(bag['fragment'], freq['fragment'])

            # Every part is percent-encoded, and urlunsplit leaves out the
            # '?' / '#' separators of empty parts, so no cleanup is needed.
            url_new = parse.urlunsplit((self._quote(scheme),
                                        self._quote(netloc),
                                        self._quote(path),
                                        self._quote_query(query),
                                        self._quote(fragment)))
            data_aug.append([url_new, label])
        data_aug = pd.DataFrame(data_aug, columns=['url', 'class'])
        data_aug = data_aug.drop_duplicates(subset=['url'])
        return data_aug

    def _get_key_val(self, df_target):
        """Return the sorted distinct query keys and values found in ``df_target``."""
        keys = list()
        vals = list()
        for query in df_target.to_numpy():
            # Missing queries are NaN (a float); only text can be parsed.
            if isinstance(query, str):
                for (key, val) in parse.parse_qsl(query):
                    keys.append(key)
                    vals.append(val)
        keys = np.unique(keys, return_counts=False)
        vals = np.unique(vals, return_counts=False)
        return keys, vals

    def _pick_one(self, component, frequency):
        """Return a random element of ``component`` with probability ``frequency``.

        Returns the string 'None' otherwise, and also when ``component`` is
        empty (there is nothing to draw from).
        """
        if len(component) == 0:
            return self._MISSING
        if random.random() <= frequency:
            return component[random.randrange(0, len(component))]
        return self._MISSING

    def _is_missing(self, value):
        """Return True when ``value`` is the 'None' sentinel, ``None`` or NaN."""
        if value is None:
            return True
        if isinstance(value, float) and value != value:
            return True
        return str(value) == self._MISSING

    def _quote_query(self, d):
        """URL-encode the mapping ``d``; missing values become empty strings."""
        query = dict()
        for key in d.keys():
            value = d[key]
            query[key] = '' if self._is_missing(value) else str(value)
        return parse.urlencode(query, doseq=True)

    def _quote(self, string):
        """Percent-encode ``string``; a missing component becomes ''."""
        if self._is_missing(string):
            return ''
        return parse.quote(str(string))


# augmented data: draw n_add synthetic URLs for every class in data_org
n_add = 10000 # for each class
augment_data = data_augmentation(data_org, n_add=n_add)
data_aug = augment_data.data_aug
print('\n==== augmented data:', len(data_aug))
data_aug.head(10)



==== class 1: 96343
scheme 1.0
netloc 1.0
path 1.0
query 0.569299274467268
fragment 1.0379581287690854e-05
class 1.0

==== class 2: 11314
scheme 1.0
netloc 1.0
path 0.9980555064521831
query 0.3136821636910023
fragment 0.0
class 1.0

==== class 3: 9183
scheme 1.0
netloc 1.0
path 0.9603615376238702
query 0.1195687683763476
fragment 0.002069040618534248
class 1.0

==== class 4: 10564
scheme 1.0
netloc 1.0
path 0.9995266944339265
query 0.48182506626277927
fragment 0.0
class 1.0

==== augmented data: 39997


Unnamed: 0,url,class
0,http://www.prosuzdal.ru/Latest/store-locations...,1
1,http://josedeitx.com/component/content/article...,1
2,http://www.caseaffittipuglia.com/it/fotogaller...,1
3,http://onlineigri.net/all-games/armored-fighte...,1
4,http://www.leine-net.de/where-we-work/zambia/1...,1
5,http://www.centromutuitoscana.it/nl/scandia/11...,1
6,http://tanzaniawerkgroeptilburg.nl/linkpartner...,1
7,http://www.heide-marys-windelgesindel.de/templ...,1
8,http://helptheorphans.net/scopri-ciardo.html?v...,1
9,http://www.ondrejov.cz/services/freight/virusp...,1


In [16]:
# split data: train / valid / test
import os

def split_train_test(data, ratio):
    """Shuffle the rows of ``data`` and cut them into train / valid / test.

    Parameters
    ----------
    data : DataFrame; converted with ``to_numpy()`` before splitting.
    ratio : sequence whose first two entries are the train and validation
        fractions.  Each size is ``ceil(n_rows * fraction)``.  The test part is
        whatever remains, so the third entry is not read.

    Returns
    -------
    (train, valid, test) : NumPy arrays with the same columns as ``data``.
        An empty part has zero rows but keeps the column count.

    The shuffle uses the global ``random`` module, so seed it for
    reproducible splits.
    """
    data = data.to_numpy()
    n_rows = data.shape[0]
    n_train = math.ceil(n_rows * ratio[0])
    n_valid = math.ceil(n_rows * ratio[1])

    indx_shuffle = list(range(n_rows))
    random.shuffle(indx_shuffle)
    # An explicit integer dtype keeps the fancy index valid for zero rows too.
    shuffled = data[np.array(indx_shuffle, dtype=np.intp)]

    train = shuffled[:n_train]
    # Slicing yields a correctly shaped empty array when n_valid is 0, so no
    # special case (and no hard-coded column count) is required.
    valid = shuffled[n_train:n_train + n_valid]
    test = shuffled[n_train + n_valid:]
    return train, valid, test

# Fractions handed to split_train_test, in train / valid / test order.
ratio = [0.99, 0.0, 0.01]
fname = 'mali'
dir_save = './train_data'
if not os.path.isdir(dir_save):
    os.mkdir(dir_save)

# Pass 1: all loaded classes pooled together, first the original URLs only and
# then the original URLs plus the augmented ones.
for is_augment in [False, True]:
    if not is_augment:
        all_data = data_org
    else:
        fname += '_augment'
        all_data = pd.concat([data_org, data_aug])

    all_data = all_data.drop_duplicates(subset=['url'])
    train, valid, test = split_train_test(all_data, ratio)

    df_train = pd.DataFrame(train, columns=['url', 'label'])
    df_valid = pd.DataFrame(valid, columns=['url', 'label'])
    df_test = pd.DataFrame(test, columns=['url', 'label'])

    # Only the URL column is written: one URL per line, no header, no index.
    for frame, pattern in ((df_train, '{}.txt'),
                           (df_valid, '{}_valid.txt'),
                           (df_test, '{}_test.txt')):
        frame.to_csv(os.path.join(dir_save, pattern.format(fname)),
                     columns=['url'], header=None, index=False)

    print('==== dataset {}'.format(fname))
    print('train: {}'.format(len(df_train)))
    print('valid: {}'.format(len(df_valid)))
    print('test : {}'.format(len(df_test)))

# Pass 2: the same export, but one set of files per class name.
class_name = {'benign': 0, 'defacement': 1, 'malware': 2, 'phishing':3, 'spam':4}
for cname, label in class_name.items():
    fname = cname
    data_org_current = data_org[data_org['class'] == label]
    data_aug_current = data_aug[data_aug['class'] == label]
    for is_augment in [False, True]:
        if not is_augment:
            all_data = data_org_current
        else:
            fname += '_augment'
            all_data = pd.concat([data_org_current, data_aug_current])

        all_data = all_data.drop_duplicates(subset=['url'])
        train, valid, test = split_train_test(all_data, ratio)

        df_train = pd.DataFrame(train, columns=['url', 'label'])
        df_valid = pd.DataFrame(valid, columns=['url', 'label'])
        df_test = pd.DataFrame(test, columns=['url', 'label'])

        for frame, pattern in ((df_train, '{}.txt'),
                               (df_valid, '{}_valid.txt'),
                               (df_test, '{}_test.txt')):
            frame.to_csv(os.path.join(dir_save, pattern.format(fname)),
                         columns=['url'], header=None, index=False)

        print('==== dataset {}'.format(fname))
        print('train: {}'.format(len(df_train)))
        print('valid: {}'.format(len(df_valid)))
        print('test : {}'.format(len(df_test)))


==== dataset mali
train: 116141
valid: 0
test : 1173
==== dataset mali_augment
train: 155727
valid: 0
test : 1572
==== dataset benign
train: 0
valid: 0
test : 0
==== dataset benign_augment
train: 0
valid: 0
test : 0
==== dataset defacement
train: 94242
valid: 0
test : 951
==== dataset defacement_augment
train: 104142
valid: 0
test : 1051
==== dataset malware
train: 2438
valid: 0
test : 24
==== dataset malware_augment
train: 12333
valid: 0
test : 124
==== dataset phishing
train: 9083
valid: 0
test : 91
==== dataset phishing_augment
train: 18981
valid: 0
test : 191
==== dataset spam
train: 10381
valid: 0
test : 104
==== dataset spam_augment
train: 20273
valid: 0
test : 204
