In [55]:
import os
import numpy as np
import pandas as pd
from urllib import parse 
import re
regex = re.compile(r'(?P<protocol>[\w]+)\:\/\/(?P<host>[\w\.\-]+)(\:(?P<port>\d+))?((?P<path>\/[^\?\#\n]*)(\?(?P<params>(?P<first_param>[^\#\?\&\=\n]+(=[^\#\?\&\=\n]*)*)(\&(?P<param_more>[^\#\?\&\=\n]+(=[^\#\?\&\=\n]*)*))*))?(\#(?P<tag>[^\#\?\n]+)?)?)?$')


# get all data used for training 
class load_data:
    """Load labelled URLs from CSV files and keep those matching a URL regex.

    Every file in ``dir_data`` whose lower-cased name contains one of the
    entries of ``class_name`` is read; the position of the entry inside
    ``class_name`` becomes the integer stored in the ``class`` column.

    Args:
        dir_data: Directory holding the per-class CSV files.
        class_name: Substrings used to select files. The index of each entry
            is used as its label, so reordering this sequence changes labels.
        pattern: Compiled regular expression a URL must match to be kept.
            ``None`` uses the module-level ``regex``.

    Raises:
        ValueError: If no file in ``dir_data`` matches any class name.
    """

    def __init__(self, dir_data='./RawData/FinalDataset/URL',
                 class_name=('benign',), pattern=None):
        # Resolved at call time so a plain ``load_data()`` keeps using the
        # module-level pattern exactly as before.
        if pattern is None:
            pattern = regex
        file_list = os.listdir(dir_data)
        data = []
        for idx, cname in enumerate(class_name):
            for file in file_list:
                if cname in file.lower():
                    file_path = os.path.join(dir_data, file)
                    print('==== Class:{:>8s}'.format(cname))
                    df_data = self.prepare_data(file_path, idx)
                    data.append(self.get_regular_exp_url(df_data, pattern))
        if not data:
            # pd.concat([]) would also raise ValueError, but without saying why.
            raise ValueError(
                'no file in {!r} matches any of the classes {!r}'.format(
                    dir_data, list(class_name)))
        all_data = pd.concat(data)
        # ``if_match`` is only a filtering aid; callers get url + class.
        self.all_data = all_data.drop(['if_match'], axis=1)

    # trim data
    def prepare_data(self, dir_file, class_name):
        """Read the first CSV column as URLs and attach ``class_name`` to each.

        Args:
            dir_file: Path of a header-less CSV file.
            class_name: Value written to the ``class`` column of every row.

        Returns:
            DataFrame with columns ``url`` and ``class``; empty strings are
            replaced by NaN.
        """
        col_name = ['url', 'class']
        urls = pd.read_csv(dir_file, header=None).to_numpy()
        values = []
        for row in urls:
            url = row[0]
            # Eliminate the unnecessary quote character. Cells that are not
            # text (e.g. NaN) are passed through and rejected by the regex
            # check later instead of raising here.
            if isinstance(url, str):
                url = url.replace("'", "")
            values.append([url, class_name])
        data = pd.DataFrame(values, columns=col_name)
        data = data.replace("", np.nan)
        return data

    # extract URLs only following regular expression
    def get_regular_exp_url(self, df_url, regex):
        """Drop every row whose URL does not match ``regex``.

        Args:
            df_url: DataFrame with at least a ``url`` column.
            regex: Compiled pattern applied with ``match``.

        Returns:
            ``df_url`` plus an ``if_match`` column (the match object), with
            all rows containing a missing value removed.
        """
        if_match = df_url['url'].apply(lambda x: self._check_regex(regex, x))
        if_match = pd.DataFrame({'if_match': if_match})
        data = pd.concat([df_url, if_match], axis=1)
        print('Delete {} of nan data from {} of raw data'.format(
            data['if_match'].isna().sum(), data.shape[0]))
        data = data.dropna(axis=0)
        return data

    def _check_regex(self, regex, string):
        """Return the match object for ``string``, or None when there is none.

        Non-text input (such as the NaN that ``prepare_data`` substitutes for
        empty URLs) is treated as "no match" rather than raising TypeError.
        """
        if not isinstance(string, (str, bytes)):
            return None
        return regex.match(string)



# Original (non-augmented) data: build it once and expose it as `data_org`.
load = load_data()
data_org = load.all_data

# Number of rows that survived the regex filter.
print('final data set: {}'.format(len(data_org)))
data_org.head(5)

# Labels present in the data, then a peek at the rows labelled 0.
print(data_org['class'].unique())
data_org.loc[data_org['class'] == 0].head(5)

# Count of distinct URLs (the value of the cell's last expression).
len(data_org['url'].unique())

==== Class:  benign
Delete 214 of nan data from 35378 of raw data
final data set: 35164
[0]


35164

In [71]:
import math
import random

class data_augmentation:
    """Create synthetic URLs by recombining components of existing ones.

    For every class label the source URLs are split with ``urlparse``; the
    distinct values of each component form a "bag", and the share of URLs in
    which a component is present is its frequency. New URLs are assembled by
    drawing each component from its bag with that frequency.

    Args:
        data_org: DataFrame with ``url`` and ``class`` columns.
        n_add: Number of URLs to draw per class. Duplicates are removed, so
            fewer rows may be returned.

    Attributes (set by ``__init__``):
        data_aug: DataFrame (``url``, ``class``) with the generated URLs of
            all classes.
        data_current: Rows of ``data_org`` for the class processed last.
    """

    # String returned by ``_pick_one`` when a component is left out.
    _MISSING = 'None'
    # Characters that must stay literal in a network location so that
    # ``host:port``, ``user@host`` and ``[ipv6]`` are not percent-encoded.
    _NETLOC_SAFE = ':@[]'

    def __init__(self, data_org, n_add=1):
        class_labels = data_org['class'].unique()
        data_aug = list()
        for label in class_labels:
            self.data_current = data_org[data_org['class'] == label]
            data_frac = self.get_fraction(self.data_current)
            bag, freq = self.get_bag(data_frac)
            data_aug.append(self.augment_data(bag, freq, n_add, label))
        if data_aug:
            self.data_aug = pd.concat(data_aug)
        else:
            # No rows at all: pd.concat([]) would raise, so return an empty frame.
            self.data_aug = pd.DataFrame(columns=['url', 'class'])

    def get_fraction(self, df_data):
        """Split every URL of ``df_data`` into its ``urlparse`` components.

        Returns:
            DataFrame with columns scheme, netloc, path, params, query,
            fragment and class. Empty components and a bare ``/`` are stored
            as NaN.
        """
        col_name = ['scheme', 'netloc', 'path', 'params', 'query', 'fragment', 'class']
        values = list()
        for url, url_class in zip(df_data['url'], df_data['class']):
            # Without a scheme separator urlparse would put the host into
            # ``path``; the leading '//' makes it land in ``netloc``.
            if '://' not in url:
                url = '//' + url
            url = url.replace("'", "")
            url_parsed = parse.urlparse(url)
            values.append([url_parsed.scheme, url_parsed.netloc,
                           url_parsed.path, url_parsed.params, url_parsed.query,
                           url_parsed.fragment, url_class])
        data = pd.DataFrame(values, columns=col_name)
        data = data.replace(["", "/"], np.nan)
        return data

    def get_bag(self, data_frac):
        """Collect the distinct component values and their presence rates.

        Args:
            data_frac: Output of ``get_fraction``.

        Returns:
            ``(bag, freq)``. ``bag`` maps scheme, netloc, path, param_key,
            param_val, query_key, query_val and fragment to arrays of
            distinct values. ``freq`` maps each column of ``data_frac`` to
            the fraction of rows in which it is not NaN. Each frequency is
            also printed.
        """
        bag = dict()
        # NaN marks "component absent"; it must not be drawable as a value,
        # otherwise the text 'nan' ends up in generated URLs.
        bag['scheme'] = data_frac['scheme'].dropna().unique()
        bag['netloc'] = data_frac['netloc'].dropna().unique()
        bag['path'] = data_frac['path'].dropna().unique()
        param_keys, param_vals = self._get_key_val(data_frac['params'])
        query_keys, query_vals = self._get_key_val(data_frac['query'])
        bag['param_key'] = param_keys
        bag['param_val'] = param_vals
        bag['query_key'] = query_keys
        bag['query_val'] = query_vals
        bag['fragment'] = data_frac['fragment'].dropna().unique()

        # get_fraction emits one row per input URL, so the row count of
        # data_frac is the denominator; no instance state is needed.
        n_rows = len(data_frac)
        freq = dict()
        for key in data_frac.keys():
            freq_frac = data_frac[key].notna().sum() / n_rows if n_rows else 0.0
            print(key, freq_frac)
            freq[key] = freq_frac
        return bag, freq

    def augment_data(self, bag, freq, n_add, label, n_param=3, n_query=3):
        """Assemble ``n_add`` random URLs from ``bag`` and label them.

        Args:
            bag: Component values as returned by ``get_bag``.
            freq: Presence rates as returned by ``get_bag``.
            n_add: Number of URLs to draw.
            label: Value stored in the ``class`` column.
            n_param: Draw attempts for ``;params`` key/value pairs per URL.
            n_query: Draw attempts for ``?query`` key/value pairs per URL.

        Returns:
            DataFrame with columns ``url`` and ``class`` without duplicate
            URLs. The duplicated rows are printed before being dropped.
        """
        data_aug = list()
        for _ in range(n_add):
            scheme = self._pick_one(bag['scheme'], freq['scheme'])
            netloc = self._pick_one(bag['netloc'], freq['netloc'])
            path = self._pick_one(bag['path'], freq['path'])
            param = self._pick_pairs(bag['param_key'], bag['param_val'],
                                     freq['params'], n_param)
            query = self._pick_pairs(bag['query_key'], bag['query_val'],
                                     freq['query'], n_query)
            fragment = self._pick_one(bag['fragment'], freq['fragment'])

            # urlunparse only writes ';', '?' and '#' for non-empty parts and
            # every component is percent-encoded, so no separator clean-up is
            # needed afterwards.
            url_new = parse.urlunparse(parse.ParseResult(
                scheme=self._quote(scheme),
                netloc=self._quote(netloc, safe=self._NETLOC_SAFE),
                path=self._quote(path),
                params=self._quote_query(param),
                query=self._quote_query(query),
                fragment=self._quote(fragment)))
            data_aug.append([url_new, label])
        data_aug = pd.DataFrame(data_aug, columns=['url', 'class'])
        print(data_aug[data_aug.duplicated(['url'])])
        data_aug = data_aug.drop_duplicates(subset=['url'])
        return data_aug

    def _pick_pairs(self, keys, vals, frequency, n_draw):
        """Draw up to ``n_draw`` key/value pairs; a repeated key is overwritten."""
        picked = dict()
        for _ in range(n_draw):
            key = self._pick_one(keys, frequency)
            if key != self._MISSING:
                # A key almost always gets a value; a missing value is later
                # written as an empty string ('key=').
                picked[key] = self._pick_one(vals, 0.99)
        return picked

    def _get_key_val(self, df_target):
        """Return the sorted distinct keys and values of all query strings."""
        keys = list()
        vals = list()
        for query in df_target.to_numpy():
            # Absent components are NaN (a float); only text can be parsed.
            if isinstance(query, str):
                for (key, val) in parse.parse_qsl(query):
                    keys.append(key)
                    vals.append(val)
        return np.unique(keys), np.unique(vals)

    def _pick_one(self, component, frequency):
        """Return a random element of ``component`` with probability ``frequency``.

        Returns the string ``'None'`` when nothing is drawn or when
        ``component`` is empty.
        """
        # Always consume one random number first so the random stream does
        # not depend on whether the bag is empty.
        drawn = random.random() <= frequency
        if drawn and len(component) > 0:
            return component[random.randrange(0, len(component))]
        return self._MISSING

    def _clean(self, value):
        """Map the missing marker (``'None'``, None, NaN) to '' and others to str."""
        if isinstance(value, float) and value != value:  # NaN is the only float != itself
            return ''
        text = str(value)
        # Compare the whole string: removing 'None' as a substring would
        # damage genuine values such as '/NoneSuch'.
        return '' if text == self._MISSING else text

    def _quote_query(self, d):
        """URL-encode ``d`` as ``key=value&...``; missing values become empty."""
        query = {key: self._clean(val) for key, val in d.items()}
        return parse.urlencode(query, doseq=True)

    def _quote(self, string, safe='/'):
        """Percent-encode one URL component, leaving ``safe`` characters as is."""
        return parse.quote(self._clean(string), safe=safe)


# augmented data
# n_add is the number of URLs drawn per class; augment_data() drops duplicate
# URLs afterwards, so data_aug can hold fewer rows than n_add * classes.
n_add = 20000
augment_data = data_augmentation(data_org, n_add=n_add)
# Keep the generated frame under its own name; the split cell below merges it
# with data_org.
data_aug = augment_data.data_aug
# Last expression of the cell: shows the generated URL column.
data_aug['url']


scheme 1.0
netloc 1.0
path 0.9965589807757934
params 8.531452621999773e-05
query 0.26905357752246617
fragment 0.0
class 1.0
                                                     url  class
15581  https://udn.com/Content/AP-In-The-News/2014/AP...      0


0        http://io9.com/movies/4576769/jennifer_aniston...
1        http://pornsharing.com/magazine/wissen/blog/br...
2        https://bdnews24.com/2015/05/12/crystal-palace...
3        http://infospace.com/shop/g/men/watch/the-watc...
4        http://videomega.tv/jokes/all/page/1/hero/0/ca...
                               ...                        
19995    https://quizlet.com/post/related/id/554cd79e4a...
19996    http://techcrunch.com/torrent/4189296/Sia%2B%2...
19997    http://pikabu.ru/questions/9041152/sql-sum-and...
19998    http://creativemarket.com/la-cool/nhung-tac-ph...
19999    https://abcnews.go.com/2015/04/15/definitely-c...
Name: url, Length: 19999, dtype: object

In [78]:
# split data: train / valid / test
import os

dir_save = './augmented'
# Create the output directory if needed. exist_ok avoids the check-then-create
# gap of isdir() + mkdir() and also creates missing parent directories.
os.makedirs(dir_save, exist_ok=True)

# Original and generated URLs in one frame; a URL that occurs in both (or
# twice in either) is kept once, the first occurrence winning.
all_data = pd.concat([data_org, data_aug])
all_data = all_data.drop_duplicates(subset=['url'])

def split_train_test(data, ratio, seed=None):
    """Shuffle the rows of ``data`` and cut them into train/valid/test arrays.

    Args:
        data: DataFrame (anything with ``to_numpy()`` returning a 2-D array).
        ratio: Sequence whose first two entries are the train and validation
            fractions. Sizes are ``ceil(len(data) * fraction)``; the test set
            receives every remaining row, so a third entry is not read.
        seed: Optional seed for a private random generator, giving the same
            split on every call. ``None`` shuffles with the global ``random``
            state.

    Returns:
        ``(train, valid, test)`` NumPy arrays. All three have the same number
        of columns and dtype as ``data.to_numpy()``, including when a part is
        empty.

    Raises:
        ValueError: If the train or validation fraction is negative.
    """
    if ratio[0] < 0 or ratio[1] < 0:
        raise ValueError('split ratios must be non-negative, got {!r}'.format(ratio))
    data = data.to_numpy()
    n_total = data.shape[0]
    n_train = math.ceil(n_total * ratio[0])
    n_valid = math.ceil(n_total * ratio[1])

    indx_shuffle = list(range(n_total))
    if seed is None:
        random.shuffle(indx_shuffle)
    else:
        random.Random(seed).shuffle(indx_shuffle)
    # Explicit integer dtype so that an empty index list is still a valid index.
    shuffled = data[np.asarray(indx_shuffle, dtype=np.intp)]

    train = shuffled[:n_train, :]
    # A zero-length slice already yields an empty array of the right width,
    # so no special case (with a hard-coded column count) is required.
    valid = shuffled[n_train:n_train + n_valid, :]
    test = shuffled[n_train + n_valid:, :]
    return train, valid, test

def _export_splits(frames, file_stem, title):
    """Write the URL column of each split to a text file and print the sizes.

    Args:
        frames: ``(train, valid, test)`` DataFrames, each with a ``url`` column.
        file_stem: Base file name. The train file is ``<stem>.txt``, the
            others get a ``_valid`` / ``_test`` suffix. Files go to ``dir_save``.
        title: Name shown in the printed summary header.
    """
    part_train, part_valid, part_test = frames
    for frame, suffix in ((part_train, ''), (part_valid, '_valid'), (part_test, '_test')):
        out_path = os.path.join(dir_save, '{}{}.txt'.format(file_stem, suffix))
        # One URL per line: no header row, no index column.
        frame.to_csv(out_path, columns=['url'], header=False, index=False)

    print('==== dataset {}'.format(title))
    print('train: {}'.format(part_train.shape[0]))
    print('valid: {}'.format(part_valid.shape[0]))
    print('test : {}'.format(part_test.shape[0]))


# Full data set: 99 % train, no validation part, remainder as test.
ratio = [0.99, 0.0, 0.01]
train, valid, test = split_train_test(all_data, ratio)

df_train = pd.DataFrame(train, columns=['url', 'label'])
df_valid = pd.DataFrame(valid, columns=['url', 'label'])
df_test = pd.DataFrame(test, columns=['url', 'label'])

_export_splits((df_train, df_valid, df_test), 'benign_augmented', 'original+augmented')


# Reduced data sets: each takes the leading rows of the already shuffled
# splits above, so 'mini' is contained in 'small'.
set_ratio = [0.2, 0.1]
set_name = ['small', 'mini']

# The loop variable is deliberately not called `ratio`, which would overwrite
# the split ratios defined above.
for (sub_ratio, sname) in zip(set_ratio, set_name):
    n_small_train = math.ceil(len(train) * sub_ratio)
    n_small_valid = math.ceil(len(valid) * sub_ratio)
    n_small_test = math.ceil(len(test) * sub_ratio)
    df_train_small = df_train.head(n_small_train)
    df_valid_small = df_valid.head(n_small_valid)
    df_test_small = df_test.head(n_small_test)

    _export_splits((df_train_small, df_valid_small, df_test_small),
                   'benign_augmented_{}'.format(sname), sname)


==== dataset original+augmented
train: 54606
valid: 0
test : 551
==== dataset small
train: 10922
valid: 0
test : 111
==== dataset mini
train: 5461
valid: 0
test : 56
