# URL Data merging
we had many datasets for **malicious** and **benign** urls and we had to merge and preprocess them.
Urls are parsed with url parser and then with class **URLPreprocessor** we exctract certain features from them such as: *length*, *top level domains(tld)*, *ratio* between lengths of different parts, *length* of *domain name*, *parameters*, *query* and also *count* of different parts. **Entropy** is based on formula taken from research paper and is based on probability numbers of letters which were calculated from the Alexa Top
1 million domains. Then every dataset file is merged into one big dataset, broken down into parts for speed and preprocessed.  

In [2]:
import re
import csv
import os
import math
import pandas as pd
import urllib.parse
from __future__ import division
from collections import Counter
from tld import get_tld
def urlparse(address):
    if not re.search(r'^[A-Za-z0-9+.\-]+://', address):
        address = 'http://{0}'.format(address)
    return urllib.parse.urlparse(address)



In [3]:
class URLPreprocessor:
    
    def __init__(self, url,benign):
        url = url.replace(r"'",'')
        self.url = url
        self.benign = benign
        self.parsedurl = urlparse(url)
        self.tld = get_tld(url, fail_silently=True,fix_protocol=True) 
        self.domain = self.parsedurl.netloc
        self.protocol = self.parsedurl.scheme
        self.path = self.parsedurl.path
        self.parameters = self.parsedurl.query
        self.anchor = self.parsedurl.fragment

        self.path_url_ratio = len(self.path) / len(self.url)
        self.params_domain_ratio = len(self.parameters) / len(self.domain)
        self.params_url_ratio = len(self.parameters) / len(self.url)  
        self.domain_url_ratio = len(self.domain) / len(self.url)
        self.number_rate_url = len(re.sub("[^0-9]", "", self.url)) / len(self.url)
        self.path_domain_ratio = len(self.path) / len(self.domain)
        self.avg_path_token_len = sum([len(i) for i in self.path.split("/")]) / len(self.path.split("/"))
        
        #simboloebis counts vamateb imitoro magasac aqvs mnishvneloba turme
        # @, //, ., % da - s counts vamateb da kide vamateb zogadad simboloebis counts
        self.at_count = url.count(r'@')
        self.double_slash_count = url.count(r'//')
        self.hyphen_count = url.count(r'-')
        self.dot_count = url.count(r'.')
        self.percent_count = url.count(r'%')
        self.symbols_count = len(re.findall(r'[:/=?.,;()]+',url))
        #vamateb top level domainebis counts
        try:
            self.tld_count = len(self.tld.split('.'))
        except:
            self.tld_count = 0
        #vamateb parametrebshi/queryshi da domainshi digit counts
        self.param_digit_count = len(re.findall(r'\d', self.parameters))
        self.domain_digit_count = len(re.findall(r'\d', self.domain))
        
        try:
            self.number_rate_after_path = len(re.sub("[^0-9]", "", self.url.split("/")[-1])) / self.url.split("/")[-1]
        except: 
            self.number_rate_after_path = 0
        
        try:
            self.params_path_ratio = len(self.parameters) / len(self.path)
        except:
            self.params_path_ratio = 0

        try:
            self.number_rate_file_name = len(re.sub("[^0-9]", "", self.path)) / len(self.path)
        except: 
            self.number_rate_file_name = 0

        

    
    
    
    def char_cont_rate(self):
        symbol_sequences = re.findall(r'[^a-zA-Z0-9]', self.domain)
        total_len = len(self.domain)
    
        current_char_sequence = 0
        longest_char_sequence = 0

        current_digit_sequence = 0
        longest_digit_sequence = 0
        
        current_sym_sequence = 0
        longest_sym_sequence = 0

        for char in self.domain:
            if char.isalpha():
                current_char_sequence += 1
                current_digit_sequence = 0  # Reset digit sequence count
            elif char.isdigit():
                current_digit_sequence += 1
                current_char_sequence = 0  # Reset character sequence count
            else:
                current_char_sequence = 0
                current_digit_sequence = 0
                
            if char in symbol_sequences:
                current_sym_sequence +=1
            else:
                longest_sym_sequence = max(longest_sym_sequence,current_sym_sequence)
                current_sym_sequence = 0
            
            longest_char_sequence = max(longest_char_sequence, current_char_sequence)
            longest_digit_sequence = max(longest_digit_sequence, current_digit_sequence)
            
        return sum([longest_char_sequence,longest_digit_sequence,longest_sym_sequence])/total_len        
    

    def entropy(self):
        
        base = 2
        data = self.domain
        entropy = 0.0
        #remove dots and tld
        try:
            data = data.replace(self.tld,'')
        except:
            pass
        data = data.replace('.','').replace(":",'').replace(r'%','')
        data = data.lower()
        length = len(data) * 1.0

        if length > 0:
            cnt = Counter(data)
            # These probability numbers were calculated from the Alexa Top
            # 1 million domains as of September 15th, 2017. TLDs and instances
            # of 'www' were removed so 'www.google.com' would be treated as
            # 'google' and 'images.google.com' would be 'images.google'.
            probabilities = {
                '-': 0.013342298553905901,
                #arvici @ simbolos probability da yvelaze dabals mivanicheb imitoro saertod arunda iyos
                '@': 9.04562613824129e-08,
                '_': 9.04562613824129e-06,
                '0': 0.0024875471880163543,
                '1': 0.004884638114650296,
                '2': 0.004373560237839663,
                '3': 0.0021136613076357144,
                '4': 0.001625197496170685,
                '5': 0.0013070929769758662,
                '6': 0.0014880054997406921,
                '7': 0.001471421851820583,
                '8': 0.0012663876593537805,
                '9': 0.0010327089841158806,
                'a': 0.07333590631143488,
                'b': 0.04293204925644953,
                'c': 0.027385633133525503,
                'd': 0.02769469202658208,
                'e': 0.07086192756262588,
                'f': 0.01249653250998034,
                'g': 0.038516276096631406,
                'h': 0.024017645001386995,
                'i': 0.060447396668797414,
                'j': 0.007082725266242929,
                'k': 0.01659570875496002,
                'l': 0.05815885325582237,
                'm': 0.033884915513851865,
                'n': 0.04753175014774523,
                'o': 0.09413783122067709,
                'p': 0.042555148167356144,
                'q': 0.0017231917793349655,
                'r': 0.06460084667060655,
                's': 0.07214640647425614,
                't': 0.06447722311338391,
                'u': 0.034792493336388744,
                'v': 0.011637198026847418,
                'w': 0.013318176884203925,
                'x': 0.003170491961453572,
                'y': 0.016381628936354975,
                'z': 0.004715786426736459
            }

            for char, count in cnt.items():
                observed = count / length
                expected = probabilities[char]
                entropy += observed * math.log((observed / expected), base)
        return entropy

        

    def process(self):
        entropy = self.entropy()
        char_cont_rate = self.char_cont_rate()
        return {
            "url":self.url,
            "benign":self.benign,
            "number_rate_file_name" : self.number_rate_file_name,
            "domain_url_ratio" : self.domain_url_ratio,
            "number_rate_url" : self.number_rate_url,
            "path_domain_ratio" : self.path_domain_ratio,
            "number_rate_after_path" : self.number_rate_after_path,
            "avg_path_token_len" : self.avg_path_token_len,
            "params_path_ratio" : self.params_path_ratio,
            "params_url_ratio" : self.params_url_ratio,
            "params_domain_ratio" : self.params_domain_ratio,
            "path_url_ratio" : self.path_url_ratio,
            "entropy" : entropy,
            "char_cont_rate" : char_cont_rate,
            "at_count":self.at_count,
            "double_slash_count":self.double_slash_count,
            "hyphen_count":self.hyphen_count,
            "dot_count":self.dot_count,
            "symbols_count":self.symbols_count,
            "tld_count":self.tld_count,
            "domain_digit_count":self.domain_digit_count,
            "param_digit_count":self.param_digit_count,
            "percent_count":self.percent_count
            
            }

In [4]:
example_url = "http://1g@n.com/Recipe/Pesto-Chicken/Detail.aspx?evt19=1&referringHubId=15041"
url_obj = URLPreprocessor(example_url,1)
column_list = url_obj.process().keys()
url_obj.process()

{'url': 'http://1g@n.com/Recipe/Pesto-Chicken/Detail.aspx?evt19=1&referringHubId=15041',
 'benign': 1,
 'number_rate_file_name': 0.0,
 'domain_url_ratio': 0.1038961038961039,
 'number_rate_url': 0.11688311688311688,
 'path_domain_ratio': 4.125,
 'number_rate_after_path': 0,
 'avg_path_token_len': 7.5,
 'params_path_ratio': 0.8484848484848485,
 'params_url_ratio': 0.36363636363636365,
 'params_domain_ratio': 3.5,
 'path_url_ratio': 0.42857142857142855,
 'entropy': 8.042272405884228,
 'char_cont_rate': 0.625,
 'at_count': 1,
 'double_slash_count': 1,
 'hyphen_count': 1,
 'dot_count': 2,
 'symbols_count': 9,
 'tld_count': 1,
 'domain_digit_count': 1,
 'param_digit_count': 8,
 'percent_count': 0}

# data processing

## load all csv files

### load benign dataset

In [122]:
benign_filenname = "./ISCXURL2016/FinalDataset/URL/Benign_list_big_final.csv"
benign_urls = []
with open(benign_filenname, 'r') as csv_file:
    csv_reader = csv.reader(csv_file)
    for line in csv_reader:
        benign_urls.append(line[0])

benign_df = pd.DataFrame({"url":benign_urls})

In [123]:
benign_df['benign'] = 1

In [124]:
benign_df

Unnamed: 0,url,benign
0,http://1337x.to/torrent/1048648/American-Snipe...,1
1,http://1337x.to/torrent/1110018/Blackhat-2015-...,1
2,http://1337x.to/torrent/1122940/Blackhat-2015-...,1
3,http://1337x.to/torrent/1124395/Fast-and-Furio...,1
4,http://1337x.to/torrent/1145504/Avengers-Age-o...,1
...,...,...
35373,https://lastpass.com/signup2.php?ac=1&from_uri...,1
35374,https://lastpass.com/signup2.php?ac=1&from_uri...,1
35375,https://lastpass.com/signup2.php?ac=1&from_uri...,1
35376,https://lastpass.com/signup2.php?ac=1&from_uri...,1


### load malicious dataset

In [125]:
directory_path = "./ISCXURL2016/FinalDataset/URL/malicious/"

malicious_filenames = [f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f))]

In [126]:
urls = []

for name in malicious_filenames:
    with open(directory_path+name, 'r') as csv_file:
        csv_reader = csv.reader(csv_file)
        for line in csv_reader:
            urls.append(line[0])
        


In [127]:
malicious_df = pd.DataFrame({"url":urls})
malicious_df['benign'] = 0

In [128]:
df_concat = pd.concat([benign_df,malicious_df],ignore_index=True)

In [133]:
df_concat.to_csv("ISCXURL2016_dataset.csv",index=False)

In [1]:
# df = pd.read_csv('rlilojr.csv')

In [2]:
# df.groupby(['label']).count()

In [13]:
df_malicious = df.loc[df['label'] == 1]

In [None]:
df_benign = df.loc[df["label"] == 0]

In [5]:
df_concat = pd.read_csv('dataset_rlilojr.csv')
df_concat.drop(columns=['Unnamed: 0'],inplace=True)

In [7]:
df_concat

Unnamed: 0,url,benign
0,http://br-ofertasimperdiveis.epizy.com/produto...,0
1,https://semana-da-oferta.com/produtos.php?id=5...,0
2,https://scrid-apps-creacust-sslhide90766752024...,0
3,http://my-softbank-security.com/wap_login.htm,0
4,http://www.my-softbank-security.com/wap_login.htm,0
...,...,...
126932,sharebabe.com,1
126933,toysrusinc.com,1
126934,mrmoustache.co,1
126935,freeblog.hu,1


In [164]:
df = pd.DataFrame(columns=column_list)
length = len(df_concat)
for index, row in df_concat[1126:1127].iterrows():
    print(row)
    try:
        url_obj = URLPreprocessor(row['url'],row['benign'])
        new_row = url_obj.process()
        new_row['url'] = 'http://diskobil.dk/7X3qtDnlz8mq9L'
        df.loc[len(df)] = new_row
        print(new_row)
    except Exception as e:
        print('\n',e)
        print(row['url'],index)
        break
    
    print(f"Progress: {index}/{length}", end='\r')

Unnamed: 0                                  1126
url           http://diskobil.dk/7X3qtDnlz8mq9L/
benign                                         0
Name: 1126, dtype: object
{'url': 'http://diskobil.dk/7X3qtDnlz8mq9L', 'benign': 0, 'number_rate_file_name': 0.25, 'domain_url_ratio': 0.3235294117647059, 'number_rate_url': 0.11764705882352941, 'path_domain_ratio': 1.4545454545454546, 'number_rate_after_path': 0, 'avg_path_token_len': 4.666666666666667, 'params_path_ratio': 0.0, 'params_url_ratio': 0.0, 'params_domain_ratio': 0.0, 'path_url_ratio': 0.47058823529411764, 'entropy': 1.6289141686655357, 'char_cont_rate': 0.8181818181818182, 'at_count': 0, 'double_slash_count': 1, 'hyphen_count': 0, 'dot_count': 1, 'symbols_count': 4, 'tld_count': 1, 'domain_digit_count': 0, 'param_digit_count': 0, 'percent_count': 0}
Progress: 1126/126937

In [165]:
df.to_csv("dataset_processed_lil.csv",index=False,encoding='utf-8',sep=',')

In [166]:
df_r = pd.read_csv("dataset_processed_lil.csv")

In [140]:
row = df_concat.iloc[1126]
url_obj = URLPreprocessor(row['url'],row['benign'])
new_row = url_obj.process()
new_row

{'url': 'http://diskobil.dk/7X3qtDnlz8mq9L/',
 'benign': 0,
 'number_rate_file_name': 0.25,
 'domain_url_ratio': 0.3235294117647059,
 'number_rate_url': 0.11764705882352941,
 'path_domain_ratio': 1.4545454545454546,
 'number_rate_after_path': 0,
 'avg_path_token_len': 4.666666666666667,
 'params_path_ratio': 0.0,
 'params_url_ratio': 0.0,
 'params_domain_ratio': 0.0,
 'path_url_ratio': 0.47058823529411764,
 'entropy': 1.6289141686655357,
 'char_cont_rate': 0.8181818181818182,
 'at_count': 0,
 'double_slash_count': 1,
 'hyphen_count': 0,
 'dot_count': 1,
 'symbols_count': 4,
 'tld_count': 1,
 'domain_digit_count': 0,
 'param_digit_count': 0,
 'percent_count': 0}

In [139]:
df2 = pd.DataFrame(new_row)

ValueError: If using all scalar values, you must pass an index

In [3]:
file = "rlilojr.csv"
df = pd.read_csv(file)

In [4]:
df.groupby(['label']).count()

Unnamed: 0_level_0,url
label,Unnamed: 1_level_1
0,1000000
1,56937


In [5]:
df_mal = df.loc[df['label'] == 1]

In [6]:
df_ben = df.loc[df['label'] == 0].sample(frac=0.2, random_state=36)

In [7]:
df_mal['benign'] = 0
df_ben['benign'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mal['benign'] = 0


In [9]:

df_mal.drop(columns=['label'],inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mal.drop(columns=['label'],inplace=True)


In [10]:
df_ben.drop(columns=['label'],inplace=True)

In [11]:
df = pd.concat([df_mal,df_ben],ignore_index=True)

In [16]:
df_mal

Unnamed: 0,url,benign
0,http://br-ofertasimperdiveis.epizy.com/produto...,0
1,https://semana-da-oferta.com/produtos.php?id=5...,0
2,https://scrid-apps-creacust-sslhide90766752024...,0
3,http://my-softbank-security.com/wap_login.htm,0
4,http://www.my-softbank-security.com/wap_login.htm,0
...,...,...
56932,https://www.qdpro.com.ua/download/data/tswreports,0
56933,https://www.ravenbiotech.com/cgi-bin.134157227...,0
56934,https://www.ravenbiotech.com/cgi-bin.134157227...,0
56935,https://www.relojesariel.com.ar/InstMidasMS/sp,0


In [20]:
df_mal.to_csv(f'dataset_malicious_rlilojr.csv',index=False)

In [24]:
df_data = pd.read_csv('data2.csv')

In [31]:
df_data.replace(columns={df_data.columns[1]:"url"},inplace=True)

TypeError: replace() got an unexpected keyword argument 'columns'

Unnamed: 0,hottraveljobs.com/forum/docs/info.php,bad
0,news.grouptumbler.com/news/feed.php,url
1,info.leveldelta.com/php/text.php,url
2,citroen-club.ch/n.exe,url
3,zehir4.asp,url
4,ZHC_Shell_1.0.aspx,url
...,...,...
32870,23.227.196.215/,url
32871,apple-checker.org/,url
32872,apple-iclods.org/,url
32873,apple-uptoday.org/,url


# data dataset

In [21]:
filename_data = "data.csv"
df_data = pd.read_csv(filename_data)

In [24]:
df_data.groupby(['label']).count()

Unnamed: 0_level_0,url
label,Unnamed: 1_level_1
bad,75643
good,344821


In [32]:
df_data_malicious = df_data.loc[(df_data['label'] == 'bad') & (df_data['url'].str.len() > 30)]
df_data_benign = df_data.loc[(df_data['label'] == 'good') & (df_data['url'].str.len() > 30)]

In [34]:
df_data_benign['benign'] = 1
df_data_malicious['benign'] = 0
df_data_benign.drop(columns=['label'],inplace=True)
df_data_malicious.drop(columns=['label'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data_benign['benign'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data_malicious['benign'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data_benign.drop(columns=['label'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable

In [36]:
df_data_malicious

Unnamed: 0,url,benign
8,crackspider.us/toolbar/install.php?pack=exe,0
11,svision-online.de/mgfi/administrator/component...,0
12,officeon.ch.ma/office.js?google_ad_format=728x...,0
18,freeserials.spb.ru/key/68703.htm,0
20,orbowlada.strefa.pl/text396.htm,0
...,...,...
420453,stefanocardone.com/wp-includes/SimplePie/HTTP/...,0
420454,defibel.org/wp-includes/images/index.html,0
420455,shapingsoftware.com/2009/02/09/architectural-s...,0
420456,free.ulohapp.info/?br_fl=2872&amp;tuif=5539&am...,0


In [37]:
df_data_concat = pd.concat([df_data_benign,df_data_malicious],ignore_index=True)

## data 2

In [38]:
df_data_2 = pd.read_csv("data2.csv")

In [41]:
df_data_2.rename(columns={df_data_2.columns[0]:"url","bad":"label"},inplace=True)

In [43]:
df_data_malicious = df_data_2.loc[df_data_2['label'] == 'bad']
df_data_benign = df_data_2.loc[(df_data_2['label'] == 'good') & (df_data_2['url'].str.len() > 30)]
df_data_benign['benign'] = 1
df_data_malicious['benign'] = 0
df_data_benign.drop(columns=['label'],inplace=True)
df_data_malicious.drop(columns=['label'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data_benign.drop(columns=['label'],inplace=True)


In [47]:
df_data_2.groupby(['label']).count()

Unnamed: 0_level_0,url
label,Unnamed: 1_level_1
bad,32875


In [49]:
df_data_concat = pd.concat([df_data_concat,df_data_malicious],ignore_index=True) 

In [52]:
df_data_concat.to_csv("datav1v2_dataset.csv")

# Webpages classification data 2GB

In [53]:
file = "Webpages_Classification_train_data.csv"
df = pd.read_csv(file)

In [55]:
df.drop(columns=["Unnamed: 0"],inplace=True)

In [64]:
cols = df.columns[1:10]
cols

Index(['url_len', 'ip_add', 'geo_loc', 'tld', 'who_is', 'https', 'js_len',
       'js_obf_len', 'content'],
      dtype='object')

In [65]:
df.drop(columns=cols,inplace=True)

In [67]:
df.groupby(['label']).count()


Unnamed: 0_level_0,url
label,Unnamed: 1_level_1
bad,27253
good,1172747


In [68]:
df

Unnamed: 0,url,label
0,http://members.tripod.com/russiastation/,good
1,http://www.ddj.com/cpp/184403822,good
2,http://www.naef-usa.com/,good
3,http://www.ff-b2b.de/,bad
4,http://us.imdb.com/title/tt0176269/,good
...,...,...
1199995,http://csrc.nist.gov/rbac/,good
1199996,http://www.unm.edu/~hist/,good
1199997,http://www.syfyportal.com/news423380.html,good
1199998,http://www.wardkenpo.ie,good


In [69]:
df_benign = df.loc[df['label'] == 'good']
df_malicious= df.loc[df['label'] == 'bad']

In [70]:
df_malicious['benign'] = 0
df_benign['benign'] = 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_malicious['benign'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_benign['benign'] = 1


In [72]:
df_benign.drop(columns=['label'],inplace=True)
df_malicious.drop(columns=['label'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_benign.drop(columns=['label'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_malicious.drop(columns=['label'],inplace=True)


In [75]:
df_benign = df_benign.sample(frac=0.17, random_state=41)

In [77]:
df_concat = pd.concat([df_benign,df_malicious],ignore_index=True)

In [78]:
df_concat.to_csv("websites_2gb_dataset.csv",index=False)

# malicious phishtank

In [79]:
file = './malicious_phish.csv/malicious_phish.csv'
df = pd.read_csv(file)

In [81]:
df.groupby('type').count()

Unnamed: 0_level_0,url
type,Unnamed: 1_level_1
benign,428103
defacement,96457
malware,32520
phishing,94111


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_benign['benign'] = 1


In [85]:
df.loc[df['type'] != 'benign',['benign']] = 0

In [86]:
df.loc[df['type'] == 'benign',['benign']] = 1

In [89]:
df.drop(columns=['type'],inplace=True)

In [96]:
df = df.astype({"benign":int})

In [98]:
df.groupby('benign').count()

Unnamed: 0_level_0,url
benign,Unnamed: 1_level_1
0,223088
1,428103


In [102]:
df.to_csv('malicious_phish_dataset.csv',index=False)

# phishtank verfied online csv

In [114]:
file = './verified_online.csv/verified_online.csv'
df = pd.read_csv(file)

In [115]:
df = df['url']

In [116]:
df = pd.DataFrame(df)

In [118]:
df['benign'] = 0

In [119]:
df

Unnamed: 0,url,benign
0,https://editor.domainspot.com/viewer/vbid-516a...,0
1,https://heypouch.com,0
2,https://apps.earthflora.net,0
3,https://dbs.ibaankiing-mobiile.com/,0
4,http://site9614304.92.webydo.com/?v=1,0
...,...,...
40241,http://fotovideobeny.pl/join/webscr.htm,0
40242,http://www.formbuddy.com/cgi-bin/formdisp.pl?u...,0
40243,http://www.formbuddy.com/cgi-bin/formdisp.pl?u...,0
40244,http://www.habbocreditosparati.blogspot.com/,0


In [120]:
df.to_csv('phishtank_verified_dataset.csv')

In [121]:
df

Unnamed: 0,url,benign
0,https://editor.domainspot.com/viewer/vbid-516a...,0
1,https://heypouch.com,0
2,https://apps.earthflora.net,0
3,https://dbs.ibaankiing-mobiile.com/,0
4,http://site9614304.92.webydo.com/?v=1,0
...,...,...
40241,http://fotovideobeny.pl/join/webscr.htm,0
40242,http://www.formbuddy.com/cgi-bin/formdisp.pl?u...,0
40243,http://www.formbuddy.com/cgi-bin/formdisp.pl?u...,0
40244,http://www.habbocreditosparati.blogspot.com/,0
