In [37]:
import re
import ast
import json
import pandas as pd
from tqdm import tqdm
from pprint import pprint

from urllib.parse import urlparse
from url_parser import parse_url, get_url, get_base_url

import nltk
from nltk.corpus import stopwords

from IPython.display import display, HTML

### 1. Break the large file into 9 smaller ones
Run this in terminal:

```split -l 500000 -a 4 dataset.json smaller_```

Rename the resulting files to `1.json` through `9.json` and place them in `../data/json/`.

### 2. Convert json to csv file with selected column names
1. first convert all the individual json files to csv files
2. get all the column names in all the files
3. select which columns to use, and save those files to a csv folder

In [15]:
def convert_json_csv():
    """Convert the nine numbered JSON-lines chunks into CSV files.

    For each i in 1..9, reads ``../data/json/{i}.json`` (one JSON object per
    line) and writes the whole frame to ``../data/json/{i}.csv``. The frame's
    index is written as well, since ``to_csv`` is called with its defaults.
    """
    for i in tqdm(range(1, 10)):
        json_path = f'../data/json/{i}.json'
        csv_path = f'../data/json/{i}.csv'
        pd.read_json(json_path, lines = True).to_csv(csv_path)

def get_cols_name():
    """Return the set of column names found across the nine converted CSV files.

    The files inspected are the full conversions written by
    ``convert_json_csv`` (``../data/json/{i}.csv``). ``json_to_csv`` calls this
    function *before* ``use_cols_csv`` has produced ``../data/csv/{i}.csv``, so
    reading from the ``csv`` folder would fail on a fresh run (or report only
    the already-selected columns on a re-run).

    Returns:
        set: union of the header names of all nine files.
    """
    cols = set()
    for i in tqdm(range(1, 10)):
        # nrows = 0 parses only the header row; the column names are all we
        # need, so there is no reason to load the data rows.
        header = pd.read_csv(f'../data/json/{i}.csv', nrows = 0)
        cols.update(header.columns.values.tolist())
    return cols

def use_cols_csv():
    """Write a reduced copy of each converted CSV containing only the selected columns.

    Reads ``../data/json/{i}.csv`` for i in 1..9, keeping just the columns
    listed below, and saves the result to ``../data/csv/{i}.csv``.
    """
    selected_columns = ['browserFamily', 'channel', 'deviceType', 'iabCategories', 'os', 'refDomain', 'url']
    for i in tqdm(range(1, 10)):
        subset = pd.read_csv(f'../data/json/{i}.csv', low_memory = False, usecols = selected_columns)
        subset.to_csv(f'../data/csv/{i}.csv')

def json_to_csv(print_cols = False):
    """Run the JSON -> CSV stage: convert every chunk, then keep selected columns.

    Args:
        print_cols: when truthy, pretty-print the set returned by
            ``get_cols_name()`` between the two steps.
    """
    convert_json_csv()
    if print_cols:
        pprint(get_cols_name())
    use_cols_csv()

In [16]:
# Run the JSON -> CSV stage; print_cols keeps its default (False), so the
# column listing is skipped.
json_to_csv()

100%|█████████████████████████████████████████████| 9/9 [03:44<00:00, 24.97s/it]
100%|█████████████████████████████████████████████| 9/9 [00:35<00:00,  3.94s/it]


### 3. Clean csv file
1. separate the labelled rows from the unlabelled rows
2. save the unlabelled rows in a separate folder
3. explode/expand the iabcategories for each url -> one line per iabcategory per url
4. add the scores (high, medium, low) for each
5. break url into base, domain, path
6. combine the channel, domain, and path into one text column
7. remove stop words (open source and custom)

In [19]:
# Extra tokens removed by remove_stop_words() on top of the NLTK stop-word lists.
custom_stopwords = [
    'www', 'www.', 'com', 'les', 'org', 'tag',
    'html', 'id', 'un', 'win', 'en', 'me',
    'php', 'asp', 'aspx', 'cc', 'net',
]

def get_iab_categories(iab_categories):
    """Flatten a stringified list of category mappings into [value, score] pairs.

    The input string is parsed with ``ast.literal_eval``. For every mapping in
    the parsed list, each value except the last one is paired with that
    mapping's ``'score'`` entry.
    NOTE(review): this presumably relies on ``'score'`` being the last key of
    each mapping -- confirm against the data.

    Args:
        iab_categories: string holding a Python literal list of dicts.

    Returns:
        list: ``[value, score]`` lists, in input order.
    """
    parsed = ast.literal_eval(iab_categories)
    return [
        [label, entry['score']]
        for entry in parsed
        for label in list(entry.values())[:-1]
    ]

def get_base_url_(url):
    """Return ``get_base_url(url)``, falling back to ``url`` itself if that call fails.

    Args:
        url: the URL value to reduce to its base.

    Returns:
        The result of ``get_base_url(url)``, or the unchanged ``url`` when the
        call raises.
    """
    try:
        return get_base_url(url)
    except Exception:
        # Best-effort fallback. ``Exception`` (rather than a bare ``except``)
        # lets KeyboardInterrupt / SystemExit propagate, so a long-running
        # ``apply`` over millions of rows can still be interrupted.
        return url

def get_domain(url):
    """Build a space-separated ``"domain sub_domain top_domain"`` string for a URL.

    ``parse_url`` is tried first. If it fails, or the ``'domain'`` key cannot
    be read, the domain falls back to the ``urlparse`` netloc with dots
    replaced by spaces. A missing or falsy ``'sub_domain'`` / ``'top_domain'``
    becomes an empty string.

    Args:
        url: the URL string to decompose.

    Returns:
        str: ``domain + ' ' + sub_domain + ' ' + top_domain``.
    """
    # Parse once and reuse the result for all three fields instead of
    # re-parsing the same URL for each field.
    try:
        parts = parse_url(url)
    except Exception:
        # None makes every lookup below raise, which triggers its fallback.
        parts = None

    try:
        domain = parts['domain']
    except Exception:
        domain = ' '.join(urlparse(url).netloc.split('.'))
    try:
        top_domain = parts['top_domain'] or ''
    except Exception:
        top_domain = ''
    try:
        sub_domain = parts['sub_domain'] or ''
    except Exception:
        sub_domain = ''
    return domain + ' ' + sub_domain + ' ' + top_domain
    
def get_path(url):
    """Return the path component of a URL.

    Uses ``parse_url(url)['path']`` when that works, otherwise the ``path``
    attribute of ``urlparse(url)``.

    Args:
        url: the URL string to decompose.

    Returns:
        The path as produced by whichever parser succeeded.
    """
    try:
        path = parse_url(url)['path']
    except Exception:
        # Narrowed from a bare ``except`` so that KeyboardInterrupt and
        # SystemExit are not swallowed during long ``apply`` runs.
        path = urlparse(url).path
    return path

# Lazily-built cache of the NLTK English/French/Spanish stop words.
_NLTK_STOP_WORDS = None


def _nltk_stop_words():
    """Return the cached frozenset of NLTK English, French and Spanish stop words."""
    global _NLTK_STOP_WORDS
    if _NLTK_STOP_WORDS is None:
        _NLTK_STOP_WORDS = frozenset(
            word
            for language in ('english', 'french', 'spanish')
            for word in stopwords.words(language)
        )
    return _NLTK_STOP_WORDS


def remove_stop_words(text):
    """Strip non-letters from ``text`` and drop stop words.

    Every run of characters outside ``A-Za-z`` is replaced by a single space,
    the result is split on whitespace, and words found in the NLTK
    English/French/Spanish stop-word lists or in ``custom_stopwords`` are
    removed. Matching is case-sensitive, because the text is not lowercased.

    The NLTK lists are loaded once and kept in a set: this function is applied
    row by row, so rebuilding three lists and scanning them linearly for every
    word on every call is loop-invariant work. ``custom_stopwords`` is still
    read on each call, so later edits to that list take effect.

    Args:
        text: any value; it is converted with ``str()`` first.

    Returns:
        str: the remaining words joined by single spaces.
    """
    stop = _nltk_stop_words().union(custom_stopwords)
    text = re.sub('[^A-Za-z]+', ' ', str(text)).strip()
    return ' '.join(word for word in text.split() if word not in stop)
    
def clean_df(df, df_name):
    """Split one chunk into unlabelled and labelled rows and clean the labelled part.

    Rows whose ``iabCategories`` is null are written unchanged to
    ``../data/clean/no_label/{df_name}.csv``. The remaining rows have their
    nulls filled with ``''``, their categories expanded to one row per
    ``[category, score]`` pair (columns ``iab_categories`` / ``confidence``),
    URL-derived columns added, and a stop-word-filtered ``combine`` text column
    built from channel, domain and path. The result is written to
    ``../data/clean/label/{df_name}.csv``.

    Args:
        df: the chunk as a DataFrame with at least ``iabCategories``, ``url``
            and ``channel`` columns.
        df_name: identifier used in both output file names.
    """
    has_label = df['iabCategories'].notnull()
    df[~has_label].to_csv(f'../data/clean/no_label/{df_name}.csv')

    labelled = df[has_label].fillna('')
    labelled['iabCategories'] = labelled['iabCategories'].apply(get_iab_categories)
    # One row per [category, score] pair; the fresh 0..n-1 index is what lets
    # the concat below line up with the split-out columns.
    labelled = labelled.explode('iabCategories', ignore_index = True)
    pairs = pd.DataFrame(
        labelled['iabCategories'].to_list(),
        columns = ['iab_categories', 'confidence'],
    )
    labelled = pd.concat([labelled, pairs], axis = 1).drop('iabCategories', axis = 1)

    urls = labelled['url']
    labelled['base_url'] = urls.apply(get_base_url_)
    labelled['domain'] = urls.apply(get_domain)
    labelled['path'] = urls.apply(get_path)

    combined_text = labelled['channel'] + ' ' + labelled['domain'] + ' ' + labelled['path']
    labelled['combine'] = combined_text.apply(remove_stop_words)
    labelled.to_csv(f'../data/clean/label/{df_name}.csv')
    
def clean():
    """Run ``clean_df`` over each of the nine column-reduced CSV chunks.

    Chunk i is read from ``../data/csv/{i}.csv`` and passed to ``clean_df``
    with ``i`` as its name.
    """
    for i in tqdm(range(1, 10)):
        clean_df(pd.read_csv(f'../data/csv/{i}.csv'), i)

In [20]:
# Run the cleaning stage over all nine chunks (the captured progress bar below
# shows this took roughly four hours).
clean()

100%|█████████████████████████████████████████| 9/9 [4:04:58<00:00, 1633.20s/it]


### 4. Get data ready for training

1. look at all the columns
2. get rid of unnecessary columns
3. look at all the unique values in certain columns
4. replace them with numerical classes
5. join all csv files into one

In [67]:
def get_cols_name():
    """Return the set of column names found across the nine cleaned, labelled CSV files.

    Reads the header of ``../data/clean/label/{i}.csv`` for i in 1..9.
    NOTE(review): this reuses the name of an earlier ``get_cols_name`` in the
    notebook that targets the converted CSVs; whichever cell ran last is the
    definition in effect.

    Returns:
        set: union of the header names of all nine files.
    """
    cols = set()
    for i in tqdm(range(1, 10)):
        # nrows = 0 parses only the header row, which is all that is needed
        # here; loading every data row just to list columns is wasted work.
        header = pd.read_csv(f'../data/clean/label/{i}.csv', nrows = 0)
        cols.update(header.columns.values.tolist())
    return cols

def drop_cols():
    """Produce the training CSVs by trimming each cleaned, labelled chunk.

    For i in 1..9: reads ``../data/clean/label/{i}.csv``, renames the
    ``'Unnamed: 0.1'`` column to ``id`` and makes it the index, drops the
    columns listed below, and writes the result to ``../data/train/{i}.csv``.
    """
    unwanted = ['Unnamed: 0', 'base_url', 'channel', 'domain', 'path', 'refDomain', 'url']
    for i in tqdm(range(1, 10)):
        trimmed = (
            pd.read_csv(f'../data/clean/label/{i}.csv', low_memory = False)
            .rename(columns = {'Unnamed: 0.1': 'id'})
            .set_index('id')
            .drop(unwanted, axis = 1)
        )
        trimmed.to_csv(f'../data/train/{i}.csv')
        
def get_unique_vals():
    """Collect the distinct values of selected columns across the nine training CSVs.

    Returns:
        dict: maps each of ``browserFamily``, ``deviceType``, ``os`` and
        ``confidence`` to the set of values seen in ``../data/train/{i}.csv``
        for i in 1..9.
    """
    tracked = ('browserFamily', 'deviceType', 'os', 'confidence')
    unique_vals = {col: set() for col in tracked}
    for i in tqdm(range(1, 10)):
        chunk = pd.read_csv(f'../data/train/{i}.csv', low_memory = False)
        for col, seen in unique_vals.items():
            seen.update(chunk[col].unique())
    return unique_vals
        
    
def display_df():
    """Show the first rows of each of the nine training CSVs, separated by blank lines."""
    for i in range(1, 10):
        preview = pd.read_csv(f'../data/train/{i}.csv', low_memory = False).head()
        display(preview)
        print()
        
def combine():
    """Concatenate the nine training CSVs into ``../data/train/train.csv``.

    The chunks are stacked row-wise with a fresh index, and the combined file
    is written without an index column.
    """
    chunks = [
        pd.read_csv(f'../data/train/{i}.csv', low_memory = False)
        for i in tqdm(range(1, 10))
    ]
    stacked = pd.concat(chunks, axis=0, ignore_index=True)
    stacked.to_csv('../data/train/train.csv', index=False)

def get_info():
    """Print the (rows, columns) shape of the combined training CSV."""
    train = pd.read_csv('../data/train/train.csv', low_memory = False)
    print(train.shape)

In [68]:
# The commented-out lines are the earlier steps of this stage; as saved, this
# cell only runs get_info().
# NOTE(review): on a fresh kernel the steps would need to be re-enabled in this
# order for train.csv (and `unique_vals`) to exist -- confirm before re-running.
# get_cols_name
# drop_cols()
# unique_vals = get_unique_vals()
# combine()
get_info()

(9143493, 7)


In [59]:
# NOTE(review): `unique_vals` is only assigned in a commented-out line of the
# previous cell, so this lookup depends on state left over from an earlier run.
unique_vals['os']

{'Android',
 'Android 1.0',
 'Android 1.5 Cupcake',
 'Android 1.6 Donut',
 'Android 2.0/1 Eclair',
 'Android 2.2.x Froyo',
 'Android 2.3.x Gingerbread',
 'Android 3.x Honeycomb',
 'Android 4.0.x Ice Cream Sandwich',
 'Android 4.1.x Jelly Bean',
 'Android 4.2 Jelly Bean',
 'Android 4.3 Jelly Bean',
 'Android 4.4 KitKat',
 'BlackBerry OS',
 'Chrome OS',
 'FreeBSD',
 'JVM (Java)',
 'JVM (Platform Micro Edition)',
 'Linux',
 'Linux (Arch Linux)',
 'Linux (Fedora)',
 'Linux (Gentoo)',
 'Linux (Slackware)',
 'Linux (Ubuntu)',
 'Mac OS',
 'MeeGo',
 'NetBSD',
 'OS X',
 'OS X 10.10 Yosemite',
 'OS X 10.4 Tiger',
 'OS X 10.5 Leopard',
 'OS X 10.6 Snow Leopard',
 'OS X 10.7 Lion',
 'OS X 10.8 Mountain Lion',
 'OS X 10.9 Mavericks',
 'Orbis OS',
 'Palm OS',
 'Solaris',
 'Symbian OS',
 'Ubuntu Touch',
 'Windows',
 'Windows 2000',
 'Windows 2003 Server',
 'Windows 7',
 'Windows 8',
 'Windows 8.1',
 'Windows ME',
 'Windows Mobile',
 'Windows NT',
 'Windows Phone 7',
 'Windows Phone 8',
 'Windows Phon

In [58]:
# Hand-copied set of browser family names.
# NOTE(review): presumably pasted from get_unique_vals()['browserFamily'] -- confirm.
browserFamily = {'AdsBot-Google',
 'Android HttpURLConnection',
 'Android browser',
 'Apple Mail',
 'BlackBerry Browser',
 'Blazer',
 'Bolt',
 'Camino',
 'Chrome',
 'Chrome Mobile',
 'Chromium',
 'Epiphany',
 'FacebookExternalHit/1.1',
 'Firebird (old name for Firefox)',
 'Firefox',
 'Firefox (Minefield)',
 'Galeon',
 'Gmail image proxy',
 'Google Wireless Transcoder',
 'Googlebot-Mobile',
 'Googlebot/2.1',
 'IBrowse',
 'IE',
 'IE Mobile',
 'IceApe',
 'IceDragon',
 'IceWeasel',
 'Iron',
 'Java',
 'Kindle Browser',
 'Konqueror',
 'Maxthon',
 'Maxthon mobile',
 'Mediapartners-Google',
 'Mobile Firefox',
 'Mobile Safari',
 'Mozilla',
 'NetFront',
 'Netscape Navigator',
 'Nokia Web Browser',
 'Opera',
 'Opera Mini',
 'Opera Mobile',
 'Outlook 2010',
 'PS4 Web browser',
 'Pale Moon',
 'Polaris',
 'Puffin',
 'Python-requests',
 'Safari',
 'SeaMonkey',
 'Silk',
 'Sogou Explorer',
 'Thunderbird',
 'UC Browser',
 'Waterfox',
 'Yandex.Browser',
 'Yandex.Browser mobile',
 'unknown'}

In [61]:
# Hand-copied set of device type labels.
# NOTE(review): 'nan' is a string here, presumably the text form of missing
# values in the CSVs -- confirm.
deviceType = {'Game console',
 'Other',
 'PDA',
 'Personal computer',
 'Smart TV',
 'Smartphone',
 'Tablet',
 'nan'}

In [None]:
# Hand-copied set of operating-system labels, with blank lines separating
# groups of related entries.
# NOTE(review): binding the name `os` would shadow the standard-library module
# of the same name if that module were ever imported in this notebook.
os = {'Android',
 'Android 1.0',
 'Android 1.5 Cupcake',
 'Android 1.6 Donut',
 'Android 2.0/1 Eclair',
 'Android 2.2.x Froyo',
 'Android 2.3.x Gingerbread',
 'Android 3.x Honeycomb',
 'Android 4.0.x Ice Cream Sandwich',
 'Android 4.1.x Jelly Bean',
 'Android 4.2 Jelly Bean',
 'Android 4.3 Jelly Bean',
 'Android 4.4 KitKat',
      
 'BlackBerry OS',
      
 'Chrome OS',
      
 'FreeBSD',
 'JVM (Java)',
 'JVM (Platform Micro Edition)',
 'Orbis OS',
 'Palm OS',
 'Solaris',
 'Symbian OS',
 'MeeGo',
 'NetBSD',
      
 'Linux',
 'Linux (Arch Linux)',
 'Linux (Fedora)',
 'Linux (Gentoo)',
 'Linux (Slackware)',
 'Linux (Ubuntu)',
 
 'Mac OS',
 'OS X',
 'OS X 10.10 Yosemite',
 'OS X 10.4 Tiger',
 'OS X 10.5 Leopard',
 'OS X 10.6 Snow Leopard',
 'OS X 10.7 Lion',
 'OS X 10.8 Mountain Lion',
 'OS X 10.9 Mavericks',
      
 'Ubuntu Touch',
      
 'Windows',
 'Windows 2000',
 'Windows 2003 Server',
 'Windows 7',
 'Windows 8',
 'Windows 8.1',
 'Windows ME',
 'Windows Mobile',
 'Windows NT',
 'Windows Phone 7',
 'Windows Phone 8',
 'Windows Phone 8.1',
 'Windows RT',
 'Windows Vista',
 'Windows XP',
      
 'iOS',
 'iOS 4',
 'iOS 5',
 'iOS 6',
 'iOS 7',
 'iOS 8',
      
 'unknown'}