In [6]:
import re
import ast
import json
import pickle
import pandas as pd
from tqdm import tqdm
from pprint import pprint

from urllib.parse import urlparse
from url_parser import parse_url, get_url, get_base_url

import nltk
from nltk.corpus import stopwords

from IPython.display import display

from sklearn.preprocessing import LabelEncoder

### 1. Break the large file into 9 smaller ones
Run this in terminal:

```split -l 500000 -a 4 dataset.json smaller_```

Rename the resulting files to `1.json` … `9.json` and place them in `../data/json/`.

### 2. Convert json to csv file with selected column names
1. First, convert each individual JSON file to a CSV file.
2. Collect the column names that appear across all of the files.
3. Select which columns to use, and save the reduced files to a separate csv folder.

In [15]:
def convert_json_csv():
    """Convert the JSON-lines shards ``1.json`` .. ``9.json`` to CSV files.

    Each ``../data/json/{i}.json`` is read as one JSON record per line and
    written next to it as ``../data/json/{i}.csv``. The DataFrame index is
    written too, so every CSV starts with an unnamed index column.
    """
    shard_ids = range(1, 10)
    for i in tqdm(shard_ids):
        source_path = f'../data/json/{i}.json'
        target_path = f'../data/json/{i}.csv'
        pd.read_json(source_path, lines = True).to_csv(target_path)

def get_cols_name():
    """Return the set of column names found across the nine converted CSVs.

    The full-width files written by ``convert_json_csv`` live in
    ``../data/json/``; ``../data/csv/`` is only populated afterwards by
    ``use_cols_csv``, so it is the json folder that has to be inspected when
    choosing which columns to keep.

    Only the header row of each file is parsed (``nrows = 0``), which yields
    the same column names without loading the data.

    Note: a later cell in this notebook defines another function with this
    same name (for the cleaned files); once that cell has run it replaces
    this definition in the kernel.

    Returns:
        set[str]: union of the column names of all nine files.
    """
    cols = set()
    for i in tqdm(range(1, 10)):
        header = pd.read_csv(f'../data/json/{i}.csv', nrows = 0)
        cols.update(header.columns)
    return cols

def use_cols_csv():
    """Write a reduced copy of every converted CSV with only the chosen columns.

    Reads ``../data/json/{i}.csv`` restricted to the selected columns and
    saves the result (including the DataFrame index) to ``../data/csv/{i}.csv``.
    """
    selected_columns = ['browserFamily', 'channel', 'deviceType', 'iabCategories', 'os', 'refDomain', 'url']
    for i in tqdm(range(1, 10)):
        reduced = pd.read_csv(f'../data/json/{i}.csv', low_memory = False, usecols = selected_columns)
        reduced.to_csv(f'../data/csv/{i}.csv')

def json_to_csv(print_cols = False):
    """Run the JSON -> CSV stage: convert, optionally list columns, then trim.

    Args:
        print_cols: when true, pretty-print the set returned by
            ``get_cols_name()`` after the conversion and before the
            column selection.
    """
    convert_json_csv()
    if print_cols:
        pprint(get_cols_name())
    use_cols_csv()

In [16]:
# Convert all nine shards and write the reduced CSVs (column listing left off).
json_to_csv()

100%|█████████████████████████████████████████████| 9/9 [03:44<00:00, 24.97s/it]
100%|█████████████████████████████████████████████| 9/9 [00:35<00:00,  3.94s/it]


### 3. Clean csv file
1. Separate the rows that have labels from the rows that have none.
2. Save the unlabelled rows in a separate folder.
3. Explode/expand the iabCategories for each URL -> one line per IAB category per URL.
4. Add the score (high, medium, low) for each category.
5. Break the URL into base, domain, and path.
6. Combine the channel, domain, and path into a single text field.
7. Remove stop words (open-source lists and a custom list).

In [19]:
# Extra tokens that remove_stop_words() drops in addition to the NLTK lists
# (URL boilerplate such as 'www', 'com', 'html', 'php', ...).
# Note: remove_stop_words() replaces every non-letter run with a space before
# splitting, so a token containing a dot ('www.') can never match there.
custom_stopwords = ['www', 'www.', 'com', 'les', 'org', 'tag', 'html', 'id', 'un', 'win', 
                    'en', 'me', 'php', 'asp', 'aspx', 'cc', 'net']

def get_iab_categories(iab_categories):
    """Turn the textual ``iabCategories`` cell into ``[value, score]`` pairs.

    The cell is parsed with ``ast.literal_eval`` and is expected to evaluate
    to an iterable of dicts. For every dict, each value stored under a key
    other than ``'score'`` is paired with that dict's ``'score'``.

    The score is identified by its key rather than by being the last entry
    of the dict, so the result does not depend on the key order of the
    incoming records.

    Args:
        iab_categories: string holding a Python literal, e.g.
            ``"[{'category': 'sports', 'score': 'high'}]"``.

    Returns:
        list[list]: one ``[value, score]`` pair per non-score value, in
        input order.

    Raises:
        KeyError: if a dict has a non-score entry but no ``'score'`` key.
        ValueError / SyntaxError: if the text is not a valid literal.
    """
    iab_categories_list = []
    for iab_category in ast.literal_eval(iab_categories):
        for key, value in iab_category.items():
            if key == 'score':
                continue
            iab_categories_list.append([value, iab_category['score']])
    return iab_categories_list

def get_base_url_(url):
    """Return ``get_base_url(url)``, or ``url`` unchanged if that call fails.

    Only ``Exception`` subclasses are treated as a parse failure, so
    ``KeyboardInterrupt`` and ``SystemExit`` still propagate and a long
    ``DataFrame.apply`` run can be interrupted.

    Args:
        url: the URL text to reduce to its base.

    Returns:
        The value returned by ``get_base_url``, or the input itself as a
        best-effort fallback.
    """
    try:
        return get_base_url(url)
    except Exception:
        return url

def get_domain(url):
    """Build the text ``"<domain> <sub_domain> <top_domain>"`` for a URL.

    The URL is parsed once with ``parse_url``. If that fails, or the result
    has no usable ``'domain'``, the domain falls back to the ``urlparse``
    netloc with its dots replaced by spaces. Missing or empty ``sub_domain``
    and ``top_domain`` become empty strings.

    Only ``Exception`` subclasses count as a parse failure, so
    ``KeyboardInterrupt`` still interrupts a long-running ``apply``.

    Args:
        url: the URL text.

    Returns:
        str: the three parts joined by single spaces (parts may be empty).
    """
    try:
        # Assumes parse_url returns a plain dict, as the subscript access in
        # the surrounding helpers suggests.
        parts = parse_url(url)
    except Exception:
        parts = {}

    domain = parts.get('domain')
    if domain is None:
        # Parsing failed or found no domain: fall back to the raw host name.
        domain = ' '.join(urlparse(url).netloc.split('.'))
    sub_domain = parts.get('sub_domain') or ''
    top_domain = parts.get('top_domain') or ''
    return domain + ' ' + sub_domain + ' ' + top_domain
    
def get_path(url):
    """Return the path component of a URL as text.

    ``parse_url(url)['path']`` is tried first; if it raises, the path from
    ``urllib.parse.urlparse`` is used instead. Only ``Exception`` subclasses
    are caught, so ``KeyboardInterrupt`` still propagates.

    A missing path is returned as ``''`` so that the string concatenation in
    ``clean_df`` always receives text.
    NOTE(review): ``get_domain`` guards ``top_domain``/``sub_domain`` with
    ``or ''``, which suggests ``parse_url`` reports absent components as
    ``None`` - confirm against the url_parser documentation.

    Args:
        url: the URL text.

    Returns:
        str: the path, or ``''`` when there is none.
    """
    try:
        path = parse_url(url)['path']
    except Exception:
        path = urlparse(url).path
    return path or ''

# Lazily built frozenset of stop words shared by every remove_stop_words()
# call; re-running this cell resets it.
_STOP_WORDS = None


def _get_stop_words():
    """Return the combined NLTK (english, french, spanish) + custom stop set."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        words = set(custom_stopwords)
        for language in ('english', 'french', 'spanish'):
            words.update(stopwords.words(language))
        _STOP_WORDS = frozenset(words)
    return _STOP_WORDS


def remove_stop_words(text):
    """Strip non-letters from ``text`` and drop stop words.

    Every run of characters outside ``A-Z``/``a-z`` is replaced by a single
    space, the result is split on whitespace, and words that appear in the
    stop set are removed. Matching is case-sensitive: no lowercasing is
    applied to the words.

    The stop set is built once and reused, instead of reloading the three
    NLTK lists and scanning a list for every row. The returned text is the
    same as before.

    Args:
        text: any value; it is converted with ``str()`` first.

    Returns:
        str: the remaining words joined by single spaces.
    """
    stop = _get_stop_words()
    text = re.sub('[^A-Za-z]+', ' ', str(text)).strip()
    return ' '.join(word for word in text.split() if word not in stop)
    
def clean_df(df, df_name):
    """Split one shard into unlabelled / labelled rows and build text features.

    Rows whose ``iabCategories`` is null are saved unchanged to
    ``../data/clean/no_label/{df_name}.csv``. The remaining rows get their
    nulls replaced by ``''``, are expanded to one row per ``[value, score]``
    pair from ``get_iab_categories``, gain the ``iab_categories``,
    ``confidence``, ``base_url``, ``domain``, ``path`` and ``combine``
    columns, and are saved to ``../data/clean/label/{df_name}.csv``.

    Args:
        df: DataFrame with at least ``iabCategories``, ``url`` and
            ``channel`` columns.
        df_name: value used as the output file stem.
    """
    has_label = df['iabCategories'].notnull()
    df[~has_label].to_csv(f'../data/clean/no_label/{df_name}.csv')

    labelled = df[has_label].fillna('')
    labelled['iabCategories'] = labelled['iabCategories'].apply(get_iab_categories)
    # ignore_index gives a fresh 0..n-1 index that lines up with `pairs` below.
    labelled = labelled.explode('iabCategories', ignore_index = True)
    pairs = pd.DataFrame(
        labelled['iabCategories'].to_list(),
        columns = ['iab_categories', 'confidence'],
    )
    labelled = pd.concat([labelled, pairs], axis = 1).drop('iabCategories', axis = 1)

    urls = labelled['url']
    labelled['base_url'] = urls.apply(get_base_url_)
    labelled['domain'] = urls.apply(get_domain)
    labelled['path'] = urls.apply(get_path)

    combined_text = labelled['channel'] + ' ' + labelled['domain'] + ' ' + labelled['path']
    labelled['combine'] = combined_text.apply(remove_stop_words)
    labelled.to_csv(f'../data/clean/label/{df_name}.csv')
    
def clean():
    """Apply ``clean_df`` to each of the nine reduced CSVs in ``../data/csv/``."""
    for i in tqdm(range(1, 10)):
        shard = pd.read_csv(f'../data/csv/{i}.csv')
        clean_df(shard, i)

In [20]:
# Long-running: the recorded run below took about four hours for the nine shards.
clean()

100%|█████████████████████████████████████████| 9/9 [4:04:58<00:00, 1633.20s/it]


### 4. Get data ready for training

1. Look at all the columns.
2. Get rid of unnecessary columns.
3. Look at all the unique values in certain columns.
4. Replace them with numerical classes.
5. Join all CSV files into one.

In [13]:
def get_cols_name():
    """Return the set of column names found across the nine cleaned CSVs.

    Inspects ``../data/clean/label/{i}.csv`` for ``i`` in 1..9. Only the
    header row is parsed (``nrows = 0``), which yields the same names
    without loading the data.

    Note: this redefines ``get_cols_name`` from the JSON -> CSV section of
    this notebook; after this cell has run, the earlier definition is no
    longer reachable in the kernel.

    Returns:
        set[str]: union of the column names of all nine files.
    """
    cols = set()
    for i in tqdm(range(1, 10)):
        header = pd.read_csv(f'../data/clean/label/{i}.csv', nrows = 0)
        cols.update(header.columns)
    return cols

def drop_cols():
    """Write the training view of each cleaned shard to ``../data/train/``.

    For every ``../data/clean/label/{i}.csv`` the ``'Unnamed: 0.1'`` column
    is renamed to ``id`` and made the index, the columns listed below are
    dropped, and the result is saved as ``../data/train/{i}.csv``.
    """
    # presumably 'Unnamed: 0' / 'Unnamed: 0.1' are index columns left behind
    # by the earlier to_csv round-trips - verify against the cleaned files
    unwanted = ['Unnamed: 0', 'base_url', 'channel', 'domain', 'path', 'refDomain', 'url']
    for i in tqdm(range(1, 10)):
        labelled = pd.read_csv(f'../data/clean/label/{i}.csv', low_memory = False)
        trimmed = (
            labelled
            .rename(columns = {'Unnamed: 0.1': 'id'})
            .set_index('id')
            .drop(unwanted, axis = 1)
        )
        trimmed.to_csv(f'../data/train/{i}.csv')
        
def get_unique_vals():
    """Collect the distinct values of the categorical training columns.

    Scans ``../data/train/{i}.csv`` for ``i`` in 1..9.

    Returns:
        dict[str, set]: for each of ``browserFamily``, ``deviceType``,
        ``os`` and ``confidence``, the set of values seen in any shard.
    """
    tracked_columns = ('browserFamily', 'deviceType', 'os', 'confidence')
    unique_vals = {col: set() for col in tracked_columns}
    for i in tqdm(range(1, 10)):
        shard = pd.read_csv(f'../data/train/{i}.csv', low_memory = False)
        for col, seen in unique_vals.items():
            seen.update(shard[col].unique())
    return unique_vals
        
    
def display_df():
    """Show the first rows of every ``../data/train/{i}.csv``, one after another."""
    for i in range(1, 10):
        preview = pd.read_csv(f'../data/train/{i}.csv', low_memory = False).head()
        display(preview)
        print()
        
def combine():
    """Concatenate the nine training shards into ``train_combined.csv``.

    The shards are stacked row-wise with a fresh index, and the result is
    written to ``../data/train/train_combined.csv`` without that index.
    """
    shards = [
        pd.read_csv(f'../data/train/{i}.csv', low_memory = False)
        for i in tqdm(range(1, 10))
    ]
    stacked = pd.concat(shards, axis=0, ignore_index=True)
    stacked.to_csv('../data/train/train_combined.csv', index=False)

def group_val():
    """Replace raw browser / OS values using the pickled lookup tables.

    Loads ``browser_family.pkl`` and ``os.pkl`` from ``../data/static_data/``,
    looks up every ``browserFamily`` and ``os`` value of
    ``train_combined.csv`` in the matching table (a value missing from the
    table raises ``KeyError``), and writes the result to
    ``../data/train/train_grouped.csv``.

    The pickle files are loaded with ``pickle.load``; they must come from a
    trusted source.
    """
    def load_pickle(path):
        with open(path, 'rb') as f:
            return pickle.load(f)

    browser_groups = load_pickle('../data/static_data/browser_family.pkl')
    os_groups = load_pickle('../data/static_data/os.pkl')

    combined = pd.read_csv('../data/train/train_combined.csv', low_memory = False)
    combined['browserFamily'] = combined['browserFamily'].apply(lambda raw: browser_groups[raw])
    combined['os'] = combined['os'].apply(lambda raw: os_groups[raw])
    combined.to_csv('../data/train/train_grouped.csv', index = False)
    
def replace_val():
    """Encode the categorical columns as integers and write ``train.csv``.

    Reads ``../data/train/train_grouped.csv``. For ``browserFamily``,
    ``deviceType``, ``os`` and ``iab_categories`` a fresh ``LabelEncoder``
    is fitted, its class -> code mapping is pickled under
    ``../data/static_data/``, and the column is replaced by its codes.
    ``confidence`` has ``'medium'`` replaced by 0.5 and ``'high'`` by 1.0.
    The result is saved to ``../data/train/train.csv`` without the index.
    """
    df = pd.read_csv('../data/train/train_grouped.csv', low_memory = False)

    # (column to encode, file that receives its class -> code mapping)
    mapping_files = (
        ('browserFamily', '../data/static_data/browser_family_mapping.pkl'),
        ('deviceType', '../data/static_data/device_type_mapping.pkl'),
        ('os', '../data/static_data/os_mapping.pkl'),
        ('iab_categories', '../data/static_data/iab_categories_mapping.pkl'),
    )
    for column, mapping_path in mapping_files:
        encoder = LabelEncoder()
        encoder.fit(df[column])
        class_to_code = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
        with open(mapping_path, 'wb') as f:
            pickle.dump(class_to_code, f)
        df[column] = encoder.transform(df[column])

    # NOTE(review): the notebook text lists scores "high, medium, low", but no
    # replacement is given for 'low' here - confirm that value never occurs.
    df['confidence'] = df['confidence'].replace({'medium': 0.5, 'high': 1.0,})

    df.to_csv('../data/train/train.csv', index = False)

def get_train(show_cols = False, show_unique_vals = False):
    """Run the whole "prepare training data" stage end to end.

    Steps, in order: optionally print the cleaned column names, drop the
    unused columns, optionally print the unique categorical values, combine
    the shards, group the browser / OS values, and encode the categories.

    ``group_val()`` is part of the sequence because ``replace_val()`` reads
    ``train_grouped.csv``, which only ``group_val()`` writes; without it a
    fresh run fails or silently reuses a stale file. ``group_val()`` in turn
    needs ``browser_family.pkl`` and ``os.pkl`` in ``../data/static_data/``.

    Args:
        show_cols: pretty-print ``get_cols_name()`` before dropping columns.
        show_unique_vals: pretty-print ``get_unique_vals()`` after dropping
            columns.
    """
    if show_cols:
        pprint(get_cols_name())
    drop_cols()
    if show_unique_vals:
        pprint(get_unique_vals())
    combine()
    group_val()
    replace_val()

In [14]:
# Build the training files with both optional printouts left off.
get_train()

In [16]:
# Spot-check ten random rows of the final training table. No random_state is
# passed, so a re-run shows different rows than the output recorded below.
df = pd.read_csv('../data/train/train.csv')
df.sample(n = 10)

Unnamed: 0,id,browserFamily,deviceType,os,iab_categories,confidence,combine
1051580,15020,3,5,3,157,1.0,adj tdd st reward link IGsocial Aet Plg
7085215,871980,0,3,7,129,1.0,thejigsawpuzzles Animals Forest Scene jigsaw p...
4269724,128325,3,5,0,150,1.0,facebook usatoday billswire lists nfl teams su...
5227147,48105,0,3,5,161,1.0,oclc accessmedicine mhmedical uiwtx idm infogr...
8426897,138526,3,5,0,161,0.5,nodq news update former wwe star potentially r...
6961572,748337,3,5,3,60,1.0,google search geediting men low emotional inte...
4877680,736281,3,5,3,13,1.0,revolvermag
3632183,526441,3,5,0,13,1.0,foursome
5319445,140403,3,5,3,150,0.5,scenepensacola shows
8105149,853234,3,5,3,110,1.0,amac us poll concerning unsettling country
