## Data preparation
We will be experimenting with preprocessing the datasets for training, grouping them for the experiments into the three categories described below, and potentially training some deep models to evaluate them.

### Dataset preprocessing
We will load the extracted features and concatenate them with the expected labels, discard the unneeded columns, one-hot encode the categorical features and group the records based on a few criteria.

Regarding grouping the records, for each site, we will be running a model training experiment on the following three datasets, for the tag classification task:

1. Only pages that contain the given tag on that website
2. All pages on the website, regardless of whether they contain the tag or not
3. Full dataset of all tags

And, since generalisation revolves around predicting these structures on previously unseen websites, we will be testing on subsets of all of them for each training run.

**TODO:** Due to the size of the dataset, we will only be using `olx.ro` in this experiment, as the available memory does not allow working with a larger one.

In [2]:
%matplotlib inline
# standard library
import itertools
import ast

from urllib.parse import urlparse

# pandas
import pandas as pd

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# scikit-learn
from sklearn.preprocessing import LabelBinarizer

# this styling is purely my preference
# less chartjunk
# `sns.set` also resets the plotting context to its defaults, so it has to
# run before `set_context`; otherwise the font scale below is discarded
sns.set(style='ticks', palette='Set2')
# the matplotlib rc key is `lines.linewidth` (`line.linewidth` is not a valid key)
sns.set_context('notebook', font_scale=1.5, rc={'lines.linewidth': 2.5})

In [3]:
# read only the header of the csv (nrows=0), so the column names can be
# inspected without loading any of the data
head_df = pd.read_csv('../data/ecommerce-features.csv', nrows=0, index_col=0)

# get the names of the one-hot encodable cols: the tag itself plus the
# numbered ancestor tags (`.columns` keeps only the names, as `usecols` expects)
tag_cols = head_df.filter(regex=r'^(ancestor[0-9]+_)?tag$').columns
one_hot_pre_df = pd.read_csv('../data/ecommerce-features.csv', usecols=tag_cols)

# get the names of the frequency-encodable cols: the tags column itself plus
# the numbered descendant ones
freq_cols = head_df.filter(regex=r'^(descendant[0-9]+_)?tags$').columns
freq_pre_df = pd.read_csv('../data/ecommerce-features.csv', usecols=freq_cols)

In [4]:
# preview the first rows of the raw (not yet encoded) tag columns
one_hot_pre_df.head()

Unnamed: 0,tag,ancestor1_tag,ancestor2_tag,ancestor3_tag,ancestor4_tag,ancestor5_tag
0,html,,,,,
1,head,html,,,,
2,meta,head,html,,,
3,script,head,html,,,
4,title,head,html,,,


In [5]:
# load the `*_label` columns; the first csv column is used as the row index
label_df = pd.read_csv('../data/ecommerce-labels.csv', index_col=0)
label_df.head()

  mask |= (ar1 == a)


Unnamed: 0,detail_price_label,detail_title_label,detail_description_label,detail_image_label,list_container_label,list_title_label,list_price_label,list_image_label
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False


In [6]:
# load only the `url` and `domain` columns of the feature file
url_df = pd.read_csv('../data/ecommerce-features.csv', usecols=['url', 'domain'])
url_df.head()

Unnamed: 0,url,domain
0,https://marketplace-leads.emag.ro/?utm_source=...,marketplace-leads.emag.ro
1,https://marketplace-leads.emag.ro/?utm_source=...,marketplace-leads.emag.ro
2,https://marketplace-leads.emag.ro/?utm_source=...,marketplace-leads.emag.ro
3,https://marketplace-leads.emag.ro/?utm_source=...,marketplace-leads.emag.ro
4,https://marketplace-leads.emag.ro/?utm_source=...,marketplace-leads.emag.ro


In [7]:
# remove classes columns
class_cols = head_df.filter(regex='^((descendant|ancestor)[0-9]+_)?classes$').columns
class_cols  # to be removed

Index(['classes', 'descendant1_classes', 'descendant2_classes',
       'descendant3_classes', 'descendant4_classes', 'descendant5_classes',
       'ancestor1_classes', 'ancestor2_classes', 'ancestor3_classes',
       'ancestor4_classes', 'ancestor5_classes'],
      dtype='object')

In [None]:
def onehot_df(series, prefix=''):
    """Return a sparse one-hot encoded dataframe of the values in a series.

    Args:
        series: the pandas Series holding the categorical values.
        prefix: optional string prepended to every generated column name.

    Returns:
        A dataframe with one sparse 0/1 integer column per distinct value
        of `series` (in sorted order), named `prefix + value`, and indexed
        positionally (0..n-1) regardless of the index of `series`.
    """
    # `pd.SparseDataFrame` was removed in pandas 1.0, and `LabelBinarizer`
    # collapses one or two classes into a single column, which does not
    # line up with `classes_`; `get_dummies` always emits one column per
    # distinct value and supports sparse output directly
    dummies = pd.get_dummies(series, sparse=True, dtype=int)
    dummies.columns = [prefix + str(cls) for cls in dummies.columns]
    # the encoded frame has always carried a fresh positional index
    return dummies.reset_index(drop=True)

# sanity check: preview the one-hot encoding of the plain `tag` column
onehot_df(one_hot_pre_df['tag']).head()

In [8]:
# get the one-hot encoded dataframes, one per tag column; missing values get
# their own 'null' category. Every frame is kept in `oh_dfs`, since the chunk
# generator further down concatenates them with the dataset.
oh_dfs = []
for tag_col in tag_cols:
    oh_df = onehot_df(one_hot_pre_df[tag_col].fillna('null'), prefix='{}_'.format(tag_col))
    oh_dfs.append(oh_df)

In [10]:
def iterate_chunks(df, chunksize=1000):
    """Yield consecutive row chunks of a dataframe.

    The rows are grouped by position, so every yielded dataframe holds
    `chunksize` rows, except possibly the last one, which holds the
    remainder.
    """
    # integer-dividing the row position gives the number of the chunk it is in
    chunk_ids = np.arange(len(df)) // chunksize
    for _, chunk in df.groupby(chunk_ids):
        yield chunk

In [11]:
def get_containing_urls(df, label):
    """Return the urls of the pages where `label` is set on at least one row.

    The rows of `df` are grouped by their `url` column and a url is kept
    when any of its rows has a truthy value in the `label` column.
    """
    has_label = df.groupby('url')[label].any()
    per_url = has_label.reset_index()
    return per_url.loc[per_url[label], 'url']

# preview the urls of the pages that have at least one `list_container_label` row
get_containing_urls(dataset_df, 'list_container_label').head()

1              https://www.olx.ro/animale-de-companie/
2    https://www.olx.ro/animale-de-companie/alte-an...
3    https://www.olx.ro/animale-de-companie/animale...
4    https://www.olx.ro/animale-de-companie/servici...
5                https://www.olx.ro/anunturi-agricole/
Name: url, dtype: object

In [12]:
def filter_by_urls(df, urls):
    """Return only the rows of `df` whose `url` value is one of `urls`."""
    keep = df['url'].isin(urls)
    return df[keep]

In [13]:
def cont_to_csv(chunk_gen, *args, **kwargs):
    """Write the dataframes of a generator, chunk by chunk, to one csv file.

    Args:
        chunk_gen: iterable of dataframes; it is consumed completely.
        *args, **kwargs: passed on to the `to_csv` method of every chunk.
            `header` and `mode` are only honored for the first chunk: all
            the following chunks are appended without a header.

    Nothing is written when `chunk_gen` yields no chunk at all.
    """
    # take these two out of kwargs up-front, so that they can never clash
    # with the explicit values passed for the appended chunks
    header = kwargs.pop('header', True)
    first_mode = kwargs.pop('mode', 'w')

    for step, chunk in enumerate(chunk_gen):
        if step == 0:
            # honor the header and mode options for the first chunk
            chunk.to_csv(*args, mode=first_mode, header=header, **kwargs)
        else:
            # no header and append mode
            chunk.to_csv(*args, mode='a', header=False, **kwargs)

In [14]:
# because we don't have enough memory to actually load all
# of them in memory, we will have to iterate over their chunks
# chunk-concatenate the dataframes

def url_label_chunk_gen(label):
    """Yield the concatenated dataframe chunks of the pages containing a label.

    The module-level `dataset_df` and every frame of `oh_dfs` are iterated
    in lockstep with `iterate_chunks`, their chunks are concatenated
    column-wise and then filtered to the urls returned by
    `get_containing_urls` for the given label.

    Args:
        label: name of the `*_label` column of `dataset_df` to keep.

    Yields:
        One dataframe per chunk, without any of the other `*_label`
        columns and with the `label` column renamed to "label".
    """
    # all the urls to use
    valid_urls = get_containing_urls(dataset_df, label)

    # labels to drop: every `*_label` column except the requested one
    # (`{label}`, as `set(label)` would split the name into its characters)
    all_labels = set(dataset_df.filter(axis='columns', regex='^.*_label$').columns)
    dropped_labels = sorted(all_labels - {label})

    # process them sequentially with chunks
    chunk_iters = zip(*(iterate_chunks(df) for df in [dataset_df] + oh_dfs))
    for chunks in chunk_iters:
        concat_chunks = filter_by_urls(pd.concat(chunks, axis='columns'), valid_urls)
        # drop the unneeded labels, rename the needed one simply to "label"
        # (`columns=` is required: a bare mapping would rename index labels)
        yield (
            concat_chunks
            .drop(dropped_labels, axis='columns')
            .rename(columns={label: 'label'})
        )

In [15]:
# stream the `list_container_label` chunks into a single csv file,
# leaving the index column out
chunk_gen = url_label_chunk_gen('list_container_label')
cont_to_csv(chunk_gen, '../data/processed/olx-list-container.csv', index=False)