# Imports

In [12]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from catboost import CatBoostClassifier, Pool

# Load data

In [13]:
# Read the raw click log from a path relative to the working directory.
clickdata_csv = '../data/clickdata.csv'
data = pd.read_csv(clickdata_csv)

# Preview the first rows as the cell output.
data.head()

Unnamed: 0,epoch_ms,session_id,country_by_ip_address,region_by_ip_address,url_without_parameters,referrer_without_parameters,visitor_recognition_type,ua_agent_class
0,1520280001034,be73c8d1b836170a21529a1b23140f8e,US,CA,https://www.bol.com/nl/l/nederlandstalige-kuns...,,ANONYMOUS,Robot
1,1520280001590,c24c6637ed7dcbe19ad64056184212a7,US,CA,https://www.bol.com/nl/l/italiaans-natuur-wete...,,ANONYMOUS,Robot
2,1520280002397,ee391655f5680a7bfae0019450aed396,IT,LI,https://www.bol.com/nl/p/nespresso-magimix-ini...,https://www.bol.com/nl/p/nespresso-magimix-ini...,ANONYMOUS,Browser
3,1520280002598,f8c8a696dd37ca88233b2df096afa97f,US,CA,https://www.bol.com/nl/l/nieuwe-engelstalige-o...,,ANONYMOUS,Robot
4,1520280004428,f8b0c06747b7dd1d53c0932306bd04d6,US,CA,https://www.bol.com/nl/l/nieuwe-actie-avontuur...,,ANONYMOUS,Robot Mobile


## Preprocess/create features

In [14]:
# Replace missing values with an explicit placeholder, one per column.
missing_placeholders = {
    'country_by_ip_address': 'UNK',
    'region_by_ip_address': 'UNK',
    'referrer_without_parameters': '',
}
for column, placeholder in missing_placeholders.items():
    data.loc[data[column].isna(), column] = placeholder

# Split the combined agent classes into a base class (kept in
# `ua_agent_class`) and a source (written to the new `ua_source` column).
combined_classes = {
    'Browser Webview': ('Browser', 'Webview'),
    'Robot Mobile': ('Robot', 'Mobile'),
}
for combined_class, (agent_class, source) in combined_classes.items():
    # Build the mask before `ua_agent_class` is overwritten below.
    is_combined = data['ua_agent_class'] == combined_class
    data.loc[is_combined, 'ua_source'] = source
    data.loc[is_combined, 'ua_agent_class'] = agent_class

data.head()

Unnamed: 0,epoch_ms,session_id,country_by_ip_address,region_by_ip_address,url_without_parameters,referrer_without_parameters,visitor_recognition_type,ua_agent_class,ua_source
0,1520280001034,be73c8d1b836170a21529a1b23140f8e,US,CA,https://www.bol.com/nl/l/nederlandstalige-kuns...,,ANONYMOUS,Robot,
1,1520280001590,c24c6637ed7dcbe19ad64056184212a7,US,CA,https://www.bol.com/nl/l/italiaans-natuur-wete...,,ANONYMOUS,Robot,
2,1520280002397,ee391655f5680a7bfae0019450aed396,IT,LI,https://www.bol.com/nl/p/nespresso-magimix-ini...,https://www.bol.com/nl/p/nespresso-magimix-ini...,ANONYMOUS,Browser,
3,1520280002598,f8c8a696dd37ca88233b2df096afa97f,US,CA,https://www.bol.com/nl/l/nieuwe-engelstalige-o...,,ANONYMOUS,Robot,
4,1520280004428,f8b0c06747b7dd1d53c0932306bd04d6,US,CA,https://www.bol.com/nl/l/nieuwe-actie-avontuur...,,ANONYMOUS,Robot,Mobile


## Parse URLs

In [20]:
# Marker component -> (row field it fills, whether the value is a '+'-joined list).
_MARKER_FIELDS = {
    'N': ('category_filters', True),
    'filter_N': ('attribute_filters', True),
    'Ntt': ('search_text', False),
    'Nty': ('Nty', False),
    'sc': ('search_context', False),
}


def _component_after(components, marker):
    """Return the component that directly follows the first ``marker``.

    Returns ``None`` when ``marker`` does not occur in ``components`` or when
    it is the last component, so the caller can keep its default instead of
    hitting an ``IndexError``.
    """
    if marker not in components:
        return None
    position = components.index(marker) + 1
    return components[position] if position < len(components) else None


def parse_url(url, prefix = 'https://www.bol.com/nl/'):
    """Split a URL path into a fixed set of named fields.

    ``prefix`` is removed from the front of ``url`` (when present), the rest
    is split on ``/`` and the first component selects how the remaining ones
    are read. Handled first components are ``c``, ``checkout``, ``l``,
    ``order``, ``p``, ``s`` and ``w``; for any other value every field keeps
    its default.

    Marker components are followed by a value. Which markers are read depends
    on the first component:

    * ``c``: ``N``, ``sc``, ``filter_N``
    * ``l``: ``N``, ``filter_N``
    * ``s``: ``N``, ``Ntt``, ``Nty``, ``sc``, ``filter_N``
    * ``w``: ``N``, ``filter_N`` (not read when the second component is ``ajax``)

    ``N`` fills ``category_filters`` and ``filter_N`` fills
    ``attribute_filters`` (both split on ``+``); ``Ntt`` fills
    ``search_text``, ``Nty`` fills ``Nty`` and ``sc`` fills
    ``search_context``.

    Components that are missing because the URL is shorter than expected are
    skipped and the corresponding fields keep their defaults.

    Parameters
    ----------
    url : str
        URL to parse. A non-string value (for example NaN for a missing URL)
        produces a row of defaults.
    prefix : str, optional
        Leading part of the URL dropped before splitting.

    Returns
    -------
    pandas.Series
        Fields ``url_function``, ``category``, ``category_id``,
        ``category_filters``, ``n_category_filters``, ``attribute_filters``,
        ``n_attribute_filters``, ``search_type``, ``search_text``,
        ``search_context``, ``Nty``, ``product_id``, ``other`` and
        ``tracking_id``, in that order. String fields default to ``''``,
        filter lists to ``[]`` and counts to ``0``. ``search_type`` is never
        populated by this function.
    """
    row = {
        'url_function': '',
        'category': '',
        'category_id': '',
        'category_filters': [],
        'n_category_filters': 0,
        'attribute_filters': [],
        'n_attribute_filters': 0,
        'search_type': '',
        'search_text': '',
        'search_context': '',
        'Nty': '',
        'product_id': '',
        'other': '',
        'tracking_id': ''
    }

    # A missing URL has no `removeprefix`; return the defaults instead of raising.
    if not isinstance(url, str):
        return pd.Series(row)

    url_components = url.removeprefix(prefix).split('/')

    def component(position):
        """Return the component at ``position``, or ``None`` if the URL is shorter."""
        return url_components[position] if position < len(url_components) else None

    def read_markers(*markers):
        """Copy the value following each present marker into its row field."""
        for marker in markers:
            value = _component_after(url_components, marker)
            if value is None:
                continue
            field, is_list = _MARKER_FIELDS[marker]
            row[field] = value.split('+') if is_list else value

    url_function = url_components[0]
    second = component(1)
    third = component(2)

    if url_function == 'c':
        row['url_function'] = url_function

        if second == 'ajax':
            row['other'] = second

        elif second is not None:
            row['category'] = second

            if third is not None:
                if third.isdigit():
                    row['category_id'] = third

                else:
                    # Two-level category: the id, if any, is one component later.
                    row['category'] = row['category'] + '/' + third
                    fourth = component(3)

                    if fourth is not None and fourth.isdigit():
                        row['category_id'] = fourth

        read_markers('N', 'sc', 'filter_N')

    elif url_function in ('checkout', 'order'):
        row['url_function'] = url_function

        if second is not None:
            row['other'] = second

    elif url_function == 'l':
        row['url_function'] = url_function

        if second == 'ajax':
            row['other'] = second

        elif second is not None:
            row['category'] = second

        read_markers('N', 'filter_N')

    elif url_function == 'p':
        row['url_function'] = url_function

        if second is not None:
            row['category'] = second

        if third is not None and third.isdigit():
            row['product_id'] = third

    elif url_function == 's':
        row['url_function'] = url_function

        if second is not None:
            if second.isdigit():
                row['category_id'] = second

            else:
                row['category'] = second

        read_markers('N', 'Ntt', 'Nty', 'sc', 'filter_N')

        if 'ajax' in url_components:
            row['other'] = 'ajax'

    elif url_function == 'w':
        row['url_function'] = url_function

        if second == 'ajax':
            row['other'] = second

        elif second is not None:
            row['category'] = second

            if third is not None:
                if third.isdigit():
                    row['tracking_id'] = third

                else:
                    # Two-level category: the next component is taken as the
                    # tracking id without a digit check.
                    row['category'] = row['category'] + '/' + third
                    fourth = component(3)

                    if fourth is not None:
                        row['tracking_id'] = fourth

            read_markers('N', 'filter_N')

    row['n_category_filters'] = len(row['category_filters'])
    row['n_attribute_filters'] = len(row['attribute_filters'])

    return pd.Series(row)

In [21]:
# Columns produced by parse_url, in the order of the Series it returns.
url_features = [
    'url_function',
    'category',
    'category_id',
    'category_filters',
    'n_category_filters',
    'attribute_filters',
    'n_attribute_filters',
    'search_type',
    'search_text',
    'search_context',
    'Nty',
    'product_id',
    'other',
    'tracking_id',
]

# parse_url returns one Series per URL, so apply() yields a DataFrame whose
# columns are assigned to `data` under the names above.
data[url_features] = data['url_without_parameters'].apply(parse_url)

data.head()

Unnamed: 0,epoch_ms,session_id,country_by_ip_address,region_by_ip_address,url_without_parameters,referrer_without_parameters,visitor_recognition_type,ua_agent_class,ua_source,url_function,...,n_category_filters,attribute_filters,n_attribute_filters,search_type,search_text,search_context,Nty,product_id,other,tracking_id
0,1520280001034,be73c8d1b836170a21529a1b23140f8e,US,CA,https://www.bol.com/nl/l/nederlandstalige-kuns...,,ANONYMOUS,Robot,,l,...,3,[],0,,,,,,,
1,1520280001590,c24c6637ed7dcbe19ad64056184212a7,US,CA,https://www.bol.com/nl/l/italiaans-natuur-wete...,,ANONYMOUS,Robot,,l,...,4,[],0,,,,,,,
2,1520280002397,ee391655f5680a7bfae0019450aed396,IT,LI,https://www.bol.com/nl/p/nespresso-magimix-ini...,https://www.bol.com/nl/p/nespresso-magimix-ini...,ANONYMOUS,Browser,,p,...,0,[],0,,,,,9200000025533140.0,,
3,1520280002598,f8c8a696dd37ca88233b2df096afa97f,US,CA,https://www.bol.com/nl/l/nieuwe-engelstalige-o...,,ANONYMOUS,Robot,,l,...,6,[4273962351],1,,,,,,,
4,1520280004428,f8b0c06747b7dd1d53c0932306bd04d6,US,CA,https://www.bol.com/nl/l/nieuwe-actie-avontuur...,,ANONYMOUS,Robot,Mobile,l,...,3,[],0,,,,,,,


# Baseline model (original columns only)

In [13]:
# Every input column of this model is categorical, so one list serves as
# both the feature selection and the CatBoost cat_features.
baseline_columns = [
    'country_by_ip_address',
    'region_by_ip_address',
    'visitor_recognition_type',
]

X = data[baseline_columns]
y = data['ua_agent_class']

# Stratify on the target so each class keeps its share in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

train_pool = Pool(data=X_train, label=y_train, cat_features=baseline_columns)

clf = CatBoostClassifier(iterations=100)

clf.fit(train_pool, verbose=False)

<catboost.core.CatBoostClassifier at 0x11678f940>

In [14]:
# Score the held-out split; the pool is built without labels.
test_pool = Pool(
    data=X_test,
    cat_features=[
        'country_by_ip_address',
        'region_by_ip_address',
        'visitor_recognition_type',
    ],
)
y_pred = clf.predict(test_pool)

# Report every class the classifier knows; undefined ratios are shown as 0.
report = classification_report(y_test, y_pred, labels=clf.classes_, zero_division=0.0)
print(report)

                   precision    recall  f1-score   support

          Browser       0.98      0.99      0.99      9328
Cloud Application       0.00      0.00      0.00         1
           Hacker       0.00      0.00      0.00       294
       Mobile App       0.00      0.00      0.00         2
            Robot       0.95      1.00      0.97      5285
          Special       0.67      0.06      0.10        36

         accuracy                           0.97     14946
        macro avg       0.43      0.34      0.34     14946
     weighted avg       0.95      0.97      0.96     14946



## With url_function

In [17]:
# Baseline columns plus the parsed `url_function`; all of them are
# categorical, so the same list is passed as cat_features.
url_function_columns = [
    'country_by_ip_address',
    'region_by_ip_address',
    'visitor_recognition_type',
    'url_function',
]

X = data[url_function_columns]
y = data['ua_agent_class']

# Stratify on the target so each class keeps its share in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

train_pool = Pool(data=X_train, label=y_train, cat_features=url_function_columns)

clf = CatBoostClassifier(iterations=100)

clf.fit(train_pool, verbose=False)

<catboost.core.CatBoostClassifier at 0x105136b90>

In [18]:
# Score the held-out split; the pool is built without labels.
test_pool = Pool(
    data=X_test,
    cat_features=[
        'country_by_ip_address',
        'region_by_ip_address',
        'visitor_recognition_type',
        'url_function',
    ],
)
y_pred = clf.predict(test_pool)

# Report every class the classifier knows; undefined ratios are shown as 0.
report = classification_report(y_test, y_pred, labels=clf.classes_, zero_division=0.0)
print(report)

                   precision    recall  f1-score   support

          Browser       0.98      0.99      0.99      9328
Cloud Application       0.00      0.00      0.00         1
           Hacker       0.59      0.11      0.18       294
       Mobile App       0.00      0.00      0.00         2
            Robot       0.96      0.99      0.98      5285
          Special       1.00      0.06      0.11        36

         accuracy                           0.97     14946
        macro avg       0.59      0.36      0.38     14946
     weighted avg       0.97      0.97      0.97     14946



# With URL components

In [22]:
# Model inputs: the raw visitor columns plus the fields parsed from the URL.
features = [
    'country_by_ip_address',
    'region_by_ip_address',
    'visitor_recognition_type',
    'url_function',
    'category',
    'category_id',
    'n_category_filters',
    'n_attribute_filters',
    'search_type',
    'search_text',
    'search_context',
    'Nty',
    'product_id',
    'other',
    'tracking_id',
]

# The two filter counts are the only numeric inputs; every other feature is
# handed to CatBoost as categorical, in the same order as in `features`.
numeric_features = {'n_category_filters', 'n_attribute_filters'}
cat_features = [name for name in features if name not in numeric_features]

X = data[features]
y = data['ua_agent_class']

# Stratify on the target so each class keeps its share in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)

clf = CatBoostClassifier(iterations=100)

clf.fit(train_pool, verbose=False)

<catboost.core.CatBoostClassifier at 0x1207f6c20>

In [23]:
# Score the held-out split with the same categorical columns used for training.
test_pool = Pool(data=X_test, cat_features=cat_features)
y_pred = clf.predict(test_pool)

# Report every class the classifier knows; undefined ratios are shown as 0.
report = classification_report(y_test, y_pred, labels=clf.classes_, zero_division=0.0)
print(report)

                   precision    recall  f1-score   support

          Browser       1.00      0.99      0.99      9328
Cloud Application       0.00      0.00      0.00         1
           Hacker       0.88      0.54      0.67       294
       Mobile App       0.00      0.00      0.00         2
            Robot       0.96      0.99      0.98      5285
          Special       1.00      0.06      0.11        36

         accuracy                           0.98     14946
        macro avg       0.64      0.43      0.46     14946
     weighted avg       0.98      0.98      0.98     14946

