# Imports

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from catboost import CatBoostClassifier, Pool

# Load data

In [3]:
# Raw click log, one row per page request; path is relative to the notebook's folder.
CLICKDATA_CSV = '../data/clickdata.csv'
data = pd.read_csv(CLICKDATA_CSV)

# Peek at the first rows to sanity-check the columns that were loaded.
data.head()

Unnamed: 0,epoch_ms,session_id,country_by_ip_address,region_by_ip_address,url_without_parameters,referrer_without_parameters,visitor_recognition_type,ua_agent_class
0,1520280001034,be73c8d1b836170a21529a1b23140f8e,US,CA,https://www.bol.com/nl/l/nederlandstalige-kuns...,,ANONYMOUS,Robot
1,1520280001590,c24c6637ed7dcbe19ad64056184212a7,US,CA,https://www.bol.com/nl/l/italiaans-natuur-wete...,,ANONYMOUS,Robot
2,1520280002397,ee391655f5680a7bfae0019450aed396,IT,LI,https://www.bol.com/nl/p/nespresso-magimix-ini...,https://www.bol.com/nl/p/nespresso-magimix-ini...,ANONYMOUS,Browser
3,1520280002598,f8c8a696dd37ca88233b2df096afa97f,US,CA,https://www.bol.com/nl/l/nieuwe-engelstalige-o...,,ANONYMOUS,Robot
4,1520280004428,f8b0c06747b7dd1d53c0932306bd04d6,US,CA,https://www.bol.com/nl/l/nieuwe-actie-avontuur...,,ANONYMOUS,Robot Mobile


## Preprocess/create features

In [4]:
# Missing geo values become an explicit 'UNK' category; a missing referrer becomes ''.
data['country_by_ip_address'] = data['country_by_ip_address'].fillna('UNK')
data['region_by_ip_address'] = data['region_by_ip_address'].fillna('UNK')
data['referrer_without_parameters'] = data['referrer_without_parameters'].fillna('')

# Two agent classes carry a source suffix ("Browser Webview", "Robot Mobile").
# Move the suffix into `ua_source` and keep only the base class in `ua_agent_class`.
for combined_label, base_class, source in [
    ('Browser Webview', 'Browser', 'Webview'),
    ('Robot Mobile', 'Robot', 'Mobile'),
]:
    has_combined_label = data['ua_agent_class'] == combined_label
    data.loc[has_combined_label, 'ua_source'] = source
    data.loc[has_combined_label, 'ua_agent_class'] = base_class

# Coarse URL sub class: the first path segment after the site prefix.
# This is a placeholder for the detailed URL breakdown.
data['url_subclass'] = data['url_without_parameters'].apply(
    lambda url: url.removeprefix('https://www.bol.com/nl/').partition('/')[0]
)

data.head()

Unnamed: 0,epoch_ms,session_id,country_by_ip_address,region_by_ip_address,url_without_parameters,referrer_without_parameters,visitor_recognition_type,ua_agent_class,ua_source,url_subclass
0,1520280001034,be73c8d1b836170a21529a1b23140f8e,US,CA,https://www.bol.com/nl/l/nederlandstalige-kuns...,,ANONYMOUS,Robot,,l
1,1520280001590,c24c6637ed7dcbe19ad64056184212a7,US,CA,https://www.bol.com/nl/l/italiaans-natuur-wete...,,ANONYMOUS,Robot,,l
2,1520280002397,ee391655f5680a7bfae0019450aed396,IT,LI,https://www.bol.com/nl/p/nespresso-magimix-ini...,https://www.bol.com/nl/p/nespresso-magimix-ini...,ANONYMOUS,Browser,,p
3,1520280002598,f8c8a696dd37ca88233b2df096afa97f,US,CA,https://www.bol.com/nl/l/nieuwe-engelstalige-o...,,ANONYMOUS,Robot,,l
4,1520280004428,f8b0c06747b7dd1d53c0932306bd04d6,US,CA,https://www.bol.com/nl/l/nieuwe-actie-avontuur...,,ANONYMOUS,Robot,Mobile,l


## Parse URLs

In [5]:
def parse_url(url, prefix = 'https://www.bol.com/nl/'):
    """Break a URL into coarse components keyed by its first path segment.

    The path left after stripping ``prefix`` is split on '/', and the first
    segment selects how the rest is interpreted:

    * ``c``        -> ``category`` (segments joined with '/' up to the first
                      all-digit segment, which becomes ``category_code``), or
                      ``other`` when the second segment is ``ajax``.
    * ``checkout`` / ``order`` -> second segment is stored in ``other``.
    * ``p``        -> second segment is ``category``, third is ``product_code``.
    * ``l``        -> second segment is ``category``; ``N/<a+b+c>`` fills
                      ``filters`` and a following ``filter_N/<a+b>`` fills
                      ``sub_filters``.

    Any other first segment leaves every field at its empty default.

    Parameters
    ----------
    url : str
        URL to parse. A non-string value (e.g. NaN for a missing URL) yields
        a row of empty defaults instead of raising.
    prefix : str
        Leading part of the URL removed before splitting.

    Returns
    -------
    pd.Series
        Indexed by ``url_function``, ``category``, ``category_code``,
        ``product_code``, ``filters``, ``sub_filters``, ``other``.
        ``filters`` and ``sub_filters`` are lists of strings; the rest are
        strings.
    """
    row = {
        'url_function': '',
        'category': '',
        'category_code': '',
        'product_code': '',
        'filters': [],
        'sub_filters': [],
        'other': ''
    }

    if not isinstance(url, str):
        return pd.Series(row)

    url_components = url.removeprefix(prefix).split('/')

    def component(position):
        # Truncated URLs (e.g. '.../nl/c' or '.../nl/p/name') have fewer
        # segments than a branch reads; treat the missing ones as '' rather
        # than letting an IndexError abort the whole DataFrame.apply.
        return url_components[position] if position < len(url_components) else ''

    url_function = url_components[0]

    if url_function == 'c':
        row['url_function'] = url_function

        if component(1) == 'ajax':
            row['other'] = 'ajax'
            if component(2) == 'index.html':
                # Concatenated without a separator, giving 'ajaxindex.html'.
                row['other'] += 'index.html'
        else:
            row['category'] = component(1)
            # Everything up to the first purely numeric segment belongs to the
            # category path; that numeric segment is the category code.
            for segment in url_components[2:]:
                if segment.isdigit():
                    row['category_code'] = segment
                    break
                row['category'] += '/' + segment

    elif url_function in ('checkout', 'order'):
        row['url_function'] = url_function
        row['other'] = component(1)

    elif url_function == 'p':
        row['url_function'] = url_function
        row['category'] = component(1)
        row['product_code'] = component(2)

    elif url_function == 'l':
        row['url_function'] = url_function
        row['category'] = component(1)

        if len(url_components) >= 4:
            if component(2) == 'N':
                row['filters'] = component(3).split('+')

                if len(url_components) >= 6 and component(4) == 'filter_N':
                    row['sub_filters'] = component(5).split('+')

        # NOTE(review): the ajax check is only reached for paths with fewer
        # than four segments, and 'ajax' is also stored as the category.
        # Kept as-is so existing feature values do not change.
        elif component(1) == 'ajax':
            row['other'] += 'ajax'

            if component(2) == 'index.html':
                row['other'] += 'index.html'

    return pd.Series(row)

In [6]:
# Column names match the index of the Series returned by parse_url, in the same order.
url_features = [
    'url_function',
    'category',
    'category_code',
    'product_code',
    'filters',
    'sub_filters',
    'other',
]

# parse_url returns one Series per URL, so apply yields one column per component.
data[url_features] = data['url_without_parameters'].apply(parse_url)

data.head()

Unnamed: 0,epoch_ms,session_id,country_by_ip_address,region_by_ip_address,url_without_parameters,referrer_without_parameters,visitor_recognition_type,ua_agent_class,ua_source,url_subclass,url_function,category,category_code,product_code,filters,sub_filters,other
0,1520280001034,be73c8d1b836170a21529a1b23140f8e,US,CA,https://www.bol.com/nl/l/nederlandstalige-kuns...,,ANONYMOUS,Robot,,l,l,nederlandstalige-kunstgeschiedenis-theorie-ove...,,,"[2324, 24596, 8293]",[],
1,1520280001590,c24c6637ed7dcbe19ad64056184212a7,US,CA,https://www.bol.com/nl/l/italiaans-natuur-wete...,,ANONYMOUS,Robot,,l,l,italiaans-natuur-wetenschap-kinderboeken-over-...,,,"[24435, 26558, 24670, 7419]",[],
2,1520280002397,ee391655f5680a7bfae0019450aed396,IT,LI,https://www.bol.com/nl/p/nespresso-magimix-ini...,https://www.bol.com/nl/p/nespresso-magimix-ini...,ANONYMOUS,Browser,,p,p,nespresso-magimix-inissia-m105-koffiemachine-grey,,9200000025533140.0,[],[],
3,1520280002598,f8c8a696dd37ca88233b2df096afa97f,US,CA,https://www.bol.com/nl/l/nieuwe-engelstalige-o...,,ANONYMOUS,Robot,,l,l,nieuwe-engelstalige-outdoor-sportboeken-algeme...,,,"[2806, 4292595594, 4288403245, 4288398652, 140...",[4273962351],
4,1520280004428,f8b0c06747b7dd1d53c0932306bd04d6,US,CA,https://www.bol.com/nl/l/nieuwe-actie-avontuur...,,ANONYMOUS,Robot,Mobile,l,l,nieuwe-actie-avontuur-over-prive-detective,,,"[33590, 26931, 7289]",[],


# Baseline model (original geo and visitor features only)

In [13]:
# Baseline feature set: geo and visitor-recognition columns only, all categorical.
baseline_features = [
    'country_by_ip_address',
    'region_by_ip_address',
    'visitor_recognition_type',
]

X = data[baseline_features]
y = data['ua_agent_class']

# Stratified split with a fixed seed so every experiment sees the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

train_pool = Pool(data=X_train, label=y_train, cat_features=baseline_features)

clf = CatBoostClassifier(iterations=100)
clf.fit(train_pool, verbose=False)

<catboost.core.CatBoostClassifier at 0x11678f940>

In [14]:
# Score the held-out split; every column in X_test is treated as categorical.
test_pool = Pool(
    data=X_test,
    cat_features=[
        'country_by_ip_address',
        'region_by_ip_address',
        'visitor_recognition_type',
    ],
)
y_pred = clf.predict(test_pool)

# zero_division=0.0 reports 0 for classes that are never predicted.
report = classification_report(
    y_test, y_pred, labels=clf.classes_, zero_division=0.0
)
print(report)

                   precision    recall  f1-score   support

          Browser       0.98      0.99      0.99      9328
Cloud Application       0.00      0.00      0.00         1
           Hacker       0.00      0.00      0.00       294
       Mobile App       0.00      0.00      0.00         2
            Robot       0.95      1.00      0.97      5285
          Special       0.67      0.06      0.10        36

         accuracy                           0.97     14946
        macro avg       0.43      0.34      0.34     14946
     weighted avg       0.95      0.97      0.96     14946



## With url_subclass

In [15]:
# Baseline columns plus the coarse first-path-segment feature, all categorical.
subclass_features = [
    'country_by_ip_address',
    'region_by_ip_address',
    'visitor_recognition_type',
    'url_subclass',
]

X = data[subclass_features]
y = data['ua_agent_class']

# Stratified split with a fixed seed so every experiment sees the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

train_pool = Pool(data=X_train, label=y_train, cat_features=subclass_features)

clf = CatBoostClassifier(iterations=100)
clf.fit(train_pool, verbose=False)

<catboost.core.CatBoostClassifier at 0x116a7aaa0>

In [16]:
# Score the held-out split; every column in X_test is treated as categorical.
test_pool = Pool(
    data=X_test,
    cat_features=[
        'country_by_ip_address',
        'region_by_ip_address',
        'visitor_recognition_type',
        'url_subclass',
    ],
)
y_pred = clf.predict(test_pool)

# zero_division=0.0 reports 0 for classes that are never predicted.
report = classification_report(
    y_test, y_pred, labels=clf.classes_, zero_division=0.0
)
print(report)

                   precision    recall  f1-score   support

          Browser       0.98      0.99      0.99      9328
Cloud Application       0.00      0.00      0.00         1
           Hacker       0.59      0.04      0.08       294
       Mobile App       0.00      0.00      0.00         2
            Robot       0.95      1.00      0.97      5285
          Special       1.00      0.06      0.11        36

         accuracy                           0.97     14946
        macro avg       0.59      0.35      0.36     14946
     weighted avg       0.97      0.97      0.96     14946



## With URL components

In [8]:
# Geo / visitor columns followed by the string-valued components from parse_url.
# The list-valued `filters` and `sub_filters` columns are not included.
geo_visitor_features = [
    'country_by_ip_address',
    'region_by_ip_address',
    'visitor_recognition_type',
]
url_component_features = [
    'url_function',
    'category',
    'category_code',
    'product_code',
    'other',
]
features = geo_visitor_features + url_component_features

X = data[features]
y = data['ua_agent_class']

# Stratified split with a fixed seed so every experiment sees the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

# Every selected column is passed to CatBoost as a categorical feature.
train_pool = Pool(data=X_train, label=y_train, cat_features=features)

clf = CatBoostClassifier(iterations=100)
clf.fit(train_pool, verbose=False)

<catboost.core.CatBoostClassifier at 0x11e229690>

In [9]:
# Score the held-out split with the same categorical column list used for training.
test_pool = Pool(data=X_test, cat_features=features)
y_pred = clf.predict(test_pool)

# zero_division=0.0 reports 0 for classes that are never predicted.
report = classification_report(
    y_test, y_pred, labels=clf.classes_, zero_division=0.0
)
print(report)

                   precision    recall  f1-score   support

          Browser       1.00      0.99      0.99      9328
Cloud Application       0.00      0.00      0.00         1
           Hacker       0.90      0.47      0.62       294
       Mobile App       0.00      0.00      0.00         2
            Robot       0.96      1.00      0.98      5285
          Special       1.00      0.06      0.11        36

         accuracy                           0.98     14946
        macro avg       0.64      0.42      0.45     14946
     weighted avg       0.98      0.98      0.98     14946

