In [17]:
import pandas as pd
import numpy as np

# Read and explore data

In [18]:
df = pd.read_csv('data/clickdata.csv')

In [19]:
df.head()

Unnamed: 0,epoch_ms,session_id,country_by_ip_address,region_by_ip_address,url_without_parameters,referrer_without_parameters,visitor_recognition_type,ua_agent_class
0,1520280001034,be73c8d1b836170a21529a1b23140f8e,US,CA,https://www.bol.com/nl/l/nederlandstalige-kuns...,,ANONYMOUS,Robot
1,1520280001590,c24c6637ed7dcbe19ad64056184212a7,US,CA,https://www.bol.com/nl/l/italiaans-natuur-wete...,,ANONYMOUS,Robot
2,1520280002397,ee391655f5680a7bfae0019450aed396,IT,LI,https://www.bol.com/nl/p/nespresso-magimix-ini...,https://www.bol.com/nl/p/nespresso-magimix-ini...,ANONYMOUS,Browser
3,1520280002598,f8c8a696dd37ca88233b2df096afa97f,US,CA,https://www.bol.com/nl/l/nieuwe-engelstalige-o...,,ANONYMOUS,Robot
4,1520280004428,f8b0c06747b7dd1d53c0932306bd04d6,US,CA,https://www.bol.com/nl/l/nieuwe-actie-avontuur...,,ANONYMOUS,Robot Mobile


In [20]:
df.groupby(['ua_agent_class', 'visitor_recognition_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,epoch_ms,session_id,country_by_ip_address,region_by_ip_address,url_without_parameters,referrer_without_parameters
ua_agent_class,visitor_recognition_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Browser,ANONYMOUS,20390,20390,20151,13918,20390,6823
Browser,LOGGEDIN,3076,3076,3074,2556,3076,1677
Browser,RECOGNIZED,12043,12043,12038,10193,12043,5872
Browser Webview,ANONYMOUS,849,849,849,707,849,307
Browser Webview,LOGGEDIN,406,406,406,287,406,82
Browser Webview,RECOGNIZED,545,545,545,460,545,203
Cloud Application,ANONYMOUS,2,2,2,2,2,0
Hacker,ANONYMOUS,1176,1176,1176,689,1176,0
Hacker,RECOGNIZED,1,1,1,1,1,0
Mobile App,ANONYMOUS,9,9,9,4,9,6


Each row in 'df' contains a page request in a session.

CSV column definitions:
* epoch_ms: epoch in milliseconds
* session_id: session identifier
* country_by_ip_address: estimated country based on GeoIP lookup
* region_by_ip_address: estimated region based on GeoIP lookup
* url_without_parameters: 
* referrer_without_parameters: 
* visitor_recognition_type: ANONYMOUS, RECOGNIZED (by cookie) or LOGGEDIN
* ua_agent_class: the class label

In [21]:
# clean up different types of missing values
df = df.replace(np.nan, '', regex=True)
df = df.replace('Unknown', '', regex=True)

In [22]:
# lets look at some of the columns
df['visitor_recognition_type'].unique()

array(['ANONYMOUS', 'LOGGEDIN', 'RECOGNIZED'], dtype=object)

In [23]:
df['country_by_ip_address'].unique()

array(['US', 'IT', 'NL', 'BE', '', 'UA', 'FR', 'DE', 'PL', 'CN', 'IE',
       'RU', 'GB', 'AT', 'HU', 'JP', 'CA', 'PT', 'ES', 'CH', 'LT', 'ID',
       'IN', 'TR', 'IR', 'MY', 'NZ', 'AU', 'TH', 'BD', 'QA', 'CZ', 'VN',
       'MN', 'IL', 'FI', 'AM', 'DK', 'SR', 'GR', 'SE', 'LV', 'PK', 'LU',
       'MA', 'MD', 'BG', 'BR', 'HR', 'AR', 'AL', 'MK', 'GH', 'PY', 'NO',
       'RO', 'BO', 'ZA', 'SO', 'MC', 'MX', 'KR', 'DO', 'CW', 'SK', 'KG'],
      dtype=object)

In [24]:
# Interesting values are 'Robot' and 'Browser' (not a robot)
print(df['ua_agent_class'].unique())

['Robot' 'Browser' 'Robot Mobile' 'Browser Webview' 'Hacker' 'Special'
 'Mobile App' 'Cloud Application']


In [25]:
# Reduce the amount of detail in classes
# Merge all different Human types
df['ua_agent_class'] = df['ua_agent_class'].str.replace('Browser Webview','Browser')
# Merge all different 'non hunam' types
df['ua_agent_class'] = df['ua_agent_class'].str.replace('Robot Mobile','Robot')
print(df['ua_agent_class'].unique())

['Robot' 'Browser' 'Hacker' 'Special' 'Mobile App' 'Cloud Application']


# Train a model

In [26]:
import pandas as pd
# select a few columns and transform them into features
df['referrer_without_parameters'] = df['referrer_without_parameters'].str.replace('^(?![\s\S])', 'Unknown', regex=True)

filter = df["referrer_without_parameters"]=="Unknown"
print(len(df.where(filter).dropna()))
df.groupby(['referrer_without_parameters']).count()

df['referrer_without_parameters'] = df['referrer_without_parameters'].str.replace('^(?!Unknown).*$','ref', regex=True)
print(len(df['referrer_without_parameters'].unique()))
print(df['referrer_without_parameters'].unique())

X = pd.get_dummies(data=df[['country_by_ip_address', 'region_by_ip_address','url_without_parameters','referrer_without_parameters','visitor_recognition_type']], drop_first=True)
y = df['ua_agent_class']

44778
2
['Unknown' 'ref']


In [27]:
# naively split the data and train a model
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.6, test_size=0.25, random_state=42)

In [28]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC as SVC
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

my_classifier = RandomForestClassifier(n_estimators=100)

# pick an existing classifier algorithm
#my_classifier = RandomForestClassifier(n_estimators=100) # 0.9707614077345109 URL train size 0.4.
#my_classifier = RandomForestClassifier(n_estimators=100) # 0.9729024488157366 URL train size 0.6.
#my_classifier = RandomForestClassifier(n_estimators=100) # 0.9729024488157366 URL and REFERRER train size 0.6.
#my_classifier = RandomForestClassifier(n_estimators=50) # 0.9746420446942325 URL and REFERRER and substituting all except Unknown with ref train size 0.6.
#my_classifier = RandomForestClassifier(n_estimators=100) # 0.9747089522280209 URL and REFERRER and substituting all except Unknown with ref train size 0.6.

#my_classifier = SVC() # 0.9687541817208618
#my_classifier = SVC() # 0.9727017262143718 using URL and train size 0.6 
#my_classifier = SVC() # 0.97276863374816 using URL and REFERRER and train size 0.6
#my_classifier = SVC() # 0.9740398768901378 using URL and REFERRER and and substituting all except Unknown with ref and train size 0.6

#my_classifier = svm.SVC(kernel='rbf') #0.965
#my_classifier = svm.SVC(kernel='rbf') #using URL too slow, stoped.

#my_classifier = KNeighborsClassifier(n_jobs=-1) # 0.9682858289843437
#my_classifier = KNeighborsClassifier(n_jobs=-1) # using URL too slow, stoped.
#my_classifier = KNeighborsClassifier(n_neighbors=20, n_jobs=-1) # 0.968152013916767




In [29]:
my_classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

# Evaluate the model

In [30]:
# model score
my_classifier.score(X_test, y_test)

0.9747089522280209

In [31]:
from sklearn.metrics import confusion_matrix
y_pred = my_classifier.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[9306,    0,    0,   41,    0],
       [ 118,   40,    0,  170,    0],
       [   3,    0,    0,    0,    0],
       [  20,    0,    0, 5219,    0],
       [  19,    0,    0,    7,    3]], dtype=int64)

# Predict a single element

In [32]:
# predict an individual data record
y_pred = my_classifier.predict([X_test.iloc[42]])[0]
y_real = y_test.iloc[42]
print(y_pred)
print(y_real)

Robot
Robot
