In [1]:
import pandas as pd
import numpy as np

# Read and explore data

In [3]:
# Load the raw click log from disk into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='data/clickdata.csv')

In [4]:
# Preview the first five rows to check that the file parsed as expected.
df.head(n=5)

Unnamed: 0,epoch_ms,session_id,country_by_ip_address,region_by_ip_address,url_without_parameters,referrer_without_parameters,visitor_recognition_type,ua_agent_class
0,1520280001034,be73c8d1b836170a21529a1b23140f8e,US,CA,https://www.bol.com/nl/l/nederlandstalige-kuns...,,ANONYMOUS,Robot
1,1520280001590,c24c6637ed7dcbe19ad64056184212a7,US,CA,https://www.bol.com/nl/l/italiaans-natuur-wete...,,ANONYMOUS,Robot
2,1520280002397,ee391655f5680a7bfae0019450aed396,IT,LI,https://www.bol.com/nl/p/nespresso-magimix-ini...,https://www.bol.com/nl/p/nespresso-magimix-ini...,ANONYMOUS,Browser
3,1520280002598,f8c8a696dd37ca88233b2df096afa97f,US,CA,https://www.bol.com/nl/l/nieuwe-engelstalige-o...,,ANONYMOUS,Robot
4,1520280004428,f8b0c06747b7dd1d53c0932306bd04d6,US,CA,https://www.bol.com/nl/l/nieuwe-actie-avontuur...,,ANONYMOUS,Robot Mobile


In [5]:
# Count the non-null values of every column for each
# (agent class, recognition type) combination.
df.groupby(by=['ua_agent_class', 'visitor_recognition_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,epoch_ms,session_id,country_by_ip_address,region_by_ip_address,url_without_parameters,referrer_without_parameters
ua_agent_class,visitor_recognition_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Browser,ANONYMOUS,20390,20390,20151,13918,20390,6823
Browser,LOGGEDIN,3076,3076,3074,2556,3076,1677
Browser,RECOGNIZED,12043,12043,12038,10193,12043,5872
Browser Webview,ANONYMOUS,849,849,849,707,849,307
Browser Webview,LOGGEDIN,406,406,406,287,406,82
Browser Webview,RECOGNIZED,545,545,545,460,545,203
Cloud Application,ANONYMOUS,2,2,2,2,2,0
Hacker,ANONYMOUS,1176,1176,1176,689,1176,0
Hacker,RECOGNIZED,1,1,1,1,1,0
Mobile App,ANONYMOUS,9,9,9,4,9,6


Each row in 'df' contains a page request in a session.

CSV column definitions:
* epoch_ms: epoch in milliseconds
* session_id: session identifier
* country_by_ip_address: estimated country based on GeoIP lookup
* region_by_ip_address: estimated region based on GeoIP lookup
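* url_without_parameters: URL of the requested page, with the query parameters removed
* referrer_without_parameters: URL of the referring page, with the query parameters removed (may be missing)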
* visitor_recognition_type: ANONYMOUS, RECOGNIZED (by cookie) or LOGGEDIN
* ua_agent_class: the class label

In [6]:
# Normalise the different encodings of "missing" (NaN and the literal
# string 'Unknown') to the empty string.
# Plain, non-regex replacement is used on purpose: with regex=True the
# pattern 'Unknown' is treated as a substring match and would also be
# stripped out of the middle of longer strings (for example a URL), not
# just from cells whose whole value is 'Unknown'.
df = df.replace(np.nan, '').replace('Unknown', '')

In [7]:
# Let's look at some of the columns, starting with the distinct ways
# a visitor can be recognised.
df.loc[:, 'visitor_recognition_type'].unique()

array(['ANONYMOUS', 'LOGGEDIN', 'RECOGNIZED'], dtype=object)

In [8]:
# Distinct GeoIP country codes seen in the data.
df.loc[:, 'country_by_ip_address'].unique()

array(['US', 'IT', 'NL', 'BE', '', 'UA', 'FR', 'DE', 'PL', 'CN', 'IE',
       'RU', 'GB', 'AT', 'HU', 'JP', 'CA', 'PT', 'ES', 'CH', 'LT', 'ID',
       'IN', 'TR', 'IR', 'MY', 'NZ', 'AU', 'TH', 'BD', 'QA', 'CZ', 'VN',
       'MN', 'IL', 'FI', 'AM', 'DK', 'SR', 'GR', 'SE', 'LV', 'PK', 'LU',
       'MA', 'MD', 'BG', 'BR', 'HR', 'AR', 'AL', 'MK', 'GH', 'PY', 'NO',
       'RO', 'BO', 'ZA', 'SO', 'MC', 'MX', 'KR', 'DO', 'CW', 'SK', 'KG'],
      dtype=object)

In [9]:
# The labels of main interest are 'Robot' and 'Browser' (i.e. not a robot).
print(pd.unique(df['ua_agent_class']))

['Robot' 'Browser' 'Robot Mobile' 'Browser Webview' 'Hacker' 'Special'
 'Mobile App' 'Cloud Application']


In [10]:
# Coarsen the class labels by folding sub-types into their parent class.
df['ua_agent_class'] = (
    df['ua_agent_class']
    # merge all the different human types
    .str.replace('Browser Webview', 'Browser')
    # merge all the different non-human types
    .str.replace('Robot Mobile', 'Robot')
)
print(df['ua_agent_class'].unique())

['Robot' 'Browser' 'Hacker' 'Special' 'Mobile App' 'Cloud Application']


# Train a model

In [11]:
import pandas as pd
# Target: the agent class label of each request.
y = df['ua_agent_class']
# Features: one-hot encode a few categorical columns; drop_first=True
# omits the first level of every encoded column.
X = pd.get_dummies(
    data=df.loc[:, ['country_by_ip_address', 'region_by_ip_address', 'visitor_recognition_type']],
    drop_first=True,
)

In [12]:
# Naive hold-out split: individual rows are shuffled, so page requests
# from one session can land in both the training and the test set.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,  # the library default, spelled out
    shuffle=True,    # the library default, spelled out
    random_state=42,
)

In [13]:
from sklearn.neighbors import KNeighborsClassifier

# Use an off-the-shelf k-nearest-neighbours classifier with its default
# hyper-parameters; n_jobs=-1 requests all available processors.
my_classifier = KNeighborsClassifier(
    n_jobs=-1,
)

In [14]:
# Fit the classifier on the training split.
my_classifier.fit(X=X_train, y=y_train)

KNeighborsClassifier(n_jobs=-1)

# Evaluate the model

In [None]:
# Mean accuracy of the fitted model on the held-out test split.
my_classifier.score(X=X_test, y=y_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Predict every test row, then tabulate true classes (rows) against
# predicted classes (columns).
y_pred = my_classifier.predict(X=X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

# Predict a single element

In [None]:
# Predict a single record from the test split and compare it with its
# true label.
# The row is selected with a list index (iloc[[...]]) so it stays a
# one-row DataFrame that carries the column names the model was fitted
# with. Wrapping a bare Series in a list drops those names and makes
# scikit-learn warn that X does not have valid feature names.
# NOTE: this rebinds y_pred (the array of test predictions from the
# confusion-matrix cell) to a single label.
sample_idx = 42  # position of the test row to inspect
y_pred = my_classifier.predict(X_test.iloc[[sample_idx]])[0]
y_real = y_test.iloc[sample_idx]
print(y_pred)
print(y_real)