In [1]:
#importing important libraries pandas numpy,pandas,seaborn,time-because timestamps are there
#sklearn for the machine learning models
#matplotlib for the plots and all
#We are using the logistic regression classifier here
import pandas as pd
import numpy as np

import datetime 
import time

%matplotlib inline
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
#Importing data sets
#We are using four data sets here

In [3]:
events_df = pd.read_csv('data/events.csv')
category_tree_df = pd.read_csv('data/category_tree.csv')
item_properties_1_df = pd.read_csv('data/item_properties_part1.csv')
item_properties_2_df = pd.read_csv('data/item_properties_part2.csv')

In [4]:
#So we are working on the 4 datasets

In [18]:
#this is the first dataset we are using in which there are five features
#1)The timestamp portion is in Unix Epoch format e.g. 1433221332117
#2)Visitor Id is the unique user currently browsing the website
#3)Event is what the user is currently doing in that current timestamp
#4)Transaction ID will only have value if the user made a purchase as shown below

#viewing of the database;
print("the shape of the dataset is",events_df.shape)
events_df.head(100)


the shape of the dataset is (2756101, 5)


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,
...,...,...,...,...,...
95,1433221135245,320957,view,102131,
96,1433223900862,132736,view,67627,
97,1433224352539,282587,view,78037,
98,1433222949983,901571,view,458588,


In [6]:
#getting some more insights about the dataset
events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2756101 entries, 0 to 2756100
Data columns (total 5 columns):
timestamp        int64
visitorid        int64
event            object
itemid           int64
transactionid    float64
dtypes: float64(1), int64(3), object(1)
memory usage: 105.1+ MB


In [7]:
#see two types of data types are there mainly object and the int.

In [8]:
#seeing the dataset2
item_properties_2_df.head()

Unnamed: 0,timestamp,itemid,property,value
0,1433041200000,183478,561,769062
1,1439694000000,132256,976,n26.400 1135780
2,1435460400000,420307,921,1149317 1257525
3,1431831600000,403324,917,1204143
4,1435460400000,230701,521,769062


In [9]:
#seeing the memory of the dataset is also very necessary as to see how much memory your dataset is taking.
item_properties_2_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9275903 entries, 0 to 9275902
Data columns (total 4 columns):
timestamp    int64
itemid       int64
property     object
value        object
dtypes: int64(2), object(2)
memory usage: 283.1+ MB


In [19]:
#seeing category of the event when transaction is not null
#see there are very less people who actually buy the product 
#means most of the people just see the product but donot actually purchase it.
events_df[events_df.transactionid.notnull()].event.unique

<bound method Series.unique of 130        transaction
304        transaction
418        transaction
814        transaction
843        transaction
              ...     
2755294    transaction
2755349    transaction
2755508    transaction
2755603    transaction
2755607    transaction
Name: event, Length: 22457, dtype: object>

In [20]:
print(events_df[events_df.transactionid.notnull()].event.unique())

['transaction']


In [6]:
#seeing category of the event when transaction is null
#As you see there are two categories
#event happens when people just view or addtocart but donot actually buy.
events_df[events_df.transactionid.isnull()].event.unique()

array(['view', 'addtocart'], dtype=object)

In [21]:
#seeing the item_properties having the item_id ,values and other things 
item_properties_1_df.head()

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


In [22]:
#category_tree_df contains the item id and its parent id
category_tree_df.head()

Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0


In [9]:
#Category IDs explain the relationship of different products with each other 
#e.g. Category ID 1016 is a child of Parent ID 213.
#Below shows the number of items under category id 1016
item_properties_1_df.loc[(item_properties_1_df.property == 'categoryid') & (item_properties_1_df.value == '1016')].sort_values('timestamp').head()

Unnamed: 0,timestamp,itemid,property,value
6363096,1431226800000,339403,categoryid,1016
8597591,1431226800000,161686,categoryid,1016
7942027,1431226800000,418837,categoryid,1016
10230975,1431226800000,85538,categoryid,1016
7280176,1431226800000,278463,categoryid,1016


In [23]:
#Customer Behaviour Exploration
#we categorise the customers into two categories that is that is who purchased something or who purchased nothing.
#first we calculate who purchased things

customer_purchased=events_df[events_df.transactionid.notnull()].visitorid.unique()

print("The Number of Customers who Purchased Something",customer_purchased.size)

print(customer_purchased)#customer ids who purchased something and it is an array

The Number of Customers who Purchased Something 11719
[ 599528  121688  552148 ... 1155978 1050575  855941]


In [12]:
#In this we are checking total how many customers are there in total

#Assumption

#Since we have no information whether there were any repeat users who bought something from the site,

#I'll just have to assume for now that the 11,719 visitors are unique and made at least a single purchase

all_customers=events_df.visitorid.unique()

all_customers.size

1407580

In [13]:
#these are the customers who donot purchase anything just browsed things or either view or add to cart
customer_browsed=[x for x in all_customers if x not in customer_purchased]
len(customer_browsed)

1395861

In [87]:
#using Numpy Array
#Previous work can also be possible using the Numpy array 
temp_array=np.isin(customer_browsed,customer_purchased)
temp_array[temp_array == False].size
#temp_array

1395861

In [34]:
#It is the journey of the visitorid 102019 as it starts from viewing the product to buying the product

events_df[events_df.visitorid == 102019].sort_values('timestamp')

#See how it starts from viewing then add to  cart then again viewing and at the end the customer made the tranaction
#This behavioral analysis is also the current work for recommender system because it is also possible to see the general trends in customer behaviour;

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
19690,1433175714335,102019,view,49521,
19501,1433175801314,102019,addtocart,49521,
14842,1433175812596,102019,view,150318,
19573,1433175871497,102019,view,49521,
8701,1433175894837,102019,view,49521,
19708,1433175945872,102019,view,150318,
8740,1433176042269,102019,view,49521,
814,1433176736375,102019,transaction,150318,13556.0
19724,1433176736422,102019,transaction,49521,13556.0


In [35]:
#the timestamp available here is in the unix epoch format
#we can change it in the human readable format

In [18]:
#Changing the unix time format to the datetime format
#within specifies format
tz = int('1433221332')
new_time = datetime.datetime.fromtimestamp(tz)
new_time.strftime('%Y-%m-%d %H:%M:%S')

'2015-06-02 10:32:12'

In [36]:
#It is just an example
tz = int('1438400163')
new_time = datetime.datetime.fromtimestamp(tz)
new_time.strftime('%Y-%m-%d %H:%M:%S')

'2015-08-01 09:06:03'

In [37]:
#What insights can we offer the visitor to guide them in their buying journey
#perhaps we can offer them a list of what previous visitors bought together with the item they are currently viewing
#It is nothing but the frequently bought together
customer_purchased = events_df[events_df.transactionid.notnull()].visitorid.unique()

purchased_items=[]

for customer in customer_purchased:
    purchased_items.append(list(events_df.loc[(events_df.visitorid == customer) & (events_df.transactionid.notnull())].itemid.values))
    

In [25]:
purchased_items

[[356475],
 [15335,
  380775,
  237753,
  317178,
  12836,
  400969,
  105792,
  25353,
  200793,
  80582,
  302422],
 [81345],
 [150318, 49521],
 [310791, 299044],
 [54058,
  284871,
  251130,
  268335,
  183049,
  261940,
  369093,
  370745,
  192990,
  277119,
  241716,
  283766,
  16417,
  217068,
  36372,
  68923,
  428015,
  69533,
  13520,
  385638,
  442871,
  136526,
  247862,
  93828,
  230911,
  382595,
  34853,
  216260,
  154812,
  445241,
  57702,
  347850,
  151855,
  226327,
  288525,
  51354,
  345994,
  170438,
  254301,
  266439,
  193718,
  388558,
  26745,
  184086,
  79956,
  252040,
  82232,
  309821,
  394518,
  462070,
  331980,
  353111,
  200527,
  235933,
  68532,
  358882,
  60012,
  29741,
  270487,
  163689,
  6913,
  156457,
  341578,
  163352,
  234493,
  135174,
  452481,
  241755,
  56323,
  210137,
  184397,
  285202,
  198690,
  195958,
  239210,
  71640,
  189108,
  369112,
  346186,
  211207,
  134330,
  257070,
  302239,
  459480,
  57577,
  1952

In [26]:
# Write a function that would show items that were bought together (same of different dates) by the same customer
def recommender_bought_bought(item_id, purchased_items):
    
    # Perhaps implement a binary search for that item id in the list of arrays
    # Then put the arrays containing that item id in a new list
    # Then merge all items in that list and get rid of duplicates
    
    recommender_list = []
    for x in purchased_items:
        if item_id in x:
            recommender_list += x
    
    #Then merge recommender list and remove the item id
    recommender_list = list(set(recommender_list) - set([item_id]))
    
    return recommender_list


In [27]:
#So now we can present to the visitor a list of the other items a customer previously bought along with 
#what item the current visitor is viewing e.g. item number 302422


recommender_bought_bought(302422, purchased_items)

[105792, 200793, 12836, 80582, 380775, 15335, 400969, 25353, 237753, 317178]

In [28]:
#What other insights can we gather from the items that were viewed, added to cart and sold
#total visitors on the website
all_visitors = events_df.visitorid.sort_values().unique()
all_visitors.size

1407580

In [29]:
#Buying Visitors
buying_visitors = events_df[events_df.event == 'transaction'].visitorid.sort_values().unique()
buying_visitors.size

11719

In [37]:
#Viewing Visitors
viewing_visitors_list = list(set(all_visitors) - set(buying_visitors))
viewing_visitors_list

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,


In [32]:
#Now lets create a function that creates a dataframe with new features: visitorid, number of items viewed, 
#total viewcount, bought something or not
def create_dataframe(visitor_list):
    
    array_for_df = []
    for index in visitor_list:

        #Create that visitor's dataframe once
        v_df = events_df[events_df.visitorid == index]

        temp = []
        #Add the visitor id
        temp.append(index)

        #Add the total number of unique products viewed
        temp.append(v_df[v_df.event == 'view'].itemid.unique().size)

        #Add the total number of views regardless of product type
        temp.append(v_df[v_df.event == 'view'].event.count())

        #Add the total number of purchases
        number_of_items_bought = v_df[v_df.event == 'transaction'].event.count()
        temp.append(number_of_items_bought)

        #Then put either a zero or one if they made a purchase
        if(number_of_items_bought == 0):
            temp.append(0)
        else:
            temp.append(1)

        array_for_df.append(temp)
    
    return pd.DataFrame(array_for_df, columns=['visitorid', 'num_items_viewed', 'view_count', 'bought_count', 'purchased'])

In [33]:
#Let's apply this to buying visitors first
buying_visitors_df=create_dataframe(buying_visitors)

In [88]:
#new Data frame structure
#buying_visitors_df.shape(dimension of the data)
buying_visitors_df.head()

Unnamed: 0,visitorid,num_items_viewed,view_count,bought_count,purchased
0,172,22,33,2,1
1,186,1,2,1,1
2,264,2,3,2,1
3,419,3,4,1,1
4,539,1,4,1,1


In [89]:
#randomly shuffling the list using the random.shuffle method
import random
random.shuffle(viewing_visitors_list)

In [40]:
#creating dataframe using previously defined function
viewing_visitors_df = create_dataframe(viewing_visitors_list[0:27820])

In [42]:
#shape of the data
viewing_visitors_df.shape

(27820, 5)

In [57]:
#Now let's combine the two dataframes
#that is buying visitors and viewing visitors
main_df = pd.concat([buying_visitors_df, viewing_visitors_df], ignore_index=True)

In [48]:
#Let's shuffle main_df first
#again shuufling the data using sample method
main_df = main_df.sample(frac=1)

In [64]:
#let's try a simple Logistic Regression model to predict future visitor purchase behaviourÂ¶
X = main_df.drop(['purchased', 'visitorid', 'bought_count'], axis = 'columns')
y = main_df.purchased

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, train_size = 0.7)

In [66]:
X_train.head()

Unnamed: 0,num_items_viewed,view_count
6637,10,11
23730,1,1
15486,1,1
35014,1,2
32992,1,1


In [67]:
y_train.head()

6637     1
23730    0
15486    0
35014    0
32992    0
Name: purchased, dtype: int64

In [68]:
X

Unnamed: 0,num_items_viewed,view_count
0,22,33
1,1,2
2,2,3
3,3,4
4,1,4
...,...,...
39534,1,1
39535,1,1
39536,1,2
39537,2,2


In [69]:
main_df.head()

Unnamed: 0,visitorid,num_items_viewed,view_count,bought_count,purchased
0,172,22,33,2,1
1,186,1,2,1,1
2,264,2,3,2,1
3,419,3,4,1,1
4,539,1,4,1,1


In [91]:
#having feature that is customer purchased it or not
y_test

25423    0
7672     1
33858    0
8156     1
5179     1
        ..
32244    0
1302     1
38271    0
4964     1
29786    0
Name: purchased, Length: 11862, dtype: int64

In [70]:
#Making of the classifier
logreg = LogisticRegression()

In [None]:
logreg.fit(X_train, y_train)

In [72]:
#predicting the data on build model on X-test
y_pred_class = logreg.predict(X_test)

In [73]:
#print the accuracy
print('accuracy = {:7.4f}'.format(metrics.accuracy_score(y_test, y_pred_class)))

accuracy =  0.7926


In [74]:
#So our model's accuracy in predicting buying visitors is around 79.46%