In [1]:
import datetime
import random
import time
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
# Directory that holds the four CSV inputs. It is defined once so the notebook
# can be pointed at another location by editing a single line.
DATA_DIR = Path("F:\\1031_1872_bundle_archive")

# Raw inputs: the event log, the category tree, and the two item-property parts.
events_df = pd.read_csv(DATA_DIR / "events.csv")
category_tree_df = pd.read_csv(DATA_DIR / "category_tree.csv")
item_properties_1_df = pd.read_csv(DATA_DIR / "item_properties_part1.csv")
item_properties_2_df = pd.read_csv(DATA_DIR / "item_properties_part2.csv")

In [3]:
# Preview the first five rows of the event log.
events_df.head(5)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [4]:
# Preview the first five rows of the first item-properties part.
item_properties_1_df.head(5)

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


In [5]:
# Rows that assign an item to category 1016, ordered by timestamp.
# `value` is compared against the string '1016', not the integer.
is_category_row = item_properties_1_df["property"] == "categoryid"
is_category_1016 = item_properties_1_df["value"] == "1016"
item_properties_1_df[is_category_row & is_category_1016].sort_values(by="timestamp").head()

Unnamed: 0,timestamp,itemid,property,value
6363096,1431226800000,339403,categoryid,1016
8597591,1431226800000,161686,categoryid,1016
7942027,1431226800000,418837,categoryid,1016
10230975,1431226800000,85538,categoryid,1016
7280176,1431226800000,278463,categoryid,1016


In [19]:
# Relabel every 'addtocart' event as 'bid' in the event log.
findL = ['addtocart']
replaceL = ['bid']
events_df["event"] = events_df["event"].replace(dict(zip(findL, replaceL)))

In [7]:
# Show the event-type column.
events_df["event"]

0          view
1          view
2          view
3          view
4          view
           ... 
2756096    view
2756097    view
2756098    view
2756099    view
2756100    view
Name: event, Length: 2756101, dtype: object

In [20]:
# Distinct event types found on rows that carry no transaction id.
lacks_transaction_id = events_df["transactionid"].isnull()
events_df.loc[lacks_transaction_id, "event"].unique()

array(['view', 'bid'], dtype=object)

In [21]:
# Visitors with at least one row that carries a transaction id, i.e. buyers.
has_transaction_id = events_df["transactionid"].notnull()
customer_purchased = events_df.loc[has_transaction_id, "visitorid"].unique()
customer_purchased.size

11719

In [22]:
# Every distinct visitor id in the event log, buyers included.
all_customers = events_df["visitorid"].unique()
len(all_customers)

1407580

In [23]:
# Visitors who never purchased, kept in the order they appear in all_customers.
# One vectorised np.isin mask replaces the per-element `x not in customer_purchased`
# test, which rescanned the whole purchaser array for every visitor.
# list(...) keeps the result a plain list of NumPy integers, as before.
customer_browsed = list(all_customers[~np.isin(all_customers, customer_purchased)])

In [24]:
# Sanity check: count the browsing-only visitors that are NOT among the buyers.
# The count should equal len(customer_browsed) when the two groups are disjoint.
temp_array = np.isin(customer_browsed, customer_purchased)
temp_array[~temp_array].size

1395861

In [25]:
# Chronological event history of one example visitor (id 102019).
events_df.loc[events_df["visitorid"] == 102019].sort_values(by="timestamp")

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
19690,1433175714335,102019,view,49521,
19501,1433175801314,102019,bid,49521,
14842,1433175812596,102019,view,150318,
19573,1433175871497,102019,view,49521,
8701,1433175894837,102019,view,49521,
19708,1433175945872,102019,view,150318,
8740,1433176042269,102019,view,49521,
814,1433176736375,102019,transaction,150318,13556.0
19724,1433176736422,102019,transaction,49521,13556.0


In [26]:
# Rows that belong to a purchase (they carry a transaction id).
transactions_df = events_df[events_df.transactionid.notnull()]

# Visitors who made a purchase, in order of first appearance in the log.
customer_purchased = transactions_df.visitorid.unique()

# One list of purchased item ids per buying visitor, aligned with
# customer_purchased. A single grouped pass replaces the loop that re-filtered
# the whole event log once per buyer. sort=False keeps the groups in
# first-appearance order, which is the order unique() returns above, and the
# rows inside each group keep their original order.
purchased_items = (
    transactions_df.groupby("visitorid", sort=False)["itemid"].apply(list).tolist()
)

In [28]:
#by the same customer
def recommender_bought_bought(item_id, purchased_items):
    """Return the other items found in any basket that contains ``item_id``.

    Parameters
    ----------
    item_id : the item to base the recommendation on.
    purchased_items : iterable of per-visitor item collections (baskets).

    Returns
    -------
    list of distinct items that share a basket with ``item_id``, excluding
    ``item_id`` itself. The list is empty when no basket contains it.
    """
    # Flatten every basket that includes the requested item into one list.
    co_purchased = [
        item
        for basket in purchased_items
        if item_id in basket
        for item in basket
    ]

    # De-duplicate and drop the requested item from its own recommendations.
    return list(set(co_purchased) - {item_id})

In [29]:
# Distinct visitor ids in ascending order.
sorted_visitor_ids = events_df["visitorid"].sort_values()
all_visitors = sorted_visitor_ids.unique()
len(all_visitors)

1407580

In [30]:
# Visitors with at least one 'transaction' event, in ascending id order.
is_transaction = events_df["event"] == "transaction"
buying_visitors = events_df.loc[is_transaction, "visitorid"].sort_values().unique()

# Everyone else: visitors who never produced a 'transaction' event.
all_visitor_set = set(all_visitors)
buying_visitor_set = set(buying_visitors)
viewing_visitors_list = list(all_visitor_set - buying_visitor_set)

In [31]:
def create_dataframe(visitor_list):
    """Build one row of view/purchase features per requested visitor.

    Reads the module-level ``events_df`` and aggregates it in a single grouped
    pass, instead of filtering the full event log once per visitor.

    Parameters
    ----------
    visitor_list : iterable of visitor ids. Ids that do not occur in
        ``events_df`` get a row of zeros, and repeated ids get repeated rows.

    Returns
    -------
    pandas.DataFrame with one row per entry of ``visitor_list``, in the same
    order, and these columns:
        visitorid        -- the visitor id
        num_items_viewed -- number of distinct items in the visitor's 'view' events
        view_count       -- number of 'view' events
        bought_count     -- number of 'transaction' events
        purchased        -- 1 if bought_count > 0, otherwise 0
    """
    visitor_ids = pd.Index(list(visitor_list))

    # Restrict the log to the requested visitors once, then split by event type.
    visitor_events = events_df[events_df.visitorid.isin(visitor_ids)]
    views = visitor_events[visitor_events.event == 'view']
    purchases = visitor_events[visitor_events.event == 'transaction']

    def _aligned(counts):
        # Re-order the per-visitor counts to match visitor_list. Visitors with
        # no matching events are absent from the groupby result and get 0.
        return counts.reindex(visitor_ids, fill_value=0).to_numpy(dtype='int64')

    views_by_visitor = views.groupby('visitorid')
    bought_count = _aligned(purchases.groupby('visitorid').size())

    return pd.DataFrame(
        {
            'visitorid': visitor_ids.to_numpy(),
            # dropna=False mirrors the unique().size count used previously.
            'num_items_viewed': _aligned(views_by_visitor.itemid.nunique(dropna=False)),
            'view_count': _aligned(views_by_visitor.size()),
            'bought_count': bought_count,
            'purchased': (bought_count > 0).astype('int64'),
        },
        columns=['visitorid', 'num_items_viewed', 'view_count', 'bought_count', 'purchased'],
    )

In [32]:
# Feature rows for every visitor who has a 'transaction' event.
buying_visitors_df = create_dataframe(visitor_list=buying_visitors)

In [33]:
# Shuffle the browsing-only visitors reproducibly. The list comes from a set,
# so it is sorted first to fix the starting order, then shuffled with a fixed
# seed. An unseeded shuffle would pick a different sample, and so different
# model metrics, on every run.
viewing_visitors_list.sort()
random.Random(42).shuffle(viewing_visitors_list)

In [34]:
# Feature rows for the first 27820 shuffled browsing-only visitors.
# NOTE(review): the rationale for 27820 is not shown in this notebook; consider
# naming it as a constant.
viewing_visitors_df = create_dataframe(viewing_visitors_list[:27820])

In [35]:
# Combine buyers and sampled browsers, then shuffle the rows with a fixed seed.
# Building main_df in one expression keeps the cell idempotent; re-running it
# no longer re-shuffles an already shuffled frame. The fixed random_state makes
# the row order, and so the train/test split below, reproducible.
main_df = (
    pd.concat([buying_visitors_df, viewing_visitors_df], ignore_index=True)
    .sample(frac=1, random_state=42)
)

In [47]:
# Preview the first five rows of the shuffled modelling frame.
main_df.head(5)

Unnamed: 0,visitorid,num_items_viewed,view_count,bought_count,purchased
8801,1064436,1,1,1,1
27045,1256800,1,1,0,0
39135,40481,1,1,0,0
11319,1359060,1,2,1,1
9463,1140795,1,3,1,1


In [36]:
# Features: everything except the label, the id, and bought_count.
# bought_count is dropped because 'purchased' is derived directly from it.
X = main_df.drop(columns=['purchased', 'visitorid', 'bought_count'])
y = main_df['purchased']

# 70 % train / 30 % test, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
)

In [38]:
from sklearn.svm import SVC

In [39]:
# Fit a support-vector classifier with a sigmoid kernel on the training split.
# fit() returns the estimator itself, so the bare name below displays it.
classifier = SVC(kernel='sigmoid').fit(X_train, y_train)
classifier

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='sigmoid',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [40]:
# Predicted labels for the held-out test split.
y_pred = classifier.predict(X=X_test)

In [41]:
from sklearn.metrics import classification_report, confusion_matrix
# Print the confusion matrix, then the per-class precision/recall/F1 report.
for report in (
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
):
    print(report)

[[7231 1102]
 [1416 2113]]
              precision    recall  f1-score   support

           0       0.84      0.87      0.85      8333
           1       0.66      0.60      0.63      3529

    accuracy                           0.79     11862
   macro avg       0.75      0.73      0.74     11862
weighted avg       0.78      0.79      0.78     11862



In [42]:
import joblib

In [44]:
# Persist the fitted classifier to a file in the current working directory.
filename = 'finalized_model.sav'
joblib.dump(value=classifier, filename=filename)

['finalized_model.sav']

In [46]:
# Reload the model written above and check its mean accuracy on the test split.
# joblib files are pickle-based, so only load files from a trusted source.
loaded_model = joblib.load(filename)
result = loaded_model.score(X=X_test, y=y_test)
print(result)

0.7877255100320351


In [49]:
# Two hand-written feature rows. Each row is [num_items_viewed, view_count],
# the two columns the classifier was trained on.
# Note: this reassigns y_pred, which previously held the test-set predictions.
test_1 = [[1, 2], [4, 5]]
y_pred = classifier.predict(X=test_1)
print(y_pred)

[0 1]


In [54]:
# Translate each predicted label into a message: 0 -> "Not Recommended",
# anything else -> "Recommended".
for prediction in y_pred:
    print("Not Recommended" if prediction == 0 else "Recommended")

Not Recommended
Recommended
