In [1]:
#importing packages
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

import copy
import itertools
from collections import defaultdict
from operator import itemgetter

In [2]:
# Data load and cleaning
# Raw timestamps look like '2014-04-07T10:51:09.277Z' (literal trailing 'Z').
TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%S.%fZ'
# Only the first N_ROWS lines of each file are loaded.
N_ROWS = 100000


def dateparse(x):
    """Parse a single raw timestamp string into a naive ``datetime``.

    Kept under its original name for any later cell that calls it. Uses
    ``datetime.strptime`` directly because the ``pd.datetime`` alias no
    longer exists in current pandas releases.
    """
    return datetime.strptime(x, TIMESTAMP_FORMAT)


def _read_events(path, names, dtype, nrows=N_ROWS):
    """Read one header-less, comma-separated event file into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the ``.dat`` file.
    names : list of str
        Column names to assign; must include ``'datetime'``.
    dtype : dict
        Column dtypes for the non-timestamp columns.
    nrows : int
        Maximum number of rows to read.

    Returns
    -------
    pandas.DataFrame
        The loaded rows with ``datetime`` converted to timezone-naive
        timestamps.
    """
    # Read the timestamp column as plain text and convert it in one vectorised
    # call instead of relying on the deprecated ``date_parser`` argument.
    frame = pd.read_csv(
        path,
        delimiter=',',
        header=None,
        names=names,
        dtype=dict(dtype, datetime=str),
        nrows=nrows,
    )
    # utc=True followed by tz_localize(None) always yields naive timestamps,
    # whether or not pandas treats the trailing 'Z' as a timezone marker.
    frame['datetime'] = pd.to_datetime(
        frame['datetime'], format=TIMESTAMP_FORMAT, utc=True
    ).dt.tz_localize(None)
    return frame


clicks = _read_events(
    './data/yoochoose-clicks.dat',
    names=['session_id', 'datetime', 'item_id', 'category'],
    dtype={'session_id': int, 'item_id': int, 'category': str},
)
buys = _read_events(
    './data/yoochoose-buys.dat',
    names=['session_id', 'datetime', 'item_id', 'price', 'quantity'],
    dtype={'session_id': int, 'item_id': int},
)

In [3]:
# Build a cleaned copy of the raw category column.
clicks['category_clean'] = clicks['category']

# Collapse every category code longer than two characters into the single
# bucket 'B'. Assigning through .loc writes to `clicks` itself; the previous
# chained form (clicks.category_clean[mask] = ...) raised
# SettingWithCopyWarning and is not guaranteed to modify the frame.
# .str.len() also tolerates missing category values, unlike apply(len).
is_long_code = clicks['category_clean'].str.len() > 2
clicks.loc[is_long_code, 'category_clean'] = 'B'

# Count, per (item, category), the click rows whose cleaned category is not
# one of the placeholder codes '0', 'S' or 'B'. The count ends up in the
# 'category' column of the result.
has_real_category = ~clicks['category_clean'].isin(['0', 'S', 'B'])
clicks_category_counts = (
    clicks[has_real_category]
    .groupby(['item_id', 'category_clean'])['category']
    .count()
    .reset_index()
)

# For each item keep the category with the highest count; on ties the first
# one in group order wins. The result is a Series: item_id -> category_clean.
is_top_count = (
    clicks_category_counts.groupby('item_id')['category'].transform('max')
    == clicks_category_counts['category']
)
clicks_category_map = (
    clicks_category_counts[is_top_count]
    .groupby('item_id')['category_clean']
    .first()
)

# Back-fill rows still labelled '0' whose item has a known dominant category.
needs_fill = (clicks['category_clean'] == '0') & clicks['item_id'].isin(clicks_category_map.index)
clicks.loc[needs_fill, 'category_clean'] = clicks.loc[needs_fill, 'item_id'].map(clicks_category_map)

# Order clicks chronologically within each session.
clicks = clicks.sort_values(['session_id', 'datetime'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [8]:
# Preview the first five click rows, including the derived category_clean column.
clicks.head(n=5)

Unnamed: 0,session_id,datetime,item_id,category,category_clean
0,1,2014-04-07 10:51:09.277,214536502,0,0
1,1,2014-04-07 10:54:09.868,214536500,0,0
2,1,2014-04-07 10:54:46.998,214536506,0,0
3,1,2014-04-07 10:57:00.306,214577561,0,0
4,2,2014-04-07 13:56:37.614,214662742,0,0


In [4]:
# Number of click rows per session, indexed by session_id.
sessionFreq = clicks["session_id"].value_counts()
# Passing the Series itself to map() performs the same index lookup as the
# former per-row lambda, without a Python-level call per row.
clicks["sessionFreq"] = clicks["session_id"].map(sessionFreq)
# 1 if the session_id occurs in the loaded `buys` frame, otherwise 0.
# NOTE(review): both files are read with an nrows cap, so a session whose
# purchases lie outside the loaded buys rows is labelled 0 here; confirm this
# is acceptable for the analysis.
clicks["didbuy"] = clicks["session_id"].isin(buys["session_id"]).astype(int)

In [5]:
# One transaction per buying session: the list of item_ids clicked in it
# (duplicates kept, in the frame's current row order).
buying_rows = clicks.loc[clicks['didbuy'].eq(1)]
clicks_buy = buying_rows.groupby('session_id')['item_id'].apply(list)

In [6]:
# One transaction per non-buying session: the list of item_ids clicked in it
# (duplicates kept, in the frame's current row order).
non_buying_rows = clicks.loc[clicks['didbuy'].eq(0)]
clicks_nobuy = non_buying_rows.groupby('session_id')['item_id'].apply(list)

In [13]:
from apyori import apriori

# Thresholds shared by both mining runs, so the buy / no-buy rule sets are
# produced under identical settings.
# NOTE(review): confirm that apyori actually honours `min_length`; if it only
# recognises `max_length`, this key has no effect.
rule_thresholds = {
    'min_support': 0.003,
    'min_confidence': 0.2,
    'min_lift': 3,
    'min_length': 2,
}

buyrules = apriori(clicks_buy, **rule_thresholds)
nobuyrules = apriori(clicks_nobuy, **rule_thresholds)

# Collect the mined rules into lists so they can be displayed and reused.
buyresult = [record for record in buyrules]
nobuyresult = [record for record in nobuyrules]

In [14]:
# Display every rule mined from sessions that ended in a purchase.
buyresult

[RelationRecord(items=frozenset({214558192, 214827005}), support=0.0030084235860409147, ordered_statistics=[OrderedStatistic(items_base=frozenset({214558192}), items_add=frozenset({214827005}), confidence=0.7142857142857144, lift=79.14285714285715), OrderedStatistic(items_base=frozenset({214827005}), items_add=frozenset({214558192}), confidence=0.33333333333333337, lift=79.14285714285715)]),
 RelationRecord(items=frozenset({214558192, 214828987}), support=0.0036101083032490976, ordered_statistics=[OrderedStatistic(items_base=frozenset({214558192}), items_add=frozenset({214828987}), confidence=0.8571428571428572, lift=79.14285714285715), OrderedStatistic(items_base=frozenset({214828987}), items_add=frozenset({214558192}), confidence=0.33333333333333337, lift=79.14285714285715)]),
 RelationRecord(items=frozenset({214840762, 214586983}), support=0.0048134777376654635, ordered_statistics=[OrderedStatistic(items_base=frozenset({214586983}), items_add=frozenset({214840762}), confidence=0.888

In [15]:
# Display every rule mined from sessions without a purchase.
nobuyresult

[RelationRecord(items=frozenset({214684513, 214839373}), support=0.005035971223021582, ordered_statistics=[OrderedStatistic(items_base=frozenset({214684513}), items_add=frozenset({214839373}), confidence=0.27293577981651373, lift=28.792287844036693), OrderedStatistic(items_base=frozenset({214839373}), items_add=frozenset({214684513}), confidence=0.53125, lift=28.792287844036696)]),
 RelationRecord(items=frozenset({214716928, 214717003}), support=0.005374523910283538, ordered_statistics=[OrderedStatistic(items_base=frozenset({214716928}), items_add=frozenset({214717003}), confidence=0.5594713656387664, lift=29.842682550889506), OrderedStatistic(items_base=frozenset({214717003}), items_add=frozenset({214716928}), confidence=0.2866817155756208, lift=29.84268255088951)]),
 RelationRecord(items=frozenset({214716928, 214717007}), support=0.00385103681760474, ordered_statistics=[OrderedStatistic(items_base=frozenset({214716928}), items_add=frozenset({214717007}), confidence=0.4008810572687225