In [2]:
import pandas as pd
import numpy as np
import requests
import json

In [3]:
df = pd.read_csv('events.csv')
df.shape

(2756101, 5)

In [4]:
trans = df[df['event'] == 'transaction']
trans.shape

(22457, 5)

In [5]:
visitors = trans['visitorid'].unique()
items = trans['itemid'].unique()
print(visitors.shape)
print(items.shape)

(11719,)
(12025,)


In [6]:
trans2 = trans.groupby(['visitorid']).head(50)
trans2.shape

(19939, 5)

In [7]:
trans2

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
130,1433222276276,599528,transaction,356475,4000.0
304,1433193500981,121688,transaction,15335,11117.0
418,1433193915008,552148,transaction,81345,5444.0
814,1433176736375,102019,transaction,150318,13556.0
843,1433174518180,189384,transaction,310791,7244.0
...,...,...,...,...,...
2755082,1438388436295,1155978,transaction,430050,4316.0
2755285,1438380441389,218648,transaction,446271,10485.0
2755294,1438377176570,1050575,transaction,31640,8354.0
2755508,1438357730123,855941,transaction,235771,4385.0


In [8]:
trans2['visitors'] = trans2['visitorid'].apply(lambda x : np.argwhere(visitors == x)[0][0])
trans2['items'] = trans2['itemid'].apply(lambda x : np.argwhere(items == x)[0][0])
trans2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,visitors,items
130,1433222276276,599528,transaction,356475,4000.0,0,0
304,1433193500981,121688,transaction,15335,11117.0,1,1
418,1433193915008,552148,transaction,81345,5444.0,2,2
814,1433176736375,102019,transaction,150318,13556.0,3,3
843,1433174518180,189384,transaction,310791,7244.0,4,4
...,...,...,...,...,...,...,...
2755082,1438388436295,1155978,transaction,430050,4316.0,11716,6280
2755285,1438380441389,218648,transaction,446271,10485.0,3646,12024
2755294,1438377176570,1050575,transaction,31640,8354.0,11717,3246
2755508,1438357730123,855941,transaction,235771,4385.0,11718,2419


In [9]:
from scipy.sparse import csr_matrix


In [10]:
occurences = csr_matrix((visitors.shape[0], items.shape[0]), dtype='int8')
def set_occurences(visitor, item):
    occurences[visitor, item] += 1
trans2.apply(lambda row: set_occurences(row['visitors'], row['items']), axis=1)
occurences

  self._set_intXint(row, col, x.flat[0])


<11719x12025 sparse matrix of type '<class 'numpy.int8'>'
	with 18905 stored elements in Compressed Sparse Row format>

In [11]:
cooc = occurences.transpose().dot(occurences)
cooc.setdiag(0)

  self._set_arrayXarray(i, j, x)


In [12]:
def xLogX(x):
    return x * np.log(x) if x != 0 else 0.0
def entropy(x1, x2=0, x3=0, x4=0):
    return xLogX(x1 + x2 + x3 + x4) - xLogX(x1) - xLogX(x2) - xLogX(x3) - xLogX(x4)
def LLR(k11, k12, k21, k22):
    rowEntropy = entropy(k11 + k12, k21 + k22)
    columnEntropy = entropy(k11 + k21, k12 + k22)
    matrixEntropy = entropy(k11, k12, k21, k22)
    if rowEntropy + columnEntropy < matrixEntropy:
        return 0.0
    return 2.0 * (rowEntropy + columnEntropy - matrixEntropy)
def rootLLR(k11, k12, k21, k22):
    llr = LLR(k11, k12, k21, k22)
    sqrt = np.sqrt(llr)
    if k11 * 1.0 / (k11 + k12) < k21 * 1.0 / (k21 + k22):
        sqrt = -sqrt
    return sqrt

In [13]:
row_sum = np.sum(cooc, axis=0).A.flatten()
column_sum = np.sum(cooc, axis=1).A.flatten()
total = np.sum(row_sum, axis=0)
pp_score = csr_matrix((cooc.shape[0], cooc.shape[1]), dtype='double')
cx = cooc.tocoo()
for i,j,v in zip(cx.row, cx.col, cx.data):
    if v != 0:
        k11 = v
        k12 = row_sum[i] - k11
        k21 = column_sum[j] - k11
        k22 = total - k11 - k12 - k21
        pp_score[i,j] = rootLLR(k11, k12, k21, k22)

In [14]:
result = np.flip(np.sort(pp_score.A, axis=1), axis=1)
result_indices = np.flip(np.argsort(pp_score.A, axis=1), axis=1)

In [15]:
result

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [3.59861177, 3.59861177, 3.59861177, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [4.5174264 , 4.5174264 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [3.14261641, 3.14261641, 3.14261641, ..., 0.        , 0.        ,
        0.        ]])

In [16]:
result[8456]

array([15.33511076, 14.60017668,  3.62091635, ...,  0.        ,
        0.        ,  0.        ])

In [17]:
len(result_indices[8456])


12025

In [18]:
minLLR = 5
indicators = result[:, :50]
indicators[indicators < minLLR] = 0.0
indicators_indices = result_indices[:, :50]
max_indicator_indices = (indicators==0).argmax(axis=1)
max = max_indicator_indices.max()
indicators = indicators[:, :max+1]
indicators_indices = indicators_indices[:, :max+1]

In [22]:
actions = []
for i in range(indicators.shape[0]):
    length = indicators[i].nonzero()[0].shape[0]
    real_indicators = items[indicators_indices[i, :length]].astype("int").tolist()
    id = items[i]
    
    action = { "index" : { "_index" : "items2", "_id" : str(id) } }
    
    data = {
        "id": int(id),
        "indicators": real_indicators
    }

    actions.append(json.dumps(action))
    actions.append(json.dumps(data))
    
    if len(actions) == 200:
        actions_string = "\n".join(actions) + "\n"
        actions = []
        
        url = "http://127.0.0.1:9200/_bulk/"
        headers = {
            "Content-Type" : "application/x-ndjson"
        }
#         print(actions_string)
        requests.post(url, headers=headers, data=actions_string)
if len(actions) > 0:
    actions_string = "\n".join(actions) + "\n"
    actions = []
    url = "http://127.0.0.1:9200/_bulk/"
    headers = {
        "Content-Type" : "application/x-ndjson"
    }
    requests.post(url, headers=headers, data=actions_string)


In [20]:
popular = np.zeros(items.shape[0])
def inc_popular(index):
    popular[index] += 1
trans2.apply(lambda row: inc_popular(row['items']), axis=1)

130        None
304        None
418        None
814        None
843        None
           ... 
2755082    None
2755285    None
2755294    None
2755508    None
2755607    None
Length: 19939, dtype: object

In [21]:
actions = []
for i in range(indicators.shape[0]):
    length = indicators[i].nonzero()[0].shape[0]
    real_indicators = items[indicators_indices[i, :length]].astype("int").tolist()
    id = items[i]
    
    action = { "index" : { "_index" : "items3", "_id" : str(id) } }
    
#     url = "http://127.0.0.1:9200/items/_create/" + str(id)
    data = {
        "id": int(id),
        "indicators": real_indicators,
        "popular": popular[i]
    }
    
    actions.append(json.dumps(action))
    actions.append(json.dumps(data))
    
    if len(actions) == 200:
        actions_string = "\n".join(actions) + "\n"
        actions = []
        
        url = "http://127.0.0.1:9200/_bulk/"
        headers = {
            "Content-Type" : "application/x-ndjson"
        }
#         print(actions_string)
        requests.post(url, headers=headers, data=actions_string)
if len(actions) > 0:
    actions_string = "\n".join(actions) + "\n"
    actions = []
    url = "http://127.0.0.1:9200/_bulk/"
    headers = {
        "Content-Type" : "application/x-ndjson"
    }
    requests.post(url, headers=headers, data=actions_string)