In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import time
import turicreate as tc
from sklearn.model_selection import train_test_split

import sys
sys.path.append("..")
import scripts.data_layer as data_layer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Load the customers to score and the raw transaction log
# (the log holds one pipe-delimited product string per row).
customers = pd.read_csv('../data/recommend_1.csv')
transactions = pd.read_csv('../data/trx_data.csv')

# Quick sanity check: dimensions followed by the first rows of each table.
print(customers.shape, customers.head(), sep='\n')
print(transactions.shape, transactions.head(), sep='\n')

(1000, 1)
   customerId
0        1553
1       20400
2       19750
3        6334
4       27773
(62483, 2)
   customerId                        products
0           0                              20
1           1  2|2|23|68|68|111|29|86|107|152
2           2       111|107|29|11|11|11|33|23
3           3                         164|227
4           5                             2|2


In [4]:
def parse_products(value):
    """Turn a pipe-delimited product string such as '2|2|23' into [2, 2, 23].

    Values that are not strings (e.g. lists produced by an earlier run of this
    cell) are returned unchanged, which keeps the cell safe to re-run.
    """
    if isinstance(value, str):
        return [int(token) for token in value.split('|')]
    return value


def count_purchases(trx):
    """Return one row per (customerId, productId) pair with its purchase count.

    Parameters
    ----------
    trx : pandas.DataFrame
        Must contain a 'customerId' column and a 'products' column whose
        entries are lists of integer product ids (one list per row).

    Returns
    -------
    pandas.DataFrame
        Columns 'customerId', 'productId' (int64) and 'purchase_count',
        ordered by customerId and then productId (groupby sorts its keys).
    """
    baskets = trx['products'].tolist()
    basket_sizes = np.fromiter((len(basket) for basket in baskets),
                               dtype=np.int64, count=len(baskets))
    # Repeat each customerId once per purchased item instead of widening the
    # table with .apply(pd.Series): no NaN padding, so product ids stay integer.
    purchases = pd.DataFrame({
        'customerId': np.repeat(trx['customerId'].values, basket_sizes),
        'productId': np.array([pid for basket in baskets for pid in basket],
                              dtype=np.int64),
    })
    return (purchases
            .groupby(['customerId', 'productId'])
            .size()
            .reset_index(name='purchase_count'))


# Split the product strings into lists of ints (idempotent: already-parsed
# rows are left alone, so re-running the cell does not raise).
transactions['products'] = transactions['products'].apply(parse_products)

# example 1: split product items into one column per position (first 5 rows only)
example_wide = transactions.head(5).set_index('customerId')['products'].apply(pd.Series).reset_index()

# example 2: organize a given table into a dataframe with customerId, single productId, and purchase count
example_counts = count_purchases(transactions.head(2))

# Full data set: same transformation applied to every transaction row.
data = count_purchases(transactions)

print(data.shape)
print(data.head())

(133585, 3)
   customerId  productId  purchase_count
0           0          1               2
1           0         13               1
2           0         19               3
3           0         20               1
4           0         31               2


In [5]:
# Customer x product matrix of purchase counts; pairs that never occurred are NaN.
df_matrix = data.pivot_table(
    index='customerId',
    columns='productId',
    values='purchase_count',
)
df_matrix.head()

productId,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,2.0,,,,,,,,,...,,,,,,,,,,
1,,,6.0,,,,,,,,...,,,,1.0,,,1.0,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

# Treat "never purchased" as a count of zero before measuring similarity.
df_matrix = df_matrix.fillna(value=0)

# Customer-to-customer cosine similarity; with a single argument the rows of
# df_matrix are compared against each other, same as passing it twice.
similarity_cosine = cosine_similarity(df_matrix)

In [7]:
# Preview the cosine-similarity rows of the first ten customers.
print(similarity_cosine[:10, :])

[[1.         0.01288848 0.         ... 0.         0.         0.        ]
 [0.01288848 1.         0.11527808 ... 0.         0.         0.        ]
 [0.         0.11527808 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.04402255 0.         ... 0.         0.         0.        ]
 [0.02554695 0.         0.         ... 0.1424941  0.         0.        ]
 [0.04364358 0.35437465 0.         ... 0.         0.         0.        ]]


In [8]:
# Pearson correlation between customers: each row of df_matrix is one variable
# (rowvar=True is numpy's default, spelled out here for the reader).
similarity_pearson = np.corrcoef(df_matrix, rowvar=True)

In [12]:
# Preview the Pearson-correlation rows of the first ten customers.
print(similarity_pearson[:10, :])

[[ 1.         -0.02921086 -0.02449684 ... -0.01138872 -0.01138872
  -0.01138872]
 [-0.02921086  1.          0.09191833 ... -0.01252529 -0.01252529
  -0.01252529]
 [-0.02449684  0.09191833  1.         ... -0.0071939  -0.0071939
  -0.0071939 ]
 ...
 [-0.02813276  0.01456017 -0.01777058 ... -0.00826164 -0.00826164
  -0.00826164]
 [-0.02762186 -0.06009653 -0.03451643 ...  0.13207813 -0.01604688
  -0.01604688]
 [ 0.00934416  0.32910736 -0.02265764 ... -0.01053366 -0.01053366
  -0.01053366]]
