In [219]:
# import required libraries
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from heapq import nlargest
from sklearn.metrics import mean_squared_error
from math import sqrt
import os.path
import scipy
import scipy.stats as st
import json
from scipy import sparse

## Import Data

In [152]:
# Load the raw event export and flatten each nested record into dotted
# columns such as 'properties.customer_id'.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding (which is not UTF-8 everywhere).
with open('training_mixpanel.txt', encoding='utf-8') as f:
    data = json.load(f)
custDF = json_normalize(data)
custDF

Unnamed: 0,event,properties.country,properties.customer_id,properties.description,properties.invoice_date,properties.invoice_no,properties.product_id,properties.quantity,properties.unit_price
0,Purchased Product,United Kingdom,17850,WHITE HANGING HEART T-LIGHT HOLDER,12/1/2010 8:26,536365,85123A,6,2.55
1,Purchased Product,United Kingdom,17850,WHITE METAL LANTERN,12/1/2010 8:26,536365,71053,6,3.39
2,Purchased Product,United Kingdom,17850,SET 7 BABUSHKA NESTING BOXES,12/1/2010 8:26,536365,22752,2,7.65
3,Purchased Product,United Kingdom,17850,GLASS STAR FROSTED T-LIGHT HOLDER,12/1/2010 8:26,536365,21730,6,4.25
4,Purchased Product,United Kingdom,17850,HAND WARMER UNION JACK,12/1/2010 8:28,536366,22633,6,1.85
5,Purchased Product,United Kingdom,17850,HAND WARMER RED POLKA DOT,12/1/2010 8:28,536366,22632,6,1.85
6,Purchased Product,United Kingdom,13047,ASSORTED COLOUR BIRD ORNAMENT,12/1/2010 8:34,536367,84879,32,1.69
7,Purchased Product,United Kingdom,13047,POPPY'S PLAYHOUSE BEDROOM,12/1/2010 8:34,536367,22745,6,2.10
8,Purchased Product,United Kingdom,13047,POPPY'S PLAYHOUSE KITCHEN,12/1/2010 8:34,536367,22748,6,2.10
9,Purchased Product,United Kingdom,13047,FELTCRAFT PRINCESS CHARLOTTE DOLL,12/1/2010 8:34,536367,22749,8,3.75


In [153]:
# Count the distinct products and customers in the event log. NaN (if any) is
# counted as one value, exactly as len(Series.unique()) would.
# NOTE(review): in file order these counts are taken before the
# quantity > 0 filter further down -- confirm that is intended.
numItems = custDF['properties.product_id'].nunique(dropna=False)
numCustomers = custDF['properties.customer_id'].nunique(dropna=False)

print("Number of users:", numCustomers)
print("Number of items:", numItems)

Number of users: 4363
Number of items: 3677


## Data Exploration

In [165]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
custDF.describe()

Unnamed: 0,properties.customer_id,properties.quantity,properties.unit_price
count,315955.0,315955.0,315955.0
mean,15292.471308,13.161225,2.87611
std,1712.436354,201.343614,4.399119
min,12346.0,1.0,0.0
25%,13969.0,2.0,1.25
50%,15157.0,6.0,1.95
75%,16794.0,12.0,3.75
max,18287.0,80995.0,649.5


Upon initial exploration, some quantity values are negative, which could indicate that an item was returned. These records should be excluded from the dataset, as we are only interested in items customers have purchased.

In [166]:
# Keep only the rows whose purchased quantity is strictly positive.
custDF = custDF.loc[custDF['properties.quantity'].gt(0)]

Verify the dataset only contains records where the quantity purchased is greater than 0.

In [167]:
# Re-run the summary: the minimum of properties.quantity should now be above 0.
custDF.describe()

Unnamed: 0,properties.customer_id,properties.quantity,properties.unit_price
count,315955.0,315955.0,315955.0
mean,15292.471308,13.161225,2.87611
std,1712.436354,201.343614,4.399119
min,12346.0,1.0,0.0
25%,13969.0,2.0,1.25
50%,15157.0,6.0,1.95
75%,16794.0,12.0,3.75
max,18287.0,80995.0,649.5


Check for nulls

In [168]:
# Column dtypes and non-null counts; a non-null count below the number of
# entries would indicate missing values in that column.
custDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 315955 entries, 0 to 322874
Data columns (total 9 columns):
event                      315955 non-null object
properties.country         315955 non-null object
properties.customer_id     315955 non-null int64
properties.description     315955 non-null object
properties.invoice_date    315955 non-null object
properties.invoice_no      315955 non-null object
properties.product_id      315955 non-null object
properties.quantity        315955 non-null int64
properties.unit_price      315955 non-null float64
dtypes: float64(1), int64(2), object(6)
memory usage: 24.1+ MB


In [169]:
# Total units sold per product, largest first.
popItems = (
    custDF.groupby('properties.product_id')['properties.quantity']
    .sum()
    .sort_values(ascending=False)
)
# Bar chart of the 75 best-selling products (plotting requires matplotlib to be installed).
popItems.head(75).plot(kind='bar', grid=True, rot=270, figsize=(15,15), title='Most Purchased Items')

ModuleNotFoundError: No module named 'matplotlib'

## Item-Item Collaborative Filtering

In [299]:
# Customer x product matrix initialised to all zeros: one row per distinct
# customer id and one column per distinct product id.
popDF = pd.DataFrame(
    0,
    index=custDF['properties.customer_id'].unique(),
    columns=custDF['properties.product_id'].unique(),
)
popDF

Unnamed: 0,85123A,71053,22752,21730,22633,22632,84879,22745,22748,22749,...,90214O,90214T,90214U,90214V,90214W,90214Z,84805A,85225,90089,23843
17850,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13047,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12583,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13748,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14688,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15311,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16098,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18074,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [300]:
# (customer_id, product_id) pair of every purchase row. The name is kept
# unchanged in case other cells reference it.
index = list(zip(list(custDF['properties.customer_id']), list(custDF['properties.product_id'])))

# Mark every purchased (customer, product) cell with 1. Instead of one
# popDF.at write per purchase row, translate the ids to row/column positions
# and set all cells in a single vectorised assignment.
rowPos = popDF.index.get_indexer(custDF['properties.customer_id'])
colPos = popDF.columns.get_indexer(custDF['properties.product_id'])
# get_indexer returns -1 for unknown labels, which would silently address the
# last row/column; fail loudly if popDF was not built from this custDF.
assert (rowPos >= 0).all() and (colPos >= 0).all(), "popDF is missing customer or product ids"

purchased = popDF.values.copy()
purchased[rowPos, colPos] = 1
popDF = pd.DataFrame(purchased, index=popDF.index, columns=popDF.columns)
popDF

Unnamed: 0,85123A,71053,22752,21730,22633,22632,84879,22745,22748,22749,...,90214O,90214T,90214U,90214V,90214W,90214Z,84805A,85225,90089,23843
17850,1,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13047,0,0,0,0,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
12583,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13748,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15291,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14688,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15311,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16098,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18074,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [301]:
# Hold out 20% of the customers (rows of popDF) as a test set.
# A fixed random_state makes the split reproducible across kernel restarts;
# without it a different set of customers is held out on every run.
# NOTE(review): a later cell looks up customer 15134 in `test`; confirm that
# id is in test.index for this seed, or pick an id from test.index instead.
train, test = train_test_split(popDF, test_size=0.2, random_state=42)

In [302]:
# Inspect the held-out customers.
test

Unnamed: 0,85123A,71053,22752,21730,22633,22632,84879,22745,22748,22749,...,90214O,90214T,90214U,90214V,90214W,90214Z,84805A,85225,90089,23843
14142,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15134,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17732,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16491,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16748,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14286,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17828,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16686,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
14682,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12876,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [278]:
# Scale every customer's row to unit Euclidean (L2) length.
# NOTE(review): in file order train/test are split before this cell, and this
# cell rebinds popDF to a new frame, so train/test are not rescaled by it.
norm = np.sqrt((popDF ** 2).sum(axis=1))
popDF = popDF.div(norm, axis=0)
popDF

Unnamed: 0,85123A,71053,22752,21730,22633,22632,84879,22745,22748,22749,...,90214O,90214T,90214U,90214V,90214W,90214Z,84805A,85225,90089,23843
17850,0.267261,0.267261,0.267261,0.267261,0.267261,0.267261,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
13047,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.109109,0.109109,0.109109,0.109109,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
12583,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
13748,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
15100,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
15291,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.147442,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
14688,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
15311,0.000000,0.000000,0.047298,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
16098,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
18074,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0


In [303]:
# userMapping: customer id -> row position in popDF.
# itemMapping: column position in popDF -> product id.
# NOTE(review): these positions follow popDF's row order, while the prediction
# matrix is built from `train` (a subset of popDF's rows) -- confirm that the
# positions really line up before trusting per-customer lookups.
userMapping = {customerId: position for position, customerId in enumerate(popDF.index)}
itemMapping = dict(enumerate(popDF.columns))

In [304]:
# Convert the train/test frames to plain NumPy arrays for the item-item computation.
# `.values` replaces DataFrame.as_matrix(), which is deprecated and was
# removed in pandas 1.0; `.values` is available in old and new pandas alike.
trainMatrix = train.values
testMatrix = test.values


  
  This is separate from the ipykernel package so we can avoid doing imports until


In [281]:
# Item-item cosine similarity computed from the training customers only.
# NOTE(review): calculate_similarity is defined in a later cell of this
# notebook; on a fresh kernel that cell has to be run before this one.
itemSimilarity = calculate_similarity(train) #pairwise_distances(trainMatrix.T, metric='cosine')

In [282]:
# Inspect the item-item similarity matrix (rows and columns are both labelled by product id).
itemSimilarity

Unnamed: 0,85123A,71053,22752,21730,22633,22632,84879,22745,22748,22749,...,90214O,90214T,90214U,90214V,90214W,90214Z,84805A,85225,90089,23843
85123A,1.000000,0.113996,0.058765,0.088775,0.066835,0.073272,0.150983,0.041780,0.031840,0.082332,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.013180,0.0
71053,0.113996,1.000000,0.023316,0.015474,0.045320,0.030837,0.058993,0.050877,0.016117,0.028526,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0
22752,0.058765,0.023316,1.000000,0.013296,0.022602,0.052479,0.049213,0.054033,0.044761,0.037171,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0
21730,0.088775,0.015474,0.013296,1.000000,0.008748,0.041714,0.006973,0.002912,0.007603,0.022002,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0
22633,0.066835,0.045320,0.022602,0.008748,1.000000,0.286651,0.063869,0.043325,0.043847,0.111259,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0
22632,0.073272,0.030837,0.052479,0.041714,0.286651,1.000000,0.045485,0.036655,0.030458,0.048394,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0
84879,0.150983,0.058993,0.049213,0.006973,0.063869,0.045485,1.000000,0.061222,0.058488,0.046835,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0
22745,0.041780,0.050877,0.054033,0.002912,0.043325,0.036655,0.061222,1.000000,0.612432,0.103117,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0
22748,0.031840,0.016117,0.044761,0.007603,0.043847,0.030458,0.058488,0.612432,1.000000,0.156070,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0
22749,0.082332,0.028526,0.037171,0.022002,0.111259,0.048394,0.046835,0.103117,0.156070,1.000000,...,0.090109,0.090109,0.090109,0.090109,0.090109,0.090109,0.0,0.0,0.000000,0.0


In [283]:
def calculate_similarity(data_items):
    """Return the pairwise cosine similarity between the columns of a frame.

    Args:
        data_items: DataFrame whose columns are the entities to compare.

    Returns:
        A square DataFrame whose index and columns are both
        ``data_items.columns`` and whose cells hold the cosine similarity
        between the corresponding pair of columns.
    """
    columnLabels = data_items.columns
    # Transpose so that every original column becomes a row of the sparse
    # matrix handed to cosine_similarity.
    columnsAsRows = sparse.csr_matrix(data_items).transpose()
    return pd.DataFrame(
        data=cosine_similarity(columnsAsRows),
        index=columnLabels,
        columns=columnLabels,
    )

In [243]:
def predictByItemSimilarity(trainSet, numUsers, numItems, similarity):
    """Predict a score for every zero cell of a 2-D rating matrix.

    Despite the parameter names, the computation is expressed purely in terms
    of the rows and columns of ``trainSet``. For each cell ``(row, col)``
    whose value is 0:

    * ``rated`` is the set of rows with a non-zero value in column ``col``.
    * If ``rated`` is empty, the prediction is the mean of the non-zero
      values in row ``row`` (or 0 if that row has none).
    * Otherwise the prediction is the similarity-weighted average
      ``sum(trainSet[rated, col] * similarity[row, rated]) / sum(similarity[row, rated])``,
      or 0 when the weights sum to 0.

    Cells that already hold a non-zero value keep a prediction of 0.

    Note: when every non-zero rating has the same value ``v`` (for example a
    0/1 purchase matrix), each weighted average equals ``v`` up to rounding,
    so all predictions with a non-zero weight sum are tied.

    Args:
        trainSet: 2-D NumPy array of ratings; 0 means "no rating".
        numUsers: number of columns of the returned matrix; must be at least
            ``trainSet.shape[1]``.
        numItems: number of rows of the returned matrix; must be at least
            ``trainSet.shape[0]``.
        similarity: 2-D array; ``similarity[i, j]`` is used as the weight
            between rows ``i`` and ``j`` of ``trainSet``.

    Returns:
        A float ndarray of shape ``(numItems, numUsers)``. Cells outside the
        extent of ``trainSet`` are left at 0.
    """
    # Initialize the predicted rating matrix with zeros
    predictionMatrix = np.zeros((numItems, numUsers))

    # The non-zero rows of a column, and the ratings found there, do not depend
    # on which row is being predicted. Compute them once per column instead of
    # once (twice, originally) per cell.
    columnCache = {}

    for (user, item), rating in np.ndenumerate(trainSet):
        # Only cells without a rating (rating == 0) receive a prediction
        if rating == 0:
            if item not in columnCache:
                itemVector = trainSet[:, item]
                ratedRows = itemVector.nonzero()
                columnCache[item] = (ratedRows, itemVector[ratedRows])
            ratedRows, usersRatings = columnCache[item]

            # Weights between the current row and every row rated in this column
            usersSim = similarity[user, :][ratedRows]

            if len(usersSim) == 0:
                # Nothing is rated in this column: fall back to the mean of
                # the non-zero values in the current row, or 0 if there are none
                userVector = trainSet[user, :]
                ratedItems = userVector[userVector.nonzero()]

                if len(ratedItems) == 0:
                    predictionMatrix[user, item] = 0
                else:
                    predictionMatrix[user, item] = ratedItems.mean()
            else:
                # Similarity-weighted average; guard against a zero weight sum
                if usersSim.sum() == 0:
                    predictionMatrix[user, item] = 0
                else:
                    predictionMatrix[user, item] = (usersRatings * usersSim).sum() / usersSim.sum()

        # report progress every 100 rows (printed when the second column is reached)
        if (user % 100 == 0 and item == 1):
            print ("calculated %d users" % (user,))

    return predictionMatrix

In [285]:
# trainMatrix.T has one row per product and one column per training customer,
# so the result has one row per product and one column per customer position.
# `.values` replaces the deprecated DataFrame.as_matrix() (removed in pandas 1.0).
# NOTE(review): numCustomers is counted over custDF, while trainMatrix only
# holds the training rows; the surplus columns stay zero -- confirm intended.
predictionMatrix = predictByItemSimilarity(trainMatrix.T, numCustomers, numItems, itemSimilarity.values)

  """Entry point for launching an IPython kernel.


calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users
calculated 1000 users
calculated 1100 users
calculated 1200 users
calculated 1300 users
calculated 1400 users
calculated 1500 users
calculated 1600 users
calculated 1700 users
calculated 1800 users
calculated 1900 users
calculated 2000 users
calculated 2100 users
calculated 2200 users
calculated 2300 users
calculated 2400 users
calculated 2500 users
calculated 2600 users
calculated 2700 users
calculated 2800 users
calculated 2900 users
calculated 3000 users
calculated 3100 users
calculated 3200 users
calculated 3300 users
calculated 3400 users
calculated 3500 users
calculated 3600 users


In [286]:
def userTopK(prediction, itemMapping, userMapping, userID, k):
    """Return the labels of the k highest-scoring rows for one user.

    Args:
        prediction: 2-D array of scores; each column holds one user's scores.
        itemMapping: mapping from row position in ``prediction`` to item label.
        userMapping: mapping from user id to column position in ``prediction``.
        userID: id of the user to look up (a key of ``userMapping``).
        k: number of items to return.

    Returns:
        A list of item labels ordered from highest to lowest score.
    """
    column = userMapping[userID]
    scores = prediction[:, column]
    # Rank row positions by their score in this user's column.
    bestRows = nlargest(k, range(len(scores)), key=scores.take)
    return [itemMapping[row] for row in bestRows]

In [291]:
# Ten highest-scoring product ids for customer 15134.
li = userTopK(
    prediction=predictionMatrix,
    itemMapping=itemMapping,
    userMapping=userMapping,
    userID=15134,
    k=10,
)

In [294]:
# Show the recommended product ids.
li

['22258',
 '23221',
 '85049A',
 '84970S',
 '22197',
 '22730',
 '21479',
 '22383',
 '22716',
 '22941']

In [306]:
# Purchase flags of customer 15134 in the held-out data for the recommended
# products (1 = purchased, 0 = not purchased).
test.loc[15134, li]

22258     0
23221     0
85049A    0
84970S    0
22197     0
22730     0
21479     0
22383     0
22716     0
22941     0
Name: 15134, dtype: int64

In [305]:
# Display the held-out test matrix again for reference.
test

Unnamed: 0,85123A,71053,22752,21730,22633,22632,84879,22745,22748,22749,...,90214O,90214T,90214U,90214V,90214W,90214Z,84805A,85225,90089,23843
14142,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15134,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17732,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16491,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16748,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14286,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17828,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16686,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
14682,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12876,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
