In [46]:
# import required libraries
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import train_test_split
from heapq import nlargest
from sklearn.metrics import mean_squared_error
from math import sqrt
import os.path
import scipy
import scipy.stats as st
import json




## Import Data

In [24]:
# Parse the Mixpanel export and flatten every event into one row; nested
# "properties" fields become dotted columns such as properties.customer_id.
with open('training_mixpanel.txt') as f:
    data = json.loads(f.read())
custDF = json_normalize(data)
custDF

Unnamed: 0,event,properties.country,properties.customer_id,properties.description,properties.invoice_date,properties.invoice_no,properties.product_id,properties.quantity,properties.unit_price
0,Purchased Product,United Kingdom,17850,WHITE HANGING HEART T-LIGHT HOLDER,12/1/2010 8:26,536365,85123A,6,2.55
1,Purchased Product,United Kingdom,17850,WHITE METAL LANTERN,12/1/2010 8:26,536365,71053,6,3.39
2,Purchased Product,United Kingdom,17850,SET 7 BABUSHKA NESTING BOXES,12/1/2010 8:26,536365,22752,2,7.65
3,Purchased Product,United Kingdom,17850,GLASS STAR FROSTED T-LIGHT HOLDER,12/1/2010 8:26,536365,21730,6,4.25
4,Purchased Product,United Kingdom,17850,HAND WARMER UNION JACK,12/1/2010 8:28,536366,22633,6,1.85
5,Purchased Product,United Kingdom,17850,HAND WARMER RED POLKA DOT,12/1/2010 8:28,536366,22632,6,1.85
6,Purchased Product,United Kingdom,13047,ASSORTED COLOUR BIRD ORNAMENT,12/1/2010 8:34,536367,84879,32,1.69
7,Purchased Product,United Kingdom,13047,POPPY'S PLAYHOUSE BEDROOM,12/1/2010 8:34,536367,22745,6,2.10
8,Purchased Product,United Kingdom,13047,POPPY'S PLAYHOUSE KITCHEN,12/1/2010 8:34,536367,22748,6,2.10
9,Purchased Product,United Kingdom,13047,FELTCRAFT PRINCESS CHARLOTTE DOLL,12/1/2010 8:34,536367,22749,8,3.75


In [18]:
# Count distinct products and customers in the event log
# (dropna=False so a missing id is counted as its own value, like unique()).
numItems = custDF['properties.product_id'].nunique(dropna=False)
numCustomers = custDF['properties.customer_id'].nunique(dropna=False)

print("Number of users:", numCustomers)
print("Number of items:", numItems)

Number of users: 4363
Number of items: 3677


In [31]:
# Customer x product grid, one row per distinct customer id and one column per
# distinct product id, with every cell initialised to 0.
popDF = pd.DataFrame(
    0,
    index=custDF['properties.customer_id'].unique(),
    columns=custDF['properties.product_id'].unique(),
)
popDF

Unnamed: 0,85123A,71053,22752,21730,22633,22632,84879,22745,22748,22749,...,90214O,90214T,90214U,90214V,90214W,90214Z,84805A,85225,90089,23843
17850,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13047,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12583,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13748,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14688,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15311,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16098,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18074,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
# Every (customer, product) pair seen in the event log marks one cell of the
# grid with 1; repeated pairs simply set the same cell again.
index = list(zip(custDF['properties.customer_id'], custDF['properties.product_id']))
for customerId, productId in index:
    popDF.at[customerId, productId] = 1
popDF

Unnamed: 0,85123A,71053,22752,21730,22633,22632,84879,22745,22748,22749,...,90214O,90214T,90214U,90214V,90214W,90214Z,84805A,85225,90089,23843
17850,1,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13047,0,0,0,0,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
12583,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13748,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15291,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14688,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15311,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16098,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18074,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [82]:
# userMapping: customer id -> row position in popDF.
# itemMapping: column position in popDF -> product id.
userMapping = {customerId: position for position, customerId in enumerate(popDF.index)}
itemMapping = dict(enumerate(popDF.columns))

In [47]:
# Hold out 30% of the customers (rows of popDF) as a test set.
# random_state is fixed so the split, and every matrix derived from it, is
# identical on each Restart & Run All.
train, test = train_test_split(popDF, test_size=0.3, random_state=42)

In [48]:
# Convert the train/test frames to plain NumPy arrays for the item-item computation.
# DataFrame.as_matrix() was deprecated and later removed from pandas; .values
# yields the same array and exists in every pandas release.
trainMatrix = train.values
testMatrix = test.values


  
  This is separate from the ipykernel package so we can avoid doing imports until


In [44]:
# Item-item cosine similarity: each row of trainMatrix.T is one product, and
# pairwise_distances returns cosine *distance*, so it is subtracted from 1.
itemSimilarity = np.subtract(1, pairwise_distances(trainMatrix.T, metric='cosine'))
def predictByItemSimilarity(trainSet, numUsers, numItems, similarity):
    """Predict a score for every unrated (zero) cell of ``trainSet``.

    ``trainSet`` is visited cell by cell. For each cell ``(row, col)`` holding 0,
    the prediction is the average of the non-zero ratings in column ``col``,
    weighted by ``similarity[row, k]`` for every rated row ``k`` of that column.
    Cells that already hold a non-zero rating are left at 0 in the result.

    Fallbacks:
      * the column has no non-zero rating: use the mean of the non-zero ratings
        in ``row``, or 0 when the row has none either;
      * the weights sum to exactly 0: the prediction is 0.

    Parameters
    ----------
    trainSet : 2-D numpy array
        Ratings, with 0 meaning "not rated".
    numUsers : int
        Number of columns allocated in the result.
    numItems : int
        Number of rows allocated in the result.
    similarity : 2-D array
        ``similarity[a, b]`` is the weight between rows ``a`` and ``b`` of
        ``trainSet``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(numItems, numUsers)``; cells outside the extent of
        ``trainSet`` remain 0.

    Notes
    -----
    When every non-zero rating equals 1 (binary purchase flags), the weighted
    mean is exactly 1 whenever the weights sum to a non-zero value, so the
    result only distinguishes "some similarity" from "none".
    Progress is printed once per 100 rows (when the second column is reached).
    """
    predictionMatrix = np.zeros((numItems, numUsers))

    # Which rows hold a rating depends only on the column, so it is computed
    # once per column instead of once per visited cell.
    ratedRowsByCol = [np.flatnonzero(trainSet[:, col]) for col in range(trainSet.shape[1])]

    for (row, col), rating in np.ndenumerate(trainSet):
        if rating == 0:
            ratedRows = ratedRowsByCol[col]

            if len(ratedRows) == 0:
                # Nothing rated in this column: fall back to the row's own mean
                # rating; the cell stays 0 when the row has no ratings either.
                rowRatings = trainSet[row, :]
                knownRatings = rowRatings[rowRatings.nonzero()]
                if len(knownRatings) != 0:
                    predictionMatrix[row, col] = knownRatings.mean()
            else:
                weights = similarity[row, :][ratedRows]
                weightTotal = weights.sum()
                # A zero weight total would divide by zero; leave the cell at 0.
                if weightTotal != 0:
                    colRatings = trainSet[ratedRows, col]
                    predictionMatrix[row, col] = (colRatings * weights).sum() / weightTotal

        # report progress every 100 rows
        if (row % 100 == 0 and col == 1):
            print ("calculated %d users" % (row,))

    return predictionMatrix

In [49]:
# trainMatrix.T puts products on rows and train customers on columns.
# numCustomers counts every customer in custDF, so the result has more columns
# than the train split; the surplus columns stay at 0.
predictionMatrix = predictByItemSimilarity(
    trainSet=trainMatrix.T,
    numUsers=numCustomers,
    numItems=numItems,
    similarity=itemSimilarity,
)

calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users
calculated 1000 users
calculated 1100 users
calculated 1200 users
calculated 1300 users
calculated 1400 users
calculated 1500 users
calculated 1600 users
calculated 1700 users
calculated 1800 users
calculated 1900 users
calculated 2000 users
calculated 2100 users
calculated 2200 users
calculated 2300 users
calculated 2400 users
calculated 2500 users
calculated 2600 users
calculated 2700 users
calculated 2800 users
calculated 2900 users
calculated 3000 users
calculated 3100 users
calculated 3200 users
calculated 3300 users
calculated 3400 users
calculated 3500 users
calculated 3600 users


In [108]:
# Inspect the item-item cosine similarity matrix (1 on the diagonal).
itemSimilarity

array([[1.        , 0.20454078, 0.1140864 , ..., 0.        , 0.03829198,
        0.        ],
       [0.20454078, 1.        , 0.07492792, ..., 0.        , 0.        ,
        0.        ],
       [0.1140864 , 0.07492792, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.03829198, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [109]:
# Inspect the predicted scores (rows = products, columns = customers).
predictionMatrix

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [106]:
def userTopK(prediction, itemMapping, userMapping, userID, k):
    """Return the labels of the ``k`` best-scored rows for one user.

    ``userMapping[userID]`` selects a column of ``prediction``; the row
    positions holding the ``k`` largest values in that column are translated
    to labels through ``itemMapping``. Ties keep their original row order.
    """
    column = userMapping[userID]
    scores = prediction[:, column]
    rankedRows = nlargest(k, range(len(scores)), key=lambda pos: scores.take(pos))
    return [itemMapping[pos] for pos in rankedRows]

In [107]:
# Top-10 recommendations for customer 16464.
# NOTE(review): userMapping is built from popDF's row order, while the columns of
# predictionMatrix follow the rows of the shuffled train split -- confirm the
# column looked up here really belongs to this customer.
userTopK(predictionMatrix, itemMapping, userMapping, 16464, 10)

['85123A',
 '71053',
 '22752',
 '21730',
 '22633',
 '22632',
 '84879',
 '22745',
 '22748',
 '22749']

In [94]:
# Inspect the held-out customers (rows of popDF not used for training).
test

Unnamed: 0,85123A,71053,22752,21730,22633,22632,84879,22745,22748,22749,...,90214O,90214T,90214U,90214V,90214W,90214Z,84805A,85225,90089,23843
12812,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16147,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14295,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16464,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
16982,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18190,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15433,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13147,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16611,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
15025,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
