### Import required libraries

In [1]:
import pandas as pd
import numpy as np

pd.options.display.max_colwidth = 100

from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader

import heapq
from collections import defaultdict
from operator import itemgetter

### Preprocessing

In [2]:
# Read the data file into a pandas dataframe

df = pd.read_csv('luxury_beauty_cleaned.csv')
df.head()

Unnamed: 0,rating,userId,itemId,userName,description,title,price,rating_count,rating_avg
0,2.0,A1Q6MUU0B2ZDQG,B00004U9V2,D. Poston,"['After a long day of handling thorny situations, our new hand therapy pump is just the help you...",Crabtree &amp; Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ,30.0,582,4.623711
1,5.0,A3HO2SQDCZIE9S,B00004U9V2,chandra,"['After a long day of handling thorny situations, our new hand therapy pump is just the help you...",Crabtree &amp; Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ,30.0,582,4.623711
2,5.0,A2EM03F99X3RJZ,B00004U9V2,Maureen G,"['After a long day of handling thorny situations, our new hand therapy pump is just the help you...",Crabtree &amp; Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ,30.0,582,4.623711
3,5.0,A3Z74TDRGD0HU,B00004U9V2,Terry K,"['After a long day of handling thorny situations, our new hand therapy pump is just the help you...",Crabtree &amp; Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ,30.0,582,4.623711
4,5.0,A2UXFNW9RTL4VM,B00004U9V2,Patricia Wood,"['After a long day of handling thorny situations, our new hand therapy pump is just the help you...",Crabtree &amp; Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ,30.0,582,4.623711


In [3]:
# Create a dictionary itemId_to_title of all unique itemid as key and corresponding title as value

x = df[['itemId','title']]
itemId_to_title = pd.Series(x.title.values,index=x.itemId).to_dict()
itemId_to_title

{'B00004U9V2': "Crabtree &amp; Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ",
 'B00005A77F': 'Crabtree &amp; Evelyn Hand Soap, Gardeners, 10.1 fl. oz.',
 'B00005NDTD': 'Soy Milk Hand Crme',
 'B00005V50C': 'Supersmile Powdered Mouthrinse',
 'B00005V50B': 'Supersmile Professional Teeth Whitening Toothpaste  Recommended By Cosmetic Dentists, CLINICALLY...',
 'B000066SYB': 'Archipelago Morning Mint Body Lotion ,18 Fl Oz',
 'B000068DWY': 'Calvin Klein ck one Eau de Toilette, 3.4 fl. oz.',
 'B00008WFSM': 'Helen of Troy 1193 Professional Flat Iron',
 'B0000Y3NO6': 'DERMAdoctor Calm, Cool &amp; Corrected anti-redness tranquility cream - 1.7 Oz',
 'B00011QUKW': 'HOT TOOLS Professional 24K Gold Marcel Iron/Wand for Long Lasting Results',
 'B000142FVW': 'OPI Nail Lacquer, Not So Bora-Bora-ing Pink, 0.5 Fl Oz',
 'B000141PIG': 'Paul Mitchell Shampoo One',
 'B00014351Q': "OPI Nail Lacquer, She's a Bad Muffuletta!, 0.5 Fl Oz",
 'B000141PYK': 'Paul Mitchell Extra-Body Sculpti

In [4]:
# This is a function to return the title of the product based on the itemId

def getProductTitle(itemId):
    return itemId_to_title[itemId]

In [5]:
# We dont need all the columns from the original dataframe to build our model
# So, we extract only the userId, itemId and rating columns into a new dataframe

df_main = df[['userId', 'itemId', 'rating']]
df_main.shape

(459985, 3)

In [6]:
# We want to keep only unique userId, itemId pairs in our dataframe
# We drop all diplicate userId, itemId pairs
# and keeponly the one tha has highest rating value

df_main = df_main.sort_values('rating', ascending=False).drop_duplicates(subset=['userId', 'itemId'])
df_main.shape

(426731, 3)

cnt_user = df_main['userId'].value_counts()
df_main[df_main['userId'].isin(cnt_user[cnt_user >= 10].index)]

cnt_item = df_main['itemId'].value_counts()
df_main[df_main['itemId'].isin(cnt_item[cnt_item >= 20].index)]

In [7]:
# We will consider only those users who have rated atleast 10 unique products
# and those items that have been rated by atleast 10 different users

cnt_user = df_main['userId'].value_counts()
df_main = df_main[df_main['userId'].isin(cnt_user[cnt_user >= 10].index)]
cnt_item = df_main['itemId'].value_counts()
df_main = df_main[df_main['itemId'].isin(cnt_item[cnt_item >= 10].index)]

In [8]:
# We print some statistics about the final dataframe

print("No. of rows in the final dataframe ", df_main.shape[0])
print("No. of unique users in the final dataframe ", df_main.userId.nunique())
print("No. of unique products in the final dataframe ", df_main.itemId.nunique())

No. of rows in the final dataframe  5405
No. of unique users in the final dataframe  483
No. of unique products in the final dataframe  309


In [9]:
df_main.head()

Unnamed: 0,userId,itemId,rating
272967,A3QL857ALV5RCJ,B005COP4FA,5.0
272964,A2SZLNSI5KOQJT,B005COP4FA,5.0
272953,AWIF8AR75LL9L,B005COP4FA,5.0
273206,AW7BIYHXUIZ62,B005CVGJFM,5.0
273195,A1SLHI58I09T1K,B005CVGJFM,5.0


### Working with surprise library

In [10]:
# We need to define a Reader object for Surprise to be able to parse the dataframe

reader = Reader(rating_scale=(1, 5))

In [11]:
# To load a dataset from a pandas dataframe, we use the load_from_df() method
# The dataframe must have three columns, corresponding to the user (raw) ids, the item (raw) ids, 
# and the ratings in this order

data = Dataset.load_from_df(df_main[['userId', 'itemId', 'rating']], reader)

# We build a trainset using the build_full_trainset() method on the full dataset
# It will build a trainset object

trainset = data.build_full_trainset()

In [12]:
# Let us take a test userId
test_userId = 'A1SLHI58I09T1K'

# Convert the raw userId to surprise inner id
testUserInnerID = trainset.to_inner_uid(test_userId)

### User-based collaborative filtering

In [13]:
# Build a user based similarity matrix by computing similarities between users
# We use the cosine distance metric to calculate similarities

similarity_matrix = KNNBasic(sim_options={
                'name': 'cosine',
               'user_based': True
               }).fit(trainset).compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [14]:
# Get the row from the similarity matrix corresponding to the test user

similarityRow = similarity_matrix[testUserInnerID]

In [15]:
# Get the similarity scores of all users with respect to the test user

similarUsers = []
for innerID, score in enumerate(similarityRow):
    if (innerID != testUserInnerID):
        similarUsers.append( (innerID, score) )

In [16]:
similarUsers

[(0, 0.0),
 (1, 0.9899494936611665),
 (2, 0.9938837346736188),
 (3, 1.0),
 (5, 1.0),
 (6, 1.0),
 (7, 0.9949366763261821),
 (8, 1.0),
 (9, 0.9374252720097652),
 (10, 0.0),
 (11, 0.9899494936611665),
 (12, 1.0),
 (13, 0.9258200997725515),
 (14, 1.0),
 (15, 0.0),
 (16, 0.9669875568304563),
 (17, 0.9995120760870788),
 (18, 1.0),
 (19, 0.9996303825414302),
 (20, 0.0),
 (21, 0.9958705948858224),
 (22, 1.0),
 (23, 0.9938837346736188),
 (24, 0.0),
 (25, 1.0),
 (26, 0.9938837346736188),
 (27, 0.9684959969581862),
 (28, 0.0),
 (29, 0.0),
 (30, 0.9772954430358936),
 (31, 0.0),
 (32, 1.0),
 (33, 0.0),
 (34, 0.0),
 (35, 1.0),
 (36, 0.0),
 (37, 0.0),
 (38, 1.0),
 (39, 0.9848484848484849),
 (40, 0.0),
 (41, 0.978231976089037),
 (42, 0.0),
 (43, 1.0),
 (44, 1.0),
 (45, 1.0),
 (46, 0.0),
 (47, 1.0),
 (48, 0.0),
 (49, 0.9682773237093576),
 (50, 0.9873515259378667),
 (51, 0.0),
 (52, 0.9981149841863163),
 (53, 0.0),
 (54, 0.968272799301934),
 (55, 0.9804067213387103),
 (56, 0.0),
 (57, 1.0),
 (58, 0.9938

In [17]:
# Get the top 10 nearest neighbors to the test user, based on similarity scores

kNeighbors = heapq.nlargest(10, similarUsers, key=lambda t: t[1])

In [18]:
kNeighbors

[(3, 1.0),
 (5, 1.0),
 (6, 1.0),
 (8, 1.0),
 (12, 1.0),
 (14, 1.0),
 (18, 1.0),
 (22, 1.0),
 (25, 1.0),
 (32, 1.0)]

In [19]:
# Here we prepare a list of all items that were rated by the top 10 similar users
# The rating sum is calculated as a weighted average of the user similarity score

candidates = defaultdict(float)
for similarUser in kNeighbors:
    innerID = similarUser[0]
    userSimilarityScore = similarUser[1]
    theirRatings = trainset.ur[innerID]
    for rating in theirRatings:
        candidates[rating[0]] += (rating[1] / 5.0) * userSimilarityScore

In [20]:
# Get the list of all items already rated by the test user
# so that those items can be removed from the list of items to be recomended

rated = {}
for itemId, rating in trainset.ur[testUserInnerID]:
    rated[itemId] = 1

In [21]:
# Return the list of top 10 items that were most highly rated by the top 10 similar users

pos = 0
for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
    if not itemID in rated:
        item = trainset.to_raw_iid(itemID)
        print(getProductTitle(item))
        pos += 1
        if (pos > 10):
            break

Dermablend Intense Powder High Coverage Foundation
AHAVA Time to Revitalize Extreme Night Treatment
La Roche-Posay Micellar Cleansing Water for Sensitive Skin
La Roche-Posay Toleriane Teint Color Correcting Concealer Pen, 0.35 Fl. Oz.
La Roche-Posay Toleriane Teint Mattifying Mousse Foundation, 1 Fl. Oz.
COLOR WOW Brass Banned Correct and Perfect Mousse,6.8 Oz
iS CLINICAL Cleansing Complex
La Roche-Posay Pigmentclar Dark Spot Cream Face Serum with LHA, 1 Fl. Oz.
JAPONESQUE Velvet Touch Blush
Vichy Ideal Body Skin Firming Lotion with Hyaluronic Acid
Rituals Body Cream, 7.4 fl. oz


### Item Based Collaborative Filtering

In [22]:
sim_options = {'name': 'cosine',
               'user_based': False
               }

model = KNNBasic(sim_options=sim_options)
model.fit(trainset)
simsMatrix = model.compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [23]:
simsMatrix

array([[1., 1., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [24]:
# Get the top K items rated by the test user
testUserRatings = trainset.ur[testUserInnerID]
kNeighbors = heapq.nlargest(10, testUserRatings, key=lambda t: t[1])

In [25]:
kNeighbors

[(1, 5.0),
 (211, 5.0),
 (228, 5.0),
 (248, 5.0),
 (293, 5.0),
 (295, 5.0),
 (233, 4.0),
 (275, 4.0),
 (242, 3.0)]

In [26]:
# Get similar items to the items liked by test user (weighted by rating)
candidates = defaultdict(float)
for itemID, rating in kNeighbors:
    similarityRow = simsMatrix[itemID]
    for innerID, score in enumerate(similarityRow):
        candidates[innerID] += score * (rating / 5.0)

In [27]:
# Build a dictionary of stuff the user has already seen
rated = {}
for itemID, rating in trainset.ur[testUserInnerID]:
    rated[itemID] = 1

In [28]:
pos = 0
for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
    if not itemID in rated:
        item = trainset.to_raw_iid(itemID)
        print(getProductTitle(item))
        pos += 1
        if (pos > 10):
            break

JAPONESQUE Velvet Touch Blush
Obagi Professional-C Serum, 1 fl. oz.
Dermablend Quick-Fix Full Coverage Concealer, 0.16 Oz.
La Roche-Posay Serozinc Face Toner for Oily Skin with Zinc, 5 Fl. Oz.
Vichy Mineral Infused Face Mask
StriVectin Tightening Neck Serum Roller, 1.7 oz.
Dermablend Intense Powder High Coverage Foundation
Meaningful Beauty &ndash; Wrinkle Smoothing Capsules Advanced Formula with Hyaluronic Acid &ndash; 60 Count &ndash; MT.0385
Baxter of California Vitamin Cleansing Bar
Bioderma S&eacute;bium  Pore Refiner Cream - 1 fl. oz.
StriVectin-AR Advanced Retinol Eye Cream, 0.5 fl. oz.
