In [150]:
import pandas as pd
import numpy as np
import random

In [151]:
dsPath = 'ml-latest-small/ratings.csv'
df = pd.read_csv(dsPath)

print(df.head())
print("\nRow num: ", df.shape[0])

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931

Row num:  100836


In [152]:
print(df.shape[0]/df['userId'].unique().shape[0])

165.30491803278687


In [153]:
def getCoRatedItems(df, userX, userY):
    xRatings = df[df['userId'] == userX]
    yRatings = df[df['userId'] == userY]

    return pd.merge(xRatings, yRatings, on='movieId', how='inner')

userX, userY = 1, 2
print(getCoRatedItems(df, userX, userY))

   userId_x  movieId  rating_x  timestamp_x  userId_y  rating_y  timestamp_y
0         1      333       5.0    964981179         2       4.0   1445715029
1         1     3578       5.0    964980668         2       4.0   1445714885


In [154]:
def pearsonSimilarity(df, userX, userY):
    coRatedItems = getCoRatedItems(df, userX, userY)

    if coRatedItems.empty:
        return 0

    xRating = coRatedItems['rating_x']
    yRating = coRatedItems['rating_y']

    xMean = np.mean(xRating)
    yMean = np.mean(yRating)

    den = np.sqrt(np.sum((xRating - xMean)**2)) * np.sqrt(np.sum((yRating - yMean)**2))
    if den == 0:
        return 0
    else:
        return np.sum((xRating - xMean) * (yRating - yMean)) / den

userX, userY = 1, 22
print(pearsonSimilarity(df, userX, userY))

-0.46303926413858626


In [155]:
def cosineSimilarity(df, userX, userY):
    coRatedItems = getCoRatedItems(df, userX, userY)

    if coRatedItems.empty:
        return 0

    xRating = coRatedItems['rating_x']
    yRating = coRatedItems['rating_y']

    ratingMean = np.mean(df[df['movieId'].isin(coRatedItems['movieId'])]['rating'])
    
    num = np.dot(xRating - ratingMean, yRating - ratingMean)
    den = np.linalg.norm(xRating - ratingMean) * np.linalg.norm(yRating - ratingMean)

    if den == 0:
        return 0
    else:
        return num / den

userX, userY = 1, 22
print(cosineSimilarity(df, userX, userY))

-0.49940970124479467


In [156]:
def triangleSimilarity(df, userX, userY):
    coRatedItems = getCoRatedItems(df, userX, userY)

    if coRatedItems.empty:
        return 0

    xRating = coRatedItems['rating_x']
    yRating = coRatedItems['rating_y']

    xMean = np.mean(xRating)
    yMean = np.mean(yRating)

    num = np.sqrt(np.sum((xRating - yRating)**2))
    den = np.sqrt(np.sum((xRating)**2)) + (np.sqrt(np.sum((yRating)**2)))
    triangle = 1 - num / den

    absMeanErr = abs(xMean - yMean)
    absStdErr = abs(xRating.std(ddof=0) - yRating.std(ddof=0))
    userRatingPref = 1 - 1 / (1 +  np.exp(- absMeanErr * absStdErr))

    return triangle * userRatingPref

userX, userY = 1, 22
print(triangleSimilarity(df, userX, userY))

0.21648942347191438


In [157]:
def jaccardSimilarity(df, userX, userY):
    coRatedItems = getCoRatedItems(df, userX, userY)

    if coRatedItems.empty:
        return 0

    xRating = set(coRatedItems['rating_x'])
    yRating = set(coRatedItems['rating_y'])

    intersection = len(xRating.intersection(yRating))
    union = len(xRating.union(yRating))

    if union == 0:
        return 0
    return intersection / union

userX, userY = 1, 22
print(jaccardSimilarity(df, userX, userY))

0.4


In [158]:
def euclideanDistance(df, userX, userY):
    coRatedItems = getCoRatedItems(df, userX, userY)

    if coRatedItems.empty:
        return 0

    xRating = coRatedItems['rating_x']
    yRating = coRatedItems['rating_y']

    return 1 / (1 + np.sqrt(np.sum((xRating - yRating)**2)))

userX, userY = 1, 22
print(euclideanDistance(df, userX, userY))

0.16015261286229274


In [159]:
def manhattanDistance(df, userX, userY):
    coRatedItems = getCoRatedItems(df, userX, userY)

    if coRatedItems.empty:
        return 0

    xRating = coRatedItems['rating_x']
    yRating = coRatedItems['rating_y']

    return 1 / (1 + np.sum(np.abs(xRating - yRating)))

userX, userY = 1, 22
print(manhattanDistance(df, userX, userY))

0.08333333333333333


In [160]:
def getNeighbors(df, user, item = None, blacklist = [], similarityFun = pearsonSimilarity, simTh = 0.6, k = 30, overlapTh = 10):
    neighbors = []

    for candidate in df['userId'].unique():
        coRatedItemsNum = getCoRatedItems(df, user, candidate).shape[0]
        
        if user != candidate and candidate not in blacklist and coRatedItemsNum >= overlapTh:
            sim = similarityFun(df, user, candidate)
            if item == None:
                if abs(sim) > simTh:
                    neighbors.append((candidate, sim))
                    if simTh != 0 and len(neighbors) == k:
                        break
            else:
                if not df[(df['userId'] == candidate) & (df['movieId'] == item)].empty:
                    if abs(sim) > simTh:
                        neighbors.append((candidate, sim))
                        if simTh != 0 and len(neighbors) == k:
                            break

    return sorted(neighbors, key=lambda x: abs(x[1]), reverse=True)[:k]

user, item = 1, 193
print(getNeighbors(df, user))
print(getNeighbors(df, user, item, [], pearsonSimilarity, 0.3, 15))

[(476, 0.7869358789643607), (210, 0.7676494735787385), (71, -0.7304869991056846), (297, 0.7062814666082009), (44, 0.6844475644404534), (394, 0.6506000486323554), (248, 0.6246950475544242), (369, 0.6120977479108417)]
[(414, 0.41186410352090486), (604, -0.4075557568177074), (589, -0.38807526285316635), (524, 0.384954892146264), (136, 0.38245559905852544), (337, -0.3444879190396), (590, 0.3111824770205233)]


In [161]:
def customGetNeighbors(df, user, item, blacklist = [], similarityFun = pearsonSimilarity, k1 = 5, k2 = 15):
    neighbors = []

    simBasedNeighbors = getNeighbors(df, user, None, blacklist, similarityFun, 0.6, k1)
    neighbors.extend(simBasedNeighbors)

    itemBasedNeighbors = getNeighbors(df, user, item, blacklist, similarityFun, 0.4, k2)
    neighbors.extend(itemBasedNeighbors)
    
    return neighbors

user, item = 1, 193
print(customGetNeighbors(df, user, item))

[(210, 0.7676494735787385), (71, -0.7304869991056846), (297, 0.7062814666082009), (44, 0.6844475644404534), (248, 0.6246950475544242), (414, 0.41186410352090486), (604, -0.4075557568177074)]


In [175]:
def basePred(df, user, item, similarityFun = pearsonSimilarity):
    neighbors = getNeighbors(df, user, item, [], similarityFun, 0.3)
    uMean = np.mean(df[df['userId'] == user]['rating'])
    num, den = 0, 0

    for neighbor, sim in neighbors:
        nRating = df[(df['userId'] == neighbor) & (df['movieId'] == item)]['rating']

        if not nRating.empty:
            nMean = np.mean(df[df['userId'] == neighbor]['rating'])
            num += sim * (nRating.values[0] - nMean)
            den += abs(sim)

    if den == 0 or neighbors == []:
        return 0

    return uMean + num / den

user, item = 1, 14   
print(basePred(df, user, item))

4.048833148287396


In [173]:
def recursivePred(df, user, item, lev = 0, blacklist = [], similarityFun = pearsonSimilarity, lmb = 0.5, levTh = 1):
    if lev >= levTh:
        return basePred(df, user, item)

    neighbors = customGetNeighbors(df, user, item, blacklist, similarityFun)
    uMean = np.mean(df[df['userId'] == user]['rating'])
    num, den = 0, 0
    
    for neighbor, sim in neighbors:
        nRating = df[(df['userId'] == neighbor) & (df['movieId'] == item)]['rating']
        nMean = np.mean(df[df['userId'] == neighbor]['rating'])

        if not nRating.empty:
            num += sim * (nRating.values[0] - nMean)
            den += abs(sim)
        else:
            blacklist.append(user)
            num += lmb * sim * (recursivePred(df, neighbor, item, lev+1, blacklist.copy()) - nMean)
            den += lmb * abs(sim)            

    if den == 0 or neighbors == []:
        return 0
    else:
        return uMean + num / den

user, item = 1, 14  
print(recursivePred(df, user, item))

3.6584707329901596


In [182]:
def getRecommendedItems(df, user, rateTh = 4.5, predTh = 4, k1 = 10, k2 = 10, similarityFun=pearsonSimilarity):
    uItems = set(df[df['userId'] == user]['movieId'])
    
    neighbors = getNeighbors(df, user, None, [], similarityFun, 0.6, k1)
    
    nItems = set()
    for neighbor, _ in neighbors:
        tmp = set(df[(df['userId'] == neighbor) & (df['rating'] >= rateTh)]['movieId'])
        nItems.update(tmp)

    notYetRated = list(nItems - uItems)
    random.shuffle(notYetRated)

    item2Pred = []
    for item in notYetRated:
        pred = basePred(df, user, item)

        if pred >= predTh:
            item2Pred.append((item, pred))
        if predTh != 0 and len(item2Pred) == k2:
            break

    return sorted(item2Pred, key=lambda x: x[1], reverse=True)[:k2]

user = 1  
print(getRecommendedItems(df, user))

117
[(146662, 5.286669165417291), (112175, 4.88001723555745), (134130, 4.679474447290951), (27904, 4.644896633115004), (1356, 4.4568217827486), (5816, 4.434219049742726), (187593, 4.368247430459392), (2011, 4.358354926357309), (77561, 4.350662134333023), (5378, 4.096360517208893)]


In [178]:
user = 1
uItems = list(df[df['userId'] == user]['movieId'])[:20]

pearsonPed, cosinePred, trianglePred, jaccardPred, euclideanPred, manhattanPred  = [], [], [], [], [], [] 

for item in uItems:
    itemPred = df[(df['userId'] == user) & (df['movieId'] == item)]['rating'].values[0]

    pearsonPed.append(abs(recursivePred(df, user, item, 0, [], pearsonSimilarity) - itemPred))
    cosinePred.append(abs(recursivePred(df, user, item, 0, [], cosineSimilarity) - itemPred))
    trianglePred.append(abs(recursivePred(df, user, item, 0, [], triangleSimilarity) - itemPred))
    jaccardPred.append(abs(recursivePred(df, user, item, 0, [], jaccardSimilarity) - itemPred))
    euclideanPred.append(abs(recursivePred(df, user, item, 0, [], euclideanDistance) - itemPred))
    manhattanPred.append(abs(recursivePred(df, user, item, 0, [], manhattanDistance) - itemPred))

print("Pearson mean err: ", np.mean(pearsonPed))
print("Cosine mean err: ", np.mean(cosinePred))
print("Triangle mean err: ", np.mean(trianglePred))
print("Jaccard mean err: ", np.mean(jaccardPred))
print("Euclidean mean err: ", np.mean(euclideanPred))
print("Manhattan mean err: ", np.mean(manhattanPred))

Pearson mean err:  0.48808042453277645
Cosine mean err:  0.4955752140026418
Triangle mean err:  0.6401158513392717
Jaccard mean err:  0.7385961782427192
Euclidean mean err:  4.3
Manhattan mean err:  4.3


In [183]:
user = 1
uItems = list(df[df['userId'] == user]['movieId'])[:50]

standardPred, recPred  = [], []

for item in uItems:
    itemPred = df[(df['userId'] == user) & (df['movieId'] == item)]['rating'].values[0]

    standardPred.append(abs(basePred(df, user, item) - itemPred))
    recPred.append(abs(recursivePred(df, user, item) - itemPred))

print("Standard prediction mean err: ", np.mean(standardPred))
print("Recursive prediction mean err: ", np.mean(recPred))

Standard prediction mean err:  0.4639029195840401
Recursive prediction mean err:  0.414504085090476
