# Item-Item Top-N Recommendations

In [1]:
import pandas as pd
train_set_path = 'resources//train_numerized_with_anon.csv'
test_set_path = 'resources//test_numerized_with_anon.csv'

train_set = pd.read_csv(train_set_path, parse_dates=[3], index_col='index')
test_set = pd.read_csv(test_set_path, parse_dates=[3], index_col='index')

users_in_train = train_set.userID.unique()
test_set = test_set[test_set.userID.isin(users_in_train)]

## Part 1: Recommend Most Popular Items 

In [2]:
class MostPopular:

    def __init__(self):
        self.item_ratings_sorted = None
        self.train_set = None

    def learn_model(self, train_set):
        self.train_set = train_set
        self.item_ratings_sorted = \
        train_set.groupby(['itemID'])['userID'].agg(['count']).sort_values(by='count', ascending=False)[
            'count']


    def get_top_n_recommendations(self, test_set, top_n):
        result = {}
        already_ranked_items_by_users = self.train_set.groupby('userID')['itemID'].apply(list)
        
        for userID in test_set.userID.unique():
            result[str(userID)] = []
            top_list = self.item_ratings_sorted.index
            i=0
            j=0
            while i < top_n:
                itemID = top_list[j]
                j = j + 1
                if itemID in already_ranked_items_by_users[userID]:
                    continue
                result[str(userID)].append(itemID)
                i = i + 1
        return result

    def clone(self):
        pass


In [3]:
popular = MostPopular()
popular.learn_model(train_set)
popular_recs = popular.get_top_n_recommendations(test_set,top_n=5)
print(popular_recs['431'])

[53, 26, 68, 85, 16]


## Part 2 - Item-Item Recommendations

In [4]:
from tqdm import tqdm
import numpy as np
import operator

class Jaccard:

    def __init__(self):
        self.item_ratings_sorted = None
        self.train_set = None
        self.item_item_counts = dict()
        self.item_counts = None

    def learn_model(self, train_set):
        print('Started training')
        self.train_set = train_set
        self.item_counts =  self.train_set.groupby('itemID')['userID'].agg('count')
        
        maxpair = 0
        pbar = tqdm(total=len(train_set.userID.unique()))
        
        for u in train_set.userID.unique():
            pbar.update(1)
            userData = self.train_set[self.train_set.userID == u]
            
            if len(userData.index)  < 3:
                continue
            
            for i1 in range(len(userData.index)):
                item1 = userData.values[i1][1]
                
                for i2 in range(i1 + 1,len(userData.index) - 1):
                    item2 = userData.values[i2][1]
                    if item1 not in self.item_item_counts:
                        self.item_item_counts[item1] = dict()
                    if item2 not in self.item_item_counts[item1]:
                        self.item_item_counts[item1][item2] = 0
                    self.item_item_counts[item1][item2] += 1
                    if item2 not in self.item_item_counts:
                        self.item_item_counts[item2] = dict()
                    if item1 not in self.item_item_counts[item2]:
                        self.item_item_counts[item2][item1] = 0
                    self.item_item_counts[item2][item1] += 1
        

        pbar.close()
        print('Done training')
            
            
            

    def get_top_n_recommendations(self, test_set, top_n):
        print('Started computing recommendations')
        result = {}
        already_ranked_items_by_users = self.train_set.groupby('userID')['itemID'].apply(list)
        
        pbar = tqdm(total=len(test_set.userID.unique()))
        
        for userID in test_set.userID.unique():
            pbar.update(1)
            result[str(userID)] = []
            maxvalues = dict()
            
            for i in already_ranked_items_by_users[userID]:
                if i not in self.item_item_counts:
                    continue
                items = self.item_item_counts[i]
                
                
                for j in items:
                    if j in already_ranked_items_by_users[userID]:
                        continue
                        
                    if j not in maxvalues:
                        maxvalues[j] = 0

                    if items[j] > 10:
                        d = items[j] / (self.item_counts[i] + self.item_counts[j] - items[j])
                        if d > maxvalues[j]:
                            maxvalues[j] = d
                        
            top_list = sorted(maxvalues.items(), key=lambda kv : -kv[1])
            i=0
            j=0
            while i < top_n and j < len(top_list):
                itemID = top_list[j][0]
                
                
                j = j + 1
                
                result[str(userID)].append(itemID)
                
                i = i + 1
            
        pbar.close()
        print('Done computing recommendations')
        return result

    
    def clone(self):
        pass


In [5]:
jaccard = Jaccard()
jaccard.learn_model(train_set)
jaccard_recs = jaccard.get_top_n_recommendations(test_set,top_n=5)


Started training


100%|███████████████████████████████████████████████████████████████████████████| 20197/20197 [00:40<00:00, 503.68it/s]


Done training
Started computing recommendations


100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:27<00:00, 14.73it/s]


Done computing recommendations


## Part 3 - Comparing the Algorithms 

In [12]:
def compute_precision(test_set, recommendations):
    hits = 0
    recs = 0
    used = 0
    
    for u in test_set.userID.unique():
        userData = test_set[test_set.userID == u]
        used += len(userData)
        userRecs = recommendations.get(str(u))
        recs += len(userRecs)
        
        for i in userRecs:
            if i in userData['itemID'].tolist():
                hits += 1
    return hits / recs
        
    

In [13]:
p1 = compute_precision(test_set,jaccard_recs)
p2 = compute_precision(test_set,popular_recs)
print("Jaccard=", p1, "  Popularity=", p2)

Jaccard= 0.03171806167400881   Popularity= 0.027312775330396475


## Part 4 - Calling Algorithms from the Surprise Package

In [14]:
import sys, string, os
import pandas as pd
import itertools
from tqdm import tqdm
import numpy as np
import operator
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import PredefinedKFold
from surprise.prediction_algorithms import *



class SurpriseRecMethod():

    def __init__(self, method):
        self.method = method

    def fit(self, train_set):
        self.train_set = train_set


    def get_top_n_recommendations(self, test_set, top_n):
        self.test_set = test_set

        test_path_tmp = "resources//test_file.csv"
        train_path_tmp = "resources//train_file.csv"

        self.train_set.to_csv(train_path_tmp, index=False, header=False)
        self.test_set.to_csv(test_path_tmp, index=False, header=False)

        fold_files = [(train_path_tmp, test_path_tmp)]
        reader = Reader(rating_scale=(1, 10), line_format='user item rating', sep=',')
        data = Dataset.load_from_folds(fold_files, reader=reader)

        for trainset, testset in PredefinedKFold().split(data):
            self.method.fit(trainset)

        already_ranked_items_by_users = self.train_set.groupby('userID')['itemID'].apply(list)

        recommendations = {}
        pbar = tqdm(total=len(self.test_set.userID.unique()))
        for userID in self.test_set.userID.unique():
            pbar.update(1)

            if userID not in self.train_set.userID.unique():
                recommendations[str(userID)] = []
                continue

            items_expected_ranking = {}
            for itemID in self.train_set.itemID.unique():
                if itemID in already_ranked_items_by_users[userID]:
                    continue
                # Calc prediction for item for user
                predicted = self.method.predict(str(userID), str(itemID), clip=False)
                items_expected_ranking[itemID] = predicted[3]
            sorted_predictions = sorted(items_expected_ranking.items(), key=operator.itemgetter(1))
            sorted_predictions.reverse()
            sorted_predictions = [x[0] for x in sorted_predictions]
            user_recommendations = sorted_predictions[:top_n]
            recommendations[str(userID)] = user_recommendations
        pbar.close()
        return recommendations


In [15]:
modelSlopeOne = SurpriseRecMethod(SlopeOne())
modelSlopeOne.fit(train_set)
recSlopeOne = modelSlopeOne.get_top_n_recommendations(test_set, 5)
p3 = compute_precision(test_set,recSlopeOne)

100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:03<00:00, 68.54it/s]


In [10]:
modelKNNUser = SurpriseRecMethod(KNNBasic(sim_options={'name': 'cosine', 'user_based': True}))
modelKNNUser.fit(train_set)
recKNNUser = modelKNNUser.get_top_n_recommendations(test_set, 5)
p4 = compute_precision(test_set,recKNNUser)

Computing the cosine similarity matrix...
Done computing similarity matrix.


100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:10<00:00, 21.56it/s]


In [11]:
print("Jaccard=", p1, "  Popularity=", p2, "  SlopeOne=", p3, "  User KNN=", p4)

Jaccard= 0.03171806167400881   Popularity= 0.027312775330396475   SlopeOne= 0.03259911894273128   User KNN= 0.07577092511013216
