In [1]:
!rm -rf ratings* books* to_read* test*

!curl -o ratings.csv "http://www.dcs.gla.ac.uk/~craigm/recsysH/coursework/final-ratings.csv" 
!curl -o books.csv "http://www.dcs.gla.ac.uk/~craigm/recsysH/coursework/final-books.csv"
!curl -o to_read.csv "http://www.dcs.gla.ac.uk/~craigm/recsysH/coursework/final-to_read.csv"
!curl -o test.csv "http://www.dcs.gla.ac.uk/~craigm/recsysH/coursework/final-test.csv"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 7631k  100 7631k    0     0  3367k      0  0:00:02  0:00:02 --:--:-- 3369k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2366k  100 2366k    0     0  1777k      0  0:00:01  0:00:01 --:--:-- 1779k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 7581k  100 7581k    0     0  4709k      0  0:00:01  0:00:01 --:--:-- 4706k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1895k  100 1895k    0     0  1038k      0  0:00:01  0:00:01 --:--:-- 1037k


In [2]:
#Standard setup
import pandas as pd
import numpy as np
import torch
!pip install git+https://github.com/cmacdonald/spotlight.git@master#egg=spotlight
from spotlight.interactions import Interactions
SEED=20
BPRMF=None

Collecting spotlight
  Cloning https://github.com/cmacdonald/spotlight.git (to revision master) to /tmp/pip-install-qonfzs21/spotlight_2298eb45cd964dfeb9de5007e8cd5528
  Running command git clone -q https://github.com/cmacdonald/spotlight.git /tmp/pip-install-qonfzs21/spotlight_2298eb45cd964dfeb9de5007e8cd5528
Building wheels for collected packages: spotlight
  Building wheel for spotlight (setup.py) ... [?25l[?25hdone
  Created wheel for spotlight: filename=spotlight-0.1.6-py3-none-any.whl size=34106 sha256=03e55975a13956ccc7d14ab47ac7e75700973a1d29a6afd68219f854a5045277
  Stored in directory: /tmp/pip-ephem-wheel-cache-kjoslf4a/wheels/1c/2a/31/d187173520bc800643df4e3d1f97dee21d2133ba41085704ed
Successfully built spotlight
Installing collected packages: spotlight
Successfully installed spotlight-0.1.6


In [None]:
#load in the csv files
ratings_df = pd.read_csv("ratings.csv")
books_df = pd.read_csv("books.csv")
to_read_df = pd.read_csv("to_read.csv")
test = pd.read_csv("test.csv")

In [None]:
#cut down the number of items and users
counts=ratings_df[ratings_df["book_id"] < 2000].groupby(["book_id"]).count().reset_index()
valid_books=counts[counts["user_id"] >= 10][["book_id"]]

books_df = books_df.merge(valid_books, on="book_id")
ratings_df = ratings_df[ratings_df["user_id"] < 2000].merge(valid_books, on="book_id")
to_read_df = to_read_df[to_read_df["user_id"] < 2000].merge(valid_books, on="book_id")
test = test[test["user_id"] < 2000].merge(valid_books, on="book_id")


#stringify the id columns
def str_col(df):
  if "user_id" in df.columns:
    df["user_id"] = "u" + df.user_id.astype(str)
  if "book_id" in df.columns:
    df["book_id"] = "b" + df.book_id.astype(str)

str_col(books_df)
str_col(ratings_df)
str_col(to_read_df)
str_col(test)

In [None]:
from collections import defaultdict
from itertools import count

from spotlight.cross_validation import random_train_test_split

iid_map = defaultdict(count().__next__)


rating_iids = np.array([iid_map[iid] for iid in ratings_df["book_id"].values], dtype = np.int32)
test_iids = np.array([iid_map[iid] for iid in test["book_id"].values], dtype = np.int32)
toread_iids = np.array([iid_map[iid] for iid in to_read_df["book_id"].values], dtype = np.int32)


uid_map = defaultdict(count().__next__)
test_uids = np.array([uid_map[uid] for uid in test["user_id"].values], dtype = np.int32)
rating_uids = np.array([uid_map[uid] for uid in ratings_df["user_id"].values], dtype = np.int32)
toread_uids = np.array([uid_map[iid] for iid in to_read_df["user_id"].values], dtype = np.int32)


uid_rev_map = {v: k for k, v in uid_map.items()}
iid_rev_map = {v: k for k, v in iid_map.items()}


rating_dataset = Interactions(user_ids=rating_uids,
                               item_ids=rating_iids,
                               ratings=ratings_df["rating"].values,
                               num_users=len(uid_rev_map),
                               num_items=len(iid_rev_map))

toread_dataset = Interactions(user_ids=toread_uids,
                               item_ids=toread_iids,
                               num_users=len(uid_rev_map),
                               num_items=len(iid_rev_map))

test_dataset = Interactions(user_ids=test_uids,
                               item_ids=test_iids,
                               num_users=len(uid_rev_map),
                               num_items=len(iid_rev_map))

print(rating_dataset)
print(toread_dataset)
print(test_dataset)

#here we define the validation set
toread_dataset_train, validation = random_train_test_split(toread_dataset, random_state=np.random.RandomState(SEED))

num_items = test_dataset.num_items
num_users = test_dataset.num_users

<Interactions dataset (1999 users x 1826 items x 124762 interactions)>
<Interactions dataset (1999 users x 1826 items x 135615 interactions)>
<Interactions dataset (1999 users x 1826 items x 33917 interactions)>


In [None]:
def getAuthorTitle(iid):
  bookid = iid_rev_map[iid]
  row = books_df[books_df.book_id == bookid]
  return row.iloc[0]["authors"] + " / " + row.iloc[0]["title"]

print("iid 0: " + getAuthorTitle(0) )

iid 0: Carlos Ruiz Zafón, Lucia Graves / The Shadow of the Wind (The Cemetery of Forgotten Books,  #1)


In [None]:
from spotlight.evaluation import mrr_score, precision_recall_score

class dummymodel:
  
  def __init__(self, numitems):
    self.predictions=np.zeros(numitems)
  
  #uid is the user we are requesting recommendations for;
  #returns an array of scores, one for each item
  def predict(self, uid):
    #this model returns all zeros, regardless of userid
    return( self.predictions )

#lets evaluate how the effeciveness of dummymodel

print(mrr_score(dummymodel(num_items), test_dataset, train=rating_dataset, k=100).mean())
#as expected, a recommendation model that gives 0 scores for all items obtains a MRR score of 0

0.0


In [None]:
#note that mrr_score() displays a progress bar if you set verbose=True
print(mrr_score(dummymodel(num_items), test_dataset, train=rating_dataset, k=100, verbose=True).mean())


1999it [00:00, 2811.63it/s]

0.0





#  Combination of Recommendation Models

## Explicit & Implicit Matrix Factorisation Models



In [None]:
# Add your solution here
from spotlight.factorization.explicit import ExplicitFactorizationModel
import time  

emodel = ExplicitFactorizationModel(n_iter=10,
                                    embedding_dim=32, #this is Spotlight default
                                    use_cuda=False,
                                    random_state=np.random.RandomState(SEED) # ensure results are repeatable
)
emodel.fit(rating_dataset,verbose=True)

Epoch 0: loss 3.8710271667261593
Epoch 1: loss 0.7940810446123607
Epoch 2: loss 0.6382512643200452
Epoch 3: loss 0.5217335281557725
Epoch 4: loss 0.44844855655167926
Epoch 5: loss 0.40543351120880394
Epoch 6: loss 0.3823863151254224
Epoch 7: loss 0.36336620252762664
Epoch 8: loss 0.35137936695799477
Epoch 9: loss 0.3396692546237199


In [None]:
from spotlight.evaluation import mrr_score
mrr_score(emodel,test_dataset,train=rating_dataset,k=100,verbose=True).mean()

1999it [00:02, 957.71it/s] 


0.05898399982013507

In [None]:
from spotlight.factorization.implicit import ImplicitFactorizationModel
import time  

imodel = ImplicitFactorizationModel(n_iter=10, 
                                    embedding_dim=32, #this is Spotlight default
                                    use_cuda=False,
                                    random_state=np.random.RandomState(SEED) # ensure results are repeatable
)
imodel.fit(toread_dataset_train,verbose=True)

Epoch 0: loss 0.7677980539090229
Epoch 1: loss 0.53877861825925
Epoch 2: loss 0.47017199658560305
Epoch 3: loss 0.428322009882837
Epoch 4: loss 0.39839018825090156
Epoch 5: loss 0.368275504770144
Epoch 6: loss 0.3473479778699155
Epoch 7: loss 0.32980164804689166
Epoch 8: loss 0.31870100696413023
Epoch 9: loss 0.3048194432103971


In [None]:
mrr_score(imodel,test_dataset,train=toread_dataset,k=100,verbose=True).mean()

1999it [00:02, 966.74it/s]


0.3344826076413641

In [None]:
BPRMF=ImplicitFactorizationModel(n_iter=10, 
                                    embedding_dim=32, #this is Spotlight default
                                    use_cuda=False,
                                    random_state=np.random.RandomState(SEED), # ensure results are repeatable
                                  loss='bpr'
)
BPRMF.fit(toread_dataset_train,verbose=True)

Epoch 0: loss 0.33895447579616644
Epoch 1: loss 0.19644999289709442
Epoch 2: loss 0.15870640168563938
Epoch 3: loss 0.14147728193059284
Epoch 4: loss 0.132827276100387
Epoch 5: loss 0.12213623321632731
Epoch 6: loss 0.11668406535853755
Epoch 7: loss 0.11047121562626001
Epoch 8: loss 0.10888675406996934
Epoch 9: loss 0.10400472129782978


In [None]:
mrr_score(BPRMF,test_dataset,train=toread_dataset,k=100,verbose=True).mean() #mrr_score the higher the better

1999it [00:02, 947.26it/s]


0.41698419515356516

##  Hybrid Model



In [None]:
def test_Hybrid_a(combsumObj):
  for i, u in enumerate([5, 20]):
    print("Hybrid a test case %d" % i)
    print(np.count_nonzero(combsumObj.predict(u) > 1))

def test_Hybrid_b(pipeObj):
  for i, iid in enumerate([3, 0]):
    print("Hybrid b test case %d" % i)
    print(pipeObj.predict(0)[iid])



In [None]:
# Add your solutions here and evaluate them

class linearModel:
  def _init_(self, model1, model2):
      self.model1 = model1
      self.model2 = model2
 
  def predict(self, uid):
      normalisation_Score_1 = minmax_scale(self.model1.predict(uid), feature_range=(0,1), axis=0)
      normalisation_Score_2 = minmax_scale(self.model2.predict(uid), feature_range=(0,1), axis=0)
      return (normalisation_Score_1 + normalisation_Score_2)  #why returns 0 regardless of uiid?? #check later

In [None]:
class pipeModel:
  def _init_(self, model1, model2):
      self.model1 = model1
      self.model2 = model2
 
  def predict(self, uid):
    model_1_prediction = self.model_1.prediction(uid)
    model_2_prediction = self.model_2.prediction(uid)
    
    top_100_imf = np.argsort(model_1_prediction)[-100:][::-1]#top 100 items
    not_in_imf = np.argsort(model_1_prediction)[::-1][100:] #nearest
 
    final_prediction = model_1_prediction
 
    final_prediction[top_100_imf] = model_2_predict[top_100_imf] #reranking
    final_prediction[not_in_imf] = 0    #assinging zero for remaining predictions
 
    return final_prediction

In [None]:
#Now test your hybrid approaches for the quiz

test_Hybrid_a(linearModel)
test_Hybrid_b(pipeModel)


Hybrid a test case 0


TypeError: ignored

#  Analysing Recommendation Models

In [None]:
from typing import Sequence, Tuple

def get_top_K(model, uid : int, k : int) -> Tuple[ Sequence[int], Sequence[float],  np.ndarray ] :
  #returns iids, their (normalised) scores in descending order, and item emebddings for the top k predictions of the given uid.

  from sklearn.preprocessing import minmax_scale

  from scipy.stats import rankdata
  # get scores from model
  scores = model.predict(uid)

  # map scores into rank 0..1 over the entire item space
  scores = minmax_scale(scores)

  #compute their ranks  
  ranks = rankdata(-scores)
  
  # get and filter iids, scores and embeddings
  rtr_scores = scores[ranks <= k]
  rtr_iids = np.argwhere(ranks <= k).flatten()
  if hasattr(model, '_net'):
    embs = model._net.item_embeddings.weight[rtr_iids]
  else:
    # not a model that has any embeddings
    embs = np.zeros([k,1])
  
  # identify correct ordering using numpy.argsort()
  ordering = (-1*rtr_scores).argsort()
  
  #return iids, scores and their embeddings in descending order of score
  return rtr_iids[ordering], rtr_scores[ordering], embs[ordering]

if BPRMF is not None:
  iids, scores, embs = get_top_K(BPRMF, 0, 10)
  print("Returned iids: %s" % str(iids))
  print("Returned scores: %s" % str(scores))
  print("Returned embeddings: %s" % str(embs))
else:
  print("You need to define BPRMF in Task 1")



Returned iids: [ 23 108  21  33   9  81  52 254  16   3]
Returned scores: [1.         0.9895131  0.9848315  0.92250896 0.9070817  0.90654314
 0.9005319  0.89310133 0.88378096 0.8836929 ]
Returned embeddings: tensor([[-0.0453,  1.3716, -0.8307, -1.2616,  1.6700,  1.0161,  1.1168,  2.3530,
         -1.2027,  0.8522, -1.0941, -0.6865, -0.5725, -2.0335, -1.2591,  0.6154,
         -0.1374, -1.6868, -1.8615, -0.7514,  1.9909, -0.3909,  1.9239,  1.3293,
         -1.2834, -0.4520,  1.1338,  0.3467,  2.5169, -2.1587,  1.2310,  1.1670],
        [ 0.1239,  1.1004,  0.0531, -1.1045,  1.9932,  1.5049,  1.0011,  1.9734,
         -1.6322, -0.8913, -0.6372,  0.7721, -1.1422, -2.2424, -1.1936, -0.5770,
          0.0762, -1.0283, -1.2807, -2.0889,  2.8154, -0.9600, -0.1419,  0.8408,
         -1.6067, -1.2905,  1.9169,  1.3988,  1.8646, -2.2028,  0.5365,  0.2022],
        [ 0.3845,  0.8188, -0.1892, -1.1793,  2.1731,  0.6669,  1.1271,  1.4538,
         -1.2173, -0.5447, -1.6713,  0.5249, -0.6132, -3.1082

## Evaluation of Non-personalised Models


In [None]:
class StaticModel:
  
  def __init__(self, staticscores):
    self.numitems = len(staticscores)
    #print(self.numitems)
    assert isinstance(staticscores, np.ndarray), "Expected a numpy array"
    assert staticscores.dtype == np.float32 or staticscores.dtype == np.float64, "Expected a numpy array of floats"
    self.staticscores = staticscores
  
  def predict(self, uid):
    #this model returns the same scores for each user    
    return self.staticscores

In [None]:
# Add your solution here
average_rating=ratings_df['rating'].mean
average_rating
mod=StaticModel(average_rating.values)

AttributeError: ignored

## Qualiatively Examining Recommendations



In [None]:
# Add your solution here
def book_recommendation(uid : int):
  #books that the user previously shelved
  book=toread_dataset.item_ids[toread_dataset.user_ids == uid]
  for iid in book:
    previously_shelved=getAuthorTitle(iid)
    print ("A:", previously_shelved)  
  
  #books that the user read in the future
  book=test_dataset.item_ids[test_dataset.user_ids == uid]
  for iid in book:
    future_books=getAuthorTitle(iid)
    print ("B:", future_books)

  #top 10 books recommended by BPRMF
  recommendation_bprmf=get_top_K(BPRMF,uid,10)
  recommendation_bprmf=recommendation_bprmf[0]
  for iid in recommendation_bprmf:
    top_10=getAuthorTitle(iid)
    print("C:", top_10)
  
book_recommendation(1805)

A: Stieg Larsson, Reg Keeland / The Girl Who Kicked the Hornet's Nest (Millennium, #3)
A: Suzanne Collins / Mockingjay (The Hunger Games, #3)
A: Dennis Lehane / Shutter Island
A: Suzanne Collins / Catching Fire (The Hunger Games, #2)
A: Paula Hawkins / The Girl on the Train
A: Robert Ludlum / The Bourne Supremacy (Jason Bourne, #2)
A: John Grisham / The Client
A: Thomas Harris / The Silence of the Lambs  (Hannibal Lecter, #2)
A: Daphne du Maurier, Sally Beauman / Rebecca
A: Robert Ludlum / The Bourne Identity (Jason Bourne, #1)
A: Robert Galbraith, J.K. Rowling / The Cuckoo's Calling (Cormoran Strike, #1)
A: Stephen King / Misery
A: Michael Crichton / Jurassic Park (Jurassic Park, #1)
A: Robert Ludlum / The Bourne Ultimatum (Jason Bourne, #3)
A: Stephen King, Bernie Wrightson / The Stand
A: Michael Crichton / The Andromeda Strain
A: Thomas Harris / Red Dragon (Hannibal Lecter, #1)
A: Lee Child / Die Trying (Jack Reacher, #2)
A: Lee Child / Worth Dying For (Jack Reacher, #15)
A: Lee Chi

#  Diversity of Recommendations



## . Measuring Intra-List Diversity




In [None]:
# Add your solution here
def measure_ild(top_books : Sequence[int], K : int=5) -> float:
  ILD = 0.0
  return ILD

## Task 6. Implement MMR Diversification 



In [None]:
from typing import Sequence
def mmr(iids : Sequence[int], scores : Sequence[float], embs : np.ndarray, alpha : float) -> Sequence[int]:

  assert len(iids) == len(scores)
  assert len(iids) == embs.shape[0]
  assert len(embs.size()) == 2


  rtr_iids=iids
  
  #input your solution here returns a re-ordering of iids, such that the first ranked item is first in the list

  return rtr_iids

In [None]:
def run_MMR_testcases(mmrfn):
  example_embeddings1 = torch.tensor([[1.0,1.0],[1.0,1.0],[0,1.0],[0.1, 1.0]])
  example_embeddings2 = torch.tensor([[1.0,1.0],[1.0,1.0],[0.02,1.0],[0.01,1.0]])
  print("Testcase 0 : %s" % mmrfn([1,2,3,4], [0.5, 0.5, 0.5, 0.5],  example_embeddings1, 0.5)[0] )
  print("Testcase 1 : %s" % mmrfn([1,2,3,4], [0.5, 0.5, 0.5, 0.5],  example_embeddings1, 0.5)[1] )
  print("Testcase 2 : %s" % mmrfn([1,2,3,4], [4, 3, 2, 1],  example_embeddings1, 1)[1] )
  print("Testcase 3 : %s" % mmrfn([1,2,3,4], [0.99, 0.98, 0.97, 0.001],  example_embeddings2, 0.001)[1] )
  print("Testcase 4 : %s" % mmrfn([1,2,3,4], [0.99, 0.98, 0.97, 0.001],  example_embeddings2, 0.5)[1] )

run_MMR_testcases(mmr)

Testcase 0 : 1
Testcase 1 : 2
Testcase 2 : 2
Testcase 3 : 2
Testcase 4 : 2
