In [1]:
from fastai.tabular.all import *
from fastai.collab import *

In [2]:
path = untar_data(URLs.ML_100k)

ratings = pd.read_csv(path/'u.data', delimiter='\t', header=None, names=['user','movie','rating','timestamp'])
ratings.head()

Unnamed: 0,user,movie,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
pd.crosstab(ratings.user, ratings.movie, values=ratings.rating, aggfunc='sum')

movie,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [4]:
movies = pd.read_csv(path/'u.item',  delimiter='|', encoding='latin-1',
                         usecols=(0,1), names=('movie','title'), header=None)
ratings = ratings.merge(movies)
ratings.head()

Unnamed: 0,user,movie,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [5]:
dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=64)
dls.show_batch()

Unnamed: 0,user,title,rating
0,90,"Birdcage, The (1996)",5
1,295,Alice in Wonderland (1951),4
2,466,Jackie Brown (1997),3
3,152,"Long Kiss Goodnight, The (1996)",3
4,536,Notorious (1946),4
5,233,Star Wars (1977),3
6,387,Wes Craven's New Nightmare (1994),2
7,623,Contact (1997),4
8,501,"Hate (Haine, La) (1995)",5
9,275,"Abyss, The (1989)",4


In [6]:
learn = collab_learner(dls, n_factors=50, y_range=(0, 5.5))
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.880374,0.959078,00:20
1,0.662507,0.903464,00:10
2,0.528663,0.875617,00:10
3,0.461903,0.864924,00:11
4,0.437305,0.859917,00:11


In [7]:
learn.export('movie-recommender.pkl')

In [8]:
learn = load_learner('movie-recommender.pkl')

learn.model

EmbeddingDotBias(
  (u_weight): Embedding(944, 50)
  (i_weight): Embedding(1665, 50)
  (u_bias): Embedding(944, 1)
  (i_bias): Embedding(1665, 1)
)

In [27]:
user_ratings = [(318, 1), (50, 1), (260, 1), (527, 1), (1721, 1), (1685, 5)]

In [28]:
user_ratings_dicts = []
for (movie_id, rating) in user_ratings:
    user_ratings_dicts.append({"user": 10000, "movie": movie_id, "rating": rating})

new_ratings = pd.concat([ratings, pd.DataFrame(user_ratings_dicts)], ignore_index=True)
new_ratings.tail()

Unnamed: 0,user,movie,rating,timestamp,title
100001,10000,50,1,,
100002,10000,260,1,,
100003,10000,527,1,,
100004,10000,1721,1,,
100005,10000,1685,5,,


In [29]:
crosstab = pd.crosstab(new_ratings['user'], new_ratings['movie'], values=new_ratings['rating'], aggfunc='sum').fillna(0)
crosstab.tail()

movie,1,2,3,4,5,6,7,8,9,10,...,1675,1676,1677,1678,1679,1680,1681,1682,1685,1721
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
943,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0


In [30]:
other_users = crosstab.values[:-1]
new_user = crosstab.values[-1].reshape(1, -1)

similarities = nn.CosineSimilarity()(tensor(other_users), tensor(new_user))
similarities[:5]

tensor([0.0174, 0.0301, 0.0653, 0.0758, 0.0174])

In [31]:
top5 = similarities.topk(5)
top5

torch.return_types.topk(
values=tensor([0.1354, 0.0960, 0.0934, 0.0907, 0.0887]),
indices=tensor([511, 699, 875, 102, 211]))

In [32]:
user_vectors = learn.u_weight.weight[1+top5.indices,:]

new_user_vector = user_vectors.mean(dim=0, keepdim=True)
new_user_vector

tensor([[-0.0527, -0.1143, -0.0548,  0.1783, -0.1016, -0.1448, -0.0763,  0.0536,
         -0.0298, -0.0303,  0.0637, -0.1193, -0.1556, -0.0814,  0.1642, -0.0132,
          0.0276,  0.0238,  0.1427, -0.0774,  0.0687, -0.0476,  0.0514, -0.1181,
         -0.0966,  0.0518, -0.0150, -0.0485,  0.0768, -0.0794, -0.1169, -0.1021,
         -0.0513, -0.0014,  0.0543, -0.1796, -0.0910, -0.0294,  0.1608,  0.1068,
          0.0101, -0.0459,  0.1718,  0.1099,  0.0841,  0.0689,  0.0944, -0.0725,
          0.0756,  0.0975]], grad_fn=<MeanBackward1>)

In [33]:
user_biases = learn.u_bias.weight[1+top5.indices,:]
new_user_bias = user_biases.mean()
new_user_bias

tensor(0.1006, grad_fn=<MeanBackward0>)

In [34]:
pred_ratings = torch.matmul(new_user_vector, learn.i_weight.weight.T) + learn.i_bias.weight.T + new_user_bias
pred_ratings

tensor([[ 0.1025, -0.1139,  0.2662,  ...,  0.0503,  0.3231,  0.1471]],
       grad_fn=<AddBackward0>)

In [35]:
top5_ratings = pred_ratings.topk(5)
recommendations = learn.classes['title'][top5_ratings.indices.tolist()[0]]

for i, movie in enumerate(recommendations):
    print(f'{i+1}. {movie}')

1. Schindler's List (1993)
2. Star Wars (1977)
3. Casablanca (1942)
4. Shawshank Redemption, The (1994)
5. Godfather, The (1972)
