In [44]:
!pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [45]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV files read below live under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
import pandas as pd
from surprise import Dataset
from surprise import Reader

In [47]:
# Load the ratings table; later cells rely on its userId, movieId and rating columns.
ratings = pd.read_csv('/content/drive/MyDrive/ml-latest-small/ratings.csv')

In [48]:
# Preview the first ten rows to sanity-check the loaded columns.
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [49]:
# Declare the rating scale as 0.5 to 5 for Surprise
# (presumably the MovieLens half-star scale; confirm against the data).
reader = Reader(rating_scale=(0.5, 5))

In [50]:
# Build a Surprise dataset from the dataframe; the three columns are passed
# in the order user, item, rating.
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [51]:
from surprise import KNNWithMeans

# Item-based neighbourhood model: similarities are computed between items
# (user_based=False) using the cosine measure.
sim_options = dict(name="cosine", user_based=False)
knn = KNNWithMeans(sim_options=sim_options)

In [52]:
# Build the training set from the whole dataset (no hold-out split is made here).
trainingSet = data.build_full_trainset()

In [53]:
# Fit the model on the full training set; this computes the cosine similarity matrix.
knn.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fef877253d0>

In [54]:
# Minimum predicted rating a movie needs in order to be kept as a recommendation.
threshold = 4.0

In [55]:
# Distinct user ids that appear in the ratings table.
all_users = list(set(ratings['userId'].values))

In [56]:
# Map every user id to its own empty list of rated movie ids.
# (A fresh list per key is required; dict.fromkeys would share a single list.)
user2movies = dict((uid, []) for uid in all_users)

In [57]:
# Collect the movies each user has already rated.
# Grouping once per user replaces the row-by-row .iloc lookups, which are very
# slow in pandas. Assigning the list (instead of appending to it) also keeps
# this cell idempotent: re-running it no longer duplicates every movie id.
# Within each group the original row order of the ratings table is preserved.
for user, movies in ratings.groupby('userId')['movieId']:
  user2movies[user] = movies.tolist()

In [58]:
# Load the movie catalogue; its movieId column defines the pool of recommendable movies.
movies_data = pd.read_csv('/content/drive/MyDrive/ml-latest-small/movies.csv')

In [59]:
# Preview the first ten catalogue rows.
movies_data.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [60]:
# Set of every movie id in the catalogue.
all_movies = set(movies_data['movieId'].values)


In [61]:
# Number of distinct movies in the catalogue.
len(all_movies)

9742

In [62]:
# One independent, initially empty candidate list per user id.
user2candidates = {uid: list() for uid in all_users}

In [63]:
# Candidate movies for a user are all catalogue movies the user has not rated.
# The rated movies are turned into a set once per user so each membership test
# is O(1); scanning the list for every (movie, user) pair made this cell
# needlessly slow. Each user's candidates keep the iteration order of
# `all_movies`, and assigning the list keeps the cell idempotent on re-runs.
for user, rated_movies in user2movies.items():
  seen = set(rated_movies)
  user2candidates[user] = [movie_id for movie_id in all_movies if movie_id not in seen]

In [64]:
# Per-user container for (movie id, predicted score) pairs; starts empty.
user2recommend = dict((uid, []) for uid in all_users)

In [65]:
# Score every candidate movie for every user and keep those whose predicted
# rating reaches `threshold`.
for user, candidates in user2candidates.items():
  recommended = []
  for movie in candidates:
    prediction = knn.predict(user, movie)
    # When the user or item is unknown to the training set, Surprise falls back
    # to a default estimate and flags the prediction as impossible. Such
    # fallback values are not real predictions, so never recommend on them.
    if prediction.details.get('was_impossible', False):
      continue
    if prediction.est >= threshold:
      recommended.append((movie, prediction.est))
  # Assign rather than append so re-running the cell does not duplicate entries.
  user2recommend[user] = recommended

In [66]:
# Print how many movies passed the threshold for each user, then the grand total.
per_user_counts = [len(recs) for recs in user2recommend.values()]
for n_recs in per_user_counts:
  print(n_recs)
count = sum(per_user_counts)
print(count)

5222
1409
118
1061
1285
3060
1050
1859
2606
826
3738
6306
3483
1237
1055
866
2756
2039
534
3500
1287
196
778
1413
5543
694
1951
271
3087
5275
3087
2755
2317
1305
4451
191
4013
933
2605
3730
551
2917
6750
1982
3354
4536
316
4198
3168
313
3732
5099
7734
365
973
3647
1184
3878
4088
1965
2629
3961
1169
2709
2126
3539
1505
656
5063
3388
2054
3388
2654
3551
557
263
2107
849
3811
4794
242
1990
673
2242
1954
2805
3717
3841
361
3700
1048
4470
6007
627
4093
2698
5018
2886
5390
3381
2616
1682
3336
1707
2581
5413
3630
3702
953
1568
1570
1623
2303
570
3232
887
1211
1527
4534
1645
1063
5461
1980
3078
2902
796
321
2968
3221
1603
436
418
405
2516
2608
2049
1911
2531
82
1808
1058
2886
685
1539
1038
1549
2498
883
456
2483
3863
2081
13
5508
2381
1729
2135
2218
607
796
2931
4424
1030
4561
1468
3116
1024
4028
5551
1071
5659
2283
1009
2530
2953
3991
1377
3061
3360
1121
709
2050
2420
2480
2110
4366
1675
4487
1771
2351
2391
3437
742
1485
1400
2019
2238
1682
784
2948
4605
2492
2694
3106
1415
3293
1258
1517
319

In [67]:
# Accumulator for the final [user, movie, score] rows.
ans = []

In [68]:
# Keep the three highest-scoring recommendations of each user as
# [user, movie, score] rows; users without recommendations contribute nothing.
for user in all_users:
  ranked = sorted(user2recommend[user], key=lambda pair: pair[1], reverse=True)
  ans.extend([user, movie, score] for movie, score in ranked[:3])

In [69]:
# Every row of `ans` is [user, movie, score], so three column names are needed;
# passing only two makes the DataFrame constructor raise a ValueError.
result = pd.DataFrame(data=ans, columns=['userId', 'movieId', 'score'])
result.head()

Unnamed: 0,userId,movieId,score
0,1,28,5.0
1,1,131098,5.0
2,1,131104,5.0
3,2,53,5.0
4,2,467,5.0


In [70]:
# Write the recommendations to Drive; index=False leaves the dataframe index out of the file.
result.to_csv('/content/drive/MyDrive/recommend.csv',index=False)