In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
!pip install scikit-surprise

from surprise.model_selection import cross_validate
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
import pandas as pd
from collections import defaultdict
from operator import itemgetter
import heapq
import pandas as pd
import numpy as np




In [33]:
df = pd.read_csv('/content/drive/MyDrive/deviants_pg5/complted data/preprocessed_data/movies_small.csv')
df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [34]:
def load_dataset():
    reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
    ratings_dataset = Dataset.load_from_file('/content/drive/MyDrive/deviants_pg5/complted data/preprocessed_data/ratings_small.csv', reader=reader)
    return (ratings_dataset)
dataset = load_dataset()



# Build a full Surprise training set from dataset
trainset = dataset.build_full_trainset()

##**Similarity Matrix**

####**KNNBasic** is basic collaborative filtering algorithm that uses KNN algorithm in predicting the recommendations. We use cosine similarity as the similarity score criterion.

####user_based or not? Whether similarities would be computed between users or between items. This has a huge impact on the performance of a prediction algorithm.


In [35]:
similarity_matrix = KNNBasic(sim_options={
        'name': 'cosine',
        'user_based': False
        })\
        .fit(trainset)\
        .compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [36]:
benchmark = []

# Perform cross validation
results = cross_validate(KNNBasic(), dataset, measures=['RMSE'], cv=3, verbose=False)

# Get results & append algorithm name
tmp = pd.DataFrame.from_dict(results).mean(axis=0)
tmp = tmp.append(pd.Series(['KNNBasic'], index=['Algorithm']))
benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNNBasic,0.95758,0.125785,2.596587


In [37]:
# Pick a random user ID, has to be a numeric string.
test_subject = '6'

# Get the top K items user rated
k = 20


In [38]:
# When using Surprise, there are RAW and INNER IDs.
# Raw IDs are the IDs, strings or numbers, you use when
# creating the trainset. The raw ID will be converted to
# an unique integer Surprise can more easily manipulate
# for computations.
#
# So in order to find an user inside the trainset, you
# need to convert their RAW ID to the INNER Id. Read
# Reference for more info https://surprise.readthedocs.io/en/stable/FAQ.html#what-are-raw-and-inner-ids
test_subject_iid = trainset.to_inner_uid(test_subject)

# Get the top K items we rated
test_subject_ratings = trainset.ur[test_subject_iid]
k_neighbors = heapq.nlargest(k, test_subject_ratings, key=lambda t: t[1])

In [39]:
# Default dict is basically a standard dictionary,
# the difference being that it doesn't throw an error
# when trying to access a key which does not exist,
# instead a new entry, with that key, is created.
candidates = defaultdict(float)

for itemID, rating in k_neighbors:
    try:
      similaritities = similarity_matrix[itemID]
      for innerID, score in enumerate(similaritities):
          candidates[innerID] += score * (rating / 5.0)
    except:
      continue

In [40]:
# Build a dictionary of movies the user has watched
watched = {}
for itemID, rating in trainset.ur[test_subject_iid]:
  watched[itemID] = 1

# Add items to list of user's recommendations
# If they are similar to their favorite movies,
# AND have not already been watched.
recommendations = []

position = 0
for itemID, rating_sum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
  if not itemID in watched:
    recommendations.append(trainset.to_raw_iid(itemID))
    position += 1
    if (position > 10): break # We only want top 10


In [41]:
recom = pd.DataFrame(recommendations,columns=['movieId'],dtype=int)

In [42]:
recom.astype(int)
pd.merge(df,recom,on='movieId')['title']

0                        Prefontaine (1997)
1                     Excess Baggage (1997)
2      Twin Peaks: Fire Walk with Me (1992)
3                 Where the Heart Is (2000)
4                         Black Rain (1989)
5                    Waterdance, The (1992)
6     Ferngully: The Last Rainforest (1992)
7                        American Me (1992)
8                         Open Range (2003)
9                     Running Scared (2006)
10                       Half Nelson (2006)
Name: title, dtype: object