In [None]:
# Mount Google Drive in Colab; the dataset archive used below is read from it
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Data from: https://grouplens.org/datasets/movielens/1m/
# Uploaded to drive and imported

!unzip /content/drive/MyDrive/Import\ to\ Colab/Datasets/ml-1m.zip

Archive:  /content/drive/MyDrive/Import to Colab/Datasets/ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [None]:
import numpy as np
import pandas as pd

In [None]:
# movies.dat is '::'-delimited with no header row.
# A separator longer than one character is only handled by pandas' python
# parser; requesting it explicitly avoids the ParserWarning emitted when pandas
# silently falls back from the C engine.
movie_data = pd.read_csv('/content/ml-1m/movies.dat',
                        sep = "::",
                        names = ['MovieID', 'Title', 'Genres'],
                        header = None,
                        engine = 'python',
                        encoding = 'iso 8859-1')

  return func(*args, **kwargs)


In [None]:
# users.dat has five '::'-separated fields per row:
# UserID::Gender::Age::Occupation::Zip-code.
# All five must be named: with only four names pandas uses the first field
# (UserID) as the index and shifts every remaining label one column left.
# engine='python' is required for the multi-character separator.
user_data = pd.read_csv('/content/ml-1m/users.dat',
                        sep = "::",
                        names = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
                        header = None,
                        engine = 'python',
                        encoding = 'iso 8859-1')

In [None]:
# Ratings live in ratings.dat (UserID::MovieID::Rating::Timestamp), not in
# users.dat. Reading users.dat here put age codes in 'MovieID' and occupation
# codes in 'Rating', which is why the model scored far outside the 1-5 scale.
# engine='python' is required for the multi-character separator.
rating_data = pd.read_csv('/content/ml-1m/ratings.dat',
                        sep = "::",
                        names = ['UserID', 'MovieID', 'Rating', 'TimeStamp'],
                        header = None,
                        engine = 'python',
                        encoding = 'iso 8859-1')

In [None]:
# To implement SVD (Singular Value Decomposition) 
# Optimised for recommender system

!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 KB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp38-cp38-linux_x86_64.whl size=3366451 sha256=06bad3e4a526ab613272cbc49189fd3c925fd1aa53f63f21b055ca53f7f0931b
  Stored in directory: /root/.cache/pip/wheels/af/db/86/2c18183a80ba05da35bf0fb7417aac5cddbd93bcb1b92fd3ea
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [None]:
# To implement ALS (Alternating Least Square)
# Enable GPU here, has implementation available.
# If you ran stuff before on CPU restart and run everything with GPU in colab.

# Careful when running in local enviromnent, use conda install.
# pip install may break python environment due to conflicts in dependencies.

!pip install implicit
# !conda install implicit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit
  Downloading implicit-0.6.2-cp38-cp38-manylinux2014_x86_64.whl (18.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.6/18.6 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: implicit
Successfully installed implicit-0.6.2


In [None]:
from surprise import SVD, KNNWithMeans
# KNNWithMeans not used in this notebook
# KNNWithMeans slower than SVD as an algorithm

from surprise import Dataset, Reader
from surprise.model_selection import cross_validate

import implicit
from implicit.als import AlternatingLeastSquares
# Check AlternatingLeastSquare available in PySpark
# Useful implementation for distributed computing
# Some of the syntax may change.

In [None]:
# Surprise does not operate on ordinary DataFrames directly; it converts them
# into its own internal structure. Being built for recommender systems, it
# accepts exactly three columns. Their names are irrelevant, but their order is
# fixed: user, item, rating.

surprise_columns = ['UserID', 'MovieID', 'Rating']
rating_data_surprise_package = rating_data[surprise_columns]

In [None]:
# Surprise needs to be told the rating scale explicitly (here 1 to 5).

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    rating_data_surprise_package,
    reader,
)

In [None]:
# Matrix-factorisation model: 50 latent factors, 20 training epochs.
algo_svd = SVD(
    n_factors=50,
    n_epochs=20,
)
# Neighbourhood model with 20 neighbours (instantiated for reference only).
algo_knn = KNNWithMeans(
    k=20,
)

In [None]:
# Surprise follows scikit-learn conventions but works on its own data structures.
# 5-fold cross-validation of the SVD model, reporting RMSE and MAE per fold.

cross_validate(algo_svd, data, measures=["rmse", "mae"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    7.2429  6.9337  6.9369  7.0973  7.1251  7.0672  0.1183  
MAE (testset)     5.8005  5.5033  5.5439  5.7227  5.7020  5.6545  0.1125  
Fit time          0.04    0.03    0.03    0.03    0.02    0.03    0.00    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    


{'test_rmse': array([7.24288873, 6.93369735, 6.93686047, 7.09730151, 7.12506535]),
 'test_mae': array([5.80049669, 5.50331126, 5.54387417, 5.72268212, 5.70198675]),
 'fit_time': (0.03599834442138672,
  0.02501988410949707,
  0.025955677032470703,
  0.025440216064453125,
  0.02435922622680664),
 'test_time': (0.006449699401855469,
  0.006237983703613281,
  0.0062792301177978516,
  0.006139516830444336,
  0.005983114242553711)}

In [None]:
# Now we build the actual model.
# This has not happened yet: cross-validation only reports scores, and the
# models fitted for each fold are discarded once the scores are obtained.
# build_full_trainset() turns the whole dataset into a trainset to fit on.

d = data.build_full_trainset()

In [None]:
# Fit the SVD model on the full trainset so it can be used for predictions below
algo_svd.fit(d)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f40aa884970>

In [None]:
# Show every rating row that belongs to user 1
rating_data.loc[rating_data['UserID'] == 1]

Unnamed: 0,UserID,MovieID,Rating,TimeStamp


In [None]:
# Predict user 1's rating for a MovieID (2500) that we checked was not present
# in the dataset. The estimated rating is the `est` field of the result.

algo_svd.predict(uid=1, iid=2500)

Prediction(uid=1, iid=2500, r_ui=None, est=5, details={'was_impossible': False})

In [None]:
# Use the unique MovieIDs to make sure each movie is only considered once.
# At present this arbitrarily picks up whichever occurrence it comes across first.

In [None]:
# Inspect the MovieID column (pandas truncates the display of long Series)
rating_data.loc[:, 'MovieID']

1        1
2       56
3       25
4       45
5       25
        ..
6036    25
6037    45
6038    56
6039    45
6040    25
Name: MovieID, Length: 6040, dtype: int64

In [None]:
# Two ways to collect the distinct MovieIDs:
#   1. Series.unique() (note the call parentheses), which returns an array
#   2. set(...), used here so that set difference can be applied later

# unique_movie_ids = rating_data['MovieID'].unique()
unique_movie_ids = set(rating_data['MovieID'])
unique_movie_ids

{1, 18, 25, 35, 45, 50, 56}

In [None]:
# MovieIDs that user 1 HAS rated (row filter and column pick in a single .loc)

is_user_1 = rating_data['UserID'] == 1
unique_movie_ids_user_1 = set(rating_data.loc[is_user_1, 'MovieID'])
unique_movie_ids_user_1

set()

In [None]:
# MovieIDs not rated by user 1: all known MovieIDs minus those user 1 rated.
# (The original referenced the misspelt name `unique_movie_id_user_1`, which
# raised NameError and left every later cell without its input.)

user_not_rated_movie_ids = unique_movie_ids - unique_movie_ids_user_1

NameError: ignored

In [None]:
# Score every movie user 1 has not rated and collect
# (MovieID, estimated rating) pairs. `est` is the fourth field of Prediction.
D = [
    (movie_id, algo_svd.predict(uid=1, iid=movie_id).est)
    for movie_id in user_not_rated_movie_ids
]

In [None]:
# Tabulate the (MovieID, estimated rating) pairs
prediction_columns = ['MovieID', 'est_rating']
df = pd.DataFrame(D, columns=prediction_columns)

In [None]:
# Ten highest estimated ratings for user 1
ranked_predictions = df.sort_values(by='est_rating', ascending=False)
ranked_predictions.iloc[:10]

In [None]:
# Can increase epochs
# Can increase dimensions by changing n_factors
# This way results probably won't differ since everyone's results will be more accurate.

In [None]:
# For SVD we use the tall (long) structure: one row per (user, movie, rating).
# For ALS we use the wide structure: one row per user, one column per movie.
# To go from tall to wide we use a pivot table.

# Doing so takes up a lot more memory, since there will be many missing values.
# We ask pandas to impute the missing values (the movies a user has not rated) with 0,
# so a lot of memory will be used to store 0s.

In [None]:
# Pivot the tall ratings table into a wide user x movie matrix.
# Movies a user has not rated become 0. The keyword is `fill_value` (singular);
# `fill_values` is not a pivot_table parameter and raises TypeError.
ratings = pd.pivot_table(data = rating_data,
                         index = 'UserID',
                         columns = 'MovieID',
                         values = 'Rating',
                         fill_value = 0)

In [None]:
# Change it to a CSR Matrix
# CSR matrix is the one where we do not store the zero entries
# Helps limit wastage of memory

In [None]:
# Prefix the row and column labels so users and movies are easy to tell apart
ratings.index = [f'UserID_{user_id}' for user_id in ratings.index]
ratings.columns = [f'MovieID_{movie_id}' for movie_id in ratings.columns]

In [None]:
# Preview the first rows of the wide user x movie matrix
ratings.head()

In [None]:
# We call in CSR Matrix from scipy

In [None]:
# Keep the user labels, and take a dense NumPy copy of the rating values
train_users = ratings.index
train_x = np.array(ratings)

In [None]:
from scipy.sparse import csr_matrix, random

In [None]:
# Convert the dense array to CSR form so the zero entries are not stored.
# Rows follow `ratings` (users); columns are movies.
user_items = csr_matrix(train_x, dtype= np.float64)
user_items

In [None]:
# Initialise the model. The class is AlternatingLeastSquares; the original
# `AlternativeLeastSquares` does not exist and raised AttributeError.
model = implicit.als.AlternatingLeastSquares(factors=50)

# Train on the sparse user x item matrix.
# From implicit 0.5 onward (0.6.2 is installed above), fit() takes the matrix
# in user/item orientation, which is how `user_items` is already laid out.
# Transposing it would make the model treat movies as users, and the later
# recommend() call would no longer line up with the rows of `user_items`.
model.fit(user_items)

# Learned latent factors, one row per item / per user.
item_factors, user_factors = model.item_factors, model.user_factors
# Default 15 iterations

In [None]:
# ALS is fast since each alternating step is basically an OLS solve.
# Here the $\beta$ calculation is really fast since it is just the closed form
# $\beta = (X^T X)^{-1} X^T y$.
# ALS converges really fast as well.

# Therefore we use ALS.
# It may be suboptimal in terms of results compared to SVD.
# However, it will be much faster since each step is linear.

# Because of the above, ALS may perform better than SVD in practice even if SVD gives a better RMSE value.

In [None]:
# 2000 is the user we chose: it is a row position in `user_items`, not a movie ID.
# NOTE(review): row 2000 is positional, so it presumably corresponds to label
# 'UserID_2001' if UserIDs are contiguous from 1 - confirm via train_users[2000].
# Ask for the top N=5 items for that user, passing that user's own row.

recs = model.recommend(2000, user_items[2000], N=5)
recs

In [None]:
# The recommendation scores may not match the integer rating values.
# This is not really a problem:
# we can just extract the MovieID of the top recommendation and recommend that.