<a href="https://colab.research.google.com/github/SSGrady/Rec-System/blob/main/Recommendation_System_StevenGrady.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Download the MovieLens "latest-small" archive next to the notebook.
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# server's error page under the .zip name (which would only surface later
# as a confusing "bad zip file" error during extraction).
! curl --fail https://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0  1782k      0 --:--:-- --:--:-- --:--:-- 1785k


## Movie Recommendation System: Collaborative Filtering + K-means + Matrix Factorization

In [2]:
import zipfile
# Unpack the downloaded archive into ./data (the context manager closes the file afterwards)
with zipfile.ZipFile(file='ml-latest-small.zip', mode='r') as zip_ref:
  zip_ref.extractall(path='data')

In [3]:
import pandas as pd
#|-- load the dataset tables --|

# movies.csv  -> movies_df,  noted dimensions (9742, 3)
# ratings.csv -> ratings_df, noted dimensions (100836, 4)
movies_df, ratings_df = map(
    pd.read_csv,
    ('data/ml-latest-small/movies.csv', 'data/ml-latest-small/ratings.csv'),
)

In [4]:
# Preview the first five rows of the movies table
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Preview the first five rows of the ratings table
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
# Lookup table: movieId -> title
movies_names = movies_df.set_index('movieId').loc[:, 'title'].to_dict()

# Dimensions of the (user x item) rating matrix, counted from the ratings actually present
n_users = ratings_df['userId'].unique().size
m_items = ratings_df['movieId'].unique().size
matrix_nm = n_users * m_items

# Share of the full matrix that holds an observed rating, in percent
fill_pct = round(ratings_df.shape[0] / matrix_nm * 100, 2)

print("Number of unique users", n_users)
print("Number of unique items", m_items)
print("Full ratings matrix will contain", matrix_nm, " elemnts")
print("Number of ratings is", ratings_df.shape[0])
print("So ", fill_pct, "% of the matrix is full")

# Take-aways, printed one per line
for note in (
    "This means that we have an incredibly sparse dataset matrix",
    "To work with global scale, storing a full matrix in memory will be the challenge",
    "Matrix factorization can realize the rating matrix implicitly (don't need all data!)",
):
  print(note)

Number of unique users 610
Number of unique items 9724
Full ratings matrix will contain 5931640  elemnts
Number of ratings is 100836
So  1.7 % of the matrix is full
This means that we have an incredibly sparse dataset matrix
To work with global scale, storing a full matrix in memory will be the challenge
Matrix factorization can realize the rating matrix implicitly (don't need all data!)


In [7]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
  """Matrix-factorization model: a rating is the dot product of two embeddings.

  Each user and each item owns a learnable vector of length ``n_factors``;
  the predicted score for a (user, item) pair is the sum of the element-wise
  product of the two vectors.

  Args:
    n_users: number of rows in the user embedding table.
    m_items: number of rows in the item embedding table.
    n_factors: length of every embedding vector.
  """

  def __init__(self, n_users, m_items, n_factors=20):
    super().__init__()
    # Lookup tables: row i holds the factor vector of user / item index i.
    self.user_factors = torch.nn.Embedding(n_users, n_factors)
    self.item_factors = torch.nn.Embedding(m_items, n_factors)

    # Start from small non-negative weights drawn from U(0, 0.05).
    torch.nn.init.uniform_(self.user_factors.weight, 0, 0.05)
    torch.nn.init.uniform_(self.item_factors.weight, 0, 0.05)

  def forward(self, data):
    """Score a batch of (user index, item index) pairs.

    Args:
      data: integer tensor whose column 0 holds user indices and column 1
        holds item indices.

    Returns:
      1-D tensor with one score per row of ``data``.
    """
    users, items = data[:, 0], data[:, 1]
    # Row-wise dot product of the two embedding batches.
    return (self.user_factors(users) * self.item_factors(items)).sum(1)

  def predict(self, user, item):
    """Score explicit user/item indices.

    ``forward`` expects a single two-column tensor, so the indices are packed
    into that layout here (the previous implementation passed two positional
    arguments to ``forward`` and raised ``TypeError``).

    Args:
      user: user index or indices (int, sequence of ints, or integer tensor).
      item: item index or indices (int, sequence of ints, or integer tensor).
        A single user may be paired with many items (and vice versa); the
        shorter side is broadcast.

    Returns:
      1-D tensor with one score per (user, item) pair.
    """
    # Build the indices on the same device as the embedding tables.
    device = self.user_factors.weight.device
    users = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
    items = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
    users, items = torch.broadcast_tensors(users, items)
    return self(torch.stack((users, items), dim=1))

In [8]:
# Creating the dataloader for PyTorch - helps transform data to ML readiness
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

# An efficient and convenient way to manage/process data for training
class RatingsDataLoader(Dataset):
  """Map-style dataset of (user index, item index) -> rating samples.

  Raw ``userId`` / ``movieId`` values are re-coded to contiguous 0-based
  indices (in order of first appearance) so they can address embedding rows.

  Args:
    ratings: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
      Defaults to the notebook-level ``ratings_df`` so the original
      no-argument call keeps working.

  Attributes set on the instance:
    ratings: copy of the input frame with both id columns re-coded.
    userid2idx / movieid2idx: raw id -> contiguous index.
    idx2userid / idx2movieid: contiguous index -> raw id.
    x: integer tensor, one row per rating: [user index, item index].
    y: tensor of ratings aligned with ``x``.
  """

  def __init__(self, ratings=None):
    if ratings is None:
      # Falls back to the frame already held in memory by the notebook.
      ratings = ratings_df
    # Work on a copy so the caller's frame keeps its raw ids.
    self.ratings = ratings.copy()

    # Distinct raw ids, in order of first appearance
    users = ratings.userId.unique()
    movies = ratings.movieId.unique()

    #--- Contiguous ids for users and movies ---

    # raw id -> index
    self.userid2idx = {o: i for i, o in enumerate(users)}
    self.movieid2idx = {o: i for i, o in enumerate(movies)}

    # index -> raw id (inverse lookups, e.g. to report recommendations)
    self.idx2userid = {i: o for o, i in self.userid2idx.items()}
    self.idx2movieid = {i: o for o, i in self.movieid2idx.items()}

    # Replace the raw ids in the working copy by their contiguous index.
    self.ratings['movieId'] = ratings['movieId'].map(self.movieid2idx)
    self.ratings['userId'] = ratings['userId'].map(self.userid2idx)

    # Select the feature columns explicitly so column 0 is always the user
    # and column 1 the item, regardless of the frame's column order.
    self.x = torch.tensor(self.ratings[['userId', 'movieId']].values)
    # NOTE(review): y keeps the dtype pandas gave the rating column
    # (presumably float64) - cast before comparing with float32 predictions.
    self.y = torch.tensor(self.ratings['rating'].values)

  # returns the number of samples in the dataset
  def __len__(self):
    return len(self.ratings)

  # returns the (features, rating) sample at the given index
  def __getitem__(self, index):
    return self.x[index], self.y[index]

In [10]:
# Number of training epochs. The misspelt name is kept on purpose: cells
# further down the notebook may refer to it.
num_empochs = 164
cuda = torch.cuda.is_available()

print("Is running on GPU: ", cuda)

model = MatrixFactorization(n_users, m_items, n_factors=12)
print("\n",model)

# Show the freshly initialised trainable parameters
for name, param in model.named_parameters():
  if param.requires_grad:
    print(name, param.data)

# Everything below runs once. It used to be indented into the loop above,
# which rebuilt the loss, optimizer, dataset and loader for every parameter.
if cuda:
  model = model.cuda()

# MSE loss
loss_fn = torch.nn.MSELoss()

# ADAM optimizer (created after the optional .cuda() move, as before)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Train data, served in shuffled mini-batches of 164 samples
# NOTE(review): the batch size equals num_empochs - confirm this is intended.
train_set = RatingsDataLoader()
train_loader = DataLoader(train_set, 164, shuffle=True)

Is running on GPU:  False

 MatrixFactorization(
  (user_factors): Embedding(610, 12)
  (item_factors): Embedding(9724, 12)
)
user_factors.weight tensor([[0.0225, 0.0466, 0.0468,  ..., 0.0394, 0.0133, 0.0312],
        [0.0157, 0.0400, 0.0372,  ..., 0.0488, 0.0461, 0.0208],
        [0.0247, 0.0041, 0.0017,  ..., 0.0044, 0.0252, 0.0489],
        ...,
        [0.0365, 0.0386, 0.0291,  ..., 0.0158, 0.0091, 0.0456],
        [0.0476, 0.0233, 0.0138,  ..., 0.0210, 0.0401, 0.0178],
        [0.0352, 0.0003, 0.0130,  ..., 0.0238, 0.0193, 0.0038]])
item_factors.weight tensor([[0.0014, 0.0242, 0.0429,  ..., 0.0474, 0.0453, 0.0101],
        [0.0070, 0.0188, 0.0155,  ..., 0.0035, 0.0292, 0.0208],
        [0.0301, 0.0290, 0.0032,  ..., 0.0139, 0.0039, 0.0275],
        ...,
        [0.0266, 0.0331, 0.0495,  ..., 0.0381, 0.0196, 0.0457],
        [0.0096, 0.0107, 0.0250,  ..., 0.0067, 0.0416, 0.0357],
        [0.0304, 0.0108, 0.0241,  ..., 0.0168, 0.0461, 0.0114]])
