In [2]:

# Download the MovieLens "ml-latest-small" archive and save it as ml-latest-small.zip in the working directory.
! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0   903k      0  0:00:01  0:00:01 --:--:--  903k


In [None]:
# Same download over HTTPS. The -o flag writes the archive to disk; without it
# curl streams the raw zip bytes into the cell output and nothing is saved.
! curl https://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

In [5]:
import zipfile
# Unpack every member of the downloaded archive under the 'storage' directory.
with zipfile.ZipFile(file='ml-latest-small.zip', mode='r') as zip_ref:
    zip_ref.extractall(path='storage')

In [6]:

# import the dataset
import pandas as pd
# Read both CSV files in one go: movies first, ratings second.
movies_df, ratings_df = map(
    pd.read_csv,
    ('storage/ml-latest-small/movies.csv', 'storage/ml-latest-small/ratings.csv'),
)

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): none of the visible cells read from /content/drive -- confirm this mount is still needed.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Per-column count of missing values in the ratings table.
ratings_df.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [7]:
# Report (rows, columns) for both tables; the second label starts on a new line.
print(
    'The dimensions of movies dataframe are:', movies_df.shape,
    '\nThe dimensions of ratings dataframe are:', ratings_df.shape,
)

The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [8]:
# Preview the first five rows of the movies table.
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
# Preview the first five rows of the ratings table.
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [10]:
# Movie ID to movie name mapping
movie_names = movies_df.set_index('movieId')['title'].to_dict()

# Dimensions of the (users x movies) rating matrix: distinct ids seen in the ratings table.
n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.movieId.unique())
print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_users*n_items, 'elements.')
print('----------')
print("Number of ratings:", len(ratings_df))
# Fill ratio = observed ratings / total cells of the dense matrix.
print("Therefore: ", len(ratings_df) / (n_users*n_items) * 100, '% of the matrix is filled.')
print("We have an incredibly sparse matrix to work with here.")
# The dense matrix has n_users * n_items cells, so it grows with the product of both counts.
print("And... as you can imagine, as the number of users and products grow, the number of elements will increase as n_users * n_items")
print("You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.")
print("One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data")

Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.
And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2
You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.
One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data


In [11]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Matrix-factorization rating model built from two embedding tables.

    A rating is scored as the dot product of a user's and an item's
    latent-factor vectors.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: length of every latent-factor vector.
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # Lookup tables: one learnable factor vector per user / per item.
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Overwrite the default initialisation with small positive values.
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Score a batch of (user, item) index pairs.

        Args:
            data: 2-D integer tensor; column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            1-D tensor with one score per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        # Row-wise dot product of the two looked-up factor vectors.
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Score user/item indices given as two separate arguments.

        Args:
            user: user index or indices (int, sequence or tensor).
            item: item index or indices (int, sequence or tensor).
                ``user`` and ``item`` are broadcast against each other, so
                one user can be scored against many items and vice versa.

        Returns:
            1-D tensor of scores, one per broadcast (user, item) pair.
        """
        device = self.user_factors.weight.device
        users = torch.as_tensor(user, dtype=torch.long, device=device)
        items = torch.as_tensor(item, dtype=torch.long, device=device)
        users, items = torch.broadcast_tensors(users, items)
        # forward() takes a single (n, 2) tensor, so pack the pairs into that layout.
        pairs = torch.stack((users.reshape(-1), items.reshape(-1)), dim=1)
        return self.forward(pairs)

In [12]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

# Note: This isn't 'good' practice, in a MLops sense but we'll roll with this since the data is already loaded in memory.
class Loader(Dataset):
    """Torch ``Dataset`` yielding ((user index, movie index), rating) samples.

    Raw ``userId`` / ``movieId`` values are re-encoded as contiguous
    zero-based indices, numbered in order of first appearance, so they can
    be used directly as embedding row numbers.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating``
            columns. When omitted, the notebook-level ``ratings_df`` is
            used, so a bare ``Loader()`` call keeps working.
    """

    def __init__(self, ratings=None):
        if ratings is None:
            ratings = ratings_df  # notebook-level frame
        # Work on a copy so the caller's frame is never modified.
        self.ratings = ratings.copy()

        # raw id -> contiguous index
        self.userid2idx = {raw: idx for idx, raw in enumerate(ratings['userId'].unique())}
        self.movieid2idx = {raw: idx for idx, raw in enumerate(ratings['movieId'].unique())}

        # contiguous index -> raw id (used to translate model rows back to real ids)
        self.idx2userid = {idx: raw for raw, idx in self.userid2idx.items()}
        self.idx2movieid = {idx: raw for raw, idx in self.movieid2idx.items()}

        # Replace the raw ids in the copy with their contiguous indices.
        self.ratings['movieId'] = ratings['movieId'].map(self.movieid2idx)
        self.ratings['userId'] = ratings['userId'].map(self.userid2idx)

        # Select the feature columns by name so column 0 is always the user and
        # column 1 the movie, whatever other columns the frame carries.
        features = self.ratings[['userId', 'movieId']].values
        self.x = torch.tensor(features, dtype=torch.long)
        self.y = torch.tensor(self.ratings['rating'].values)

    def __getitem__(self, index):
        """Return the ``(features, rating)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of rating rows."""
        return len(self.ratings)

In [13]:
# --- Training configuration ---
num_epochs = 128
cuda = torch.cuda.is_available()
print("Is running on GPU:", cuda)

# Build the model, then show its structure and its initial trainable weights.
model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)

# Move the model to the GPU when one is available; this happens before the
# optimizer below is handed the model's parameters.
model = model.cuda() if cuda else model

# Mean-squared-error objective, optimised with Adam.
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Training data, served in shuffled mini-batches of 128 samples.
train_set = Loader()
train_loader = DataLoader(dataset=train_set, batch_size=128, shuffle=True)

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0488, 0.0408, 0.0418,  ..., 0.0462, 0.0331, 0.0195],
        [0.0053, 0.0367, 0.0386,  ..., 0.0155, 0.0105, 0.0180],
        [0.0357, 0.0396, 0.0141,  ..., 0.0441, 0.0317, 0.0106],
        ...,
        [0.0189, 0.0187, 0.0452,  ..., 0.0271, 0.0464, 0.0267],
        [0.0401, 0.0068, 0.0366,  ..., 0.0285, 0.0253, 0.0047],
        [0.0155, 0.0417, 0.0467,  ..., 0.0211, 0.0485, 0.0239]])
item_factors.weight tensor([[0.0420, 0.0184, 0.0411,  ..., 0.0127, 0.0125, 0.0052],
        [0.0008, 0.0326, 0.0121,  ..., 0.0035, 0.0049, 0.0493],
        [0.0400, 0.0112, 0.0251,  ..., 0.0084, 0.0005, 0.0029],
        ...,
        [0.0445, 0.0107, 0.0161,  ..., 0.0381, 0.0454, 0.0191],
        [0.0159, 0.0079, 0.0308,  ..., 0.0199, 0.0199, 0.0209],
        [0.0162, 0.0230, 0.0308,  ..., 0.0491, 0.0077, 0.0432]])


In [14]:
# Train for num_epochs passes over train_loader, printing the mean batch loss per epoch.
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        # The device transfer is the only GPU-specific step; the optimisation
        # steps below must run on CPU as well, otherwise nothing is trained and
        # the epoch average divides by zero.
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        # model(x) already returns one score per row, so no squeeze is needed
        # (squeezing would collapse a size-1 batch to a scalar).
        outputs = model(x)
        loss = loss_fn(outputs, y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 11.066218980072719
iter #1 Loss: 4.749258429265869
iter #2 Loss: 2.476579010335322
iter #3 Loss: 1.7216437171558439
iter #4 Loss: 1.3459778394039512
iter #5 Loss: 1.1289005915676882
iter #6 Loss: 0.9919122158724645
iter #7 Loss: 0.9008760509000817
iter #8 Loss: 0.8375828553577365
iter #9 Loss: 0.7922175896122371
iter #10 Loss: 0.7596140323859181
iter #11 Loss: 0.7351420940027624
iter #12 Loss: 0.7159778365704614
iter #13 Loss: 0.7021359449186301
iter #14 Loss: 0.6907390734265904
iter #15 Loss: 0.6817682520462777
iter #16 Loss: 0.674828599249651
iter #17 Loss: 0.6698779625260285
iter #18 Loss: 0.665380589966544
iter #19 Loss: 0.6627944007440267
iter #20 Loss: 0.6605057429314265
iter #21 Loss: 0.6590990935863578
iter #22 Loss: 0.657449829623784
iter #23 Loss: 0.65649769368208
iter #24 Loss: 0.6558808977077455
iter #25 Loss: 0.6548779623081842
iter #26 Loss: 0.6542484811341702
iter #27 Loss: 0.653380969038167
iter #28 Loss: 0.6522510184098016
iter #29 Loss: 0.650912249829563

In [None]:
# After training, the model holds tuned latent factors for users and movies.
# Print every trainable parameter; keep the first one in `uw` and any later one in `iw`.
c, uw, iw = 0, 0, 0
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c != 0:
        iw = param.data
        continue
    uw = param.data
    c += 1

user_factors.weight tensor([[ 1.3214,  1.5825,  1.3713,  ...,  1.1841,  1.4606,  1.1374],
        [ 1.6893,  1.0929,  0.8832,  ...,  0.3918, -0.7899,  1.4821],
        [ 1.6611, -0.8462,  0.9309,  ...,  0.8558, -3.3696,  1.5857],
        ...,
        [ 0.7467, -0.0781,  2.7053,  ...,  2.0763,  0.2808,  0.8830],
        [ 1.0582,  0.9051,  0.8805,  ...,  0.5864,  0.9524,  1.3628],
        [ 1.2826,  1.0403,  0.1867,  ...,  1.4798,  0.6762,  0.9817]],
       device='cuda:0')
item_factors.weight tensor([[ 0.2436,  0.9746,  0.1323,  ...,  0.3817,  0.6255,  0.3122],
        [ 0.3879,  0.5479,  0.0877,  ..., -0.0916,  0.5248,  0.4416],
        [ 0.3732,  0.3830,  0.3910,  ...,  0.6118,  0.4117,  0.1701],
        ...,
        [ 0.3627,  0.3403,  0.3296,  ...,  0.3618,  0.3657,  0.3731],
        [ 0.4299,  0.4228,  0.3996,  ...,  0.4056,  0.3959,  0.4252],
        [ 0.3924,  0.3935,  0.4253,  ...,  0.3894,  0.4110,  0.4199]],
       device='cuda:0')


In [None]:
# Pull the learned movie factor table out of the model as a NumPy array on the CPU.
item_weights = model.item_factors.weight
trained_movie_embeddings = item_weights.detach().cpu().numpy()

In [None]:
# Number of rows in the embedding array: one factor vector per unique movie.
trained_movie_embeddings.shape[0]

9724

In [None]:
from sklearn.cluster import KMeans
# Group the movies into 10 clusters according to their learned factor vectors.
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans = kmeans.fit(trained_movie_embeddings)

In [None]:
# Movies that end up in the same cluster tend to share genres, even though the
# model never saw titles or genres -- it only saw the numbers describing how
# users rated the movies.

# Count the ratings per movie once, instead of rescanning ratings_df for every movie.
rating_counts = ratings_df['movieId'].value_counts().to_dict()

# Print the 10 most-rated titles of every cluster found by k-means.
for cluster in range(kmeans.n_clusters):
  print("Cluster #{}".format(cluster))
  movs = []
  for movidx in np.where(kmeans.labels_ == cluster)[0]:
    # Embedding row -> original MovieLens id -> (title, number of ratings).
    movid = train_set.idx2movieid[movidx]
    movs.append((movie_names[movid], rating_counts[movid]))
  for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
    print("\t", mov[0])

Cluster #0
	 Titanic (1997)
	 Breakfast Club, The (1985)
	 Donnie Darko (2001)
	 Ferris Bueller's Day Off (1986)
	 Clueless (1995)
	 Four Weddings and a Funeral (1994)
	 Casablanca (1942)
	 While You Were Sleeping (1995)
	 Requiem for a Dream (2000)
	 Big (1988)
Cluster #1
	 Independence Day (a.k.a. ID4) (1996)
	 Speed (1994)
	 Mission: Impossible (1996)
	 Mask, The (1994)
	 Mrs. Doubtfire (1993)
	 Die Hard: With a Vengeance (1995)
	 Batman Forever (1995)
	 Twister (1996)
	 Rock, The (1996)
	 Home Alone (1990)
Cluster #2
	 Godzilla (1998)
	 Super Mario Bros. (1993)
	 Joe Dirt (2001)
	 Toys (1992)
	 Honey, I Blew Up the Kid (1992)
	 Nutty Professor II: The Klumps (2000)
	 Karate Kid, Part III, The (1989)
	 Dungeons & Dragons (2000)
	 Flintstones in Viva Rock Vegas, The (2000)
	 Stop! Or My Mom Will Shoot (1992)
Cluster #3
	 Net, The (1995)
	 Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
	 Harry Potter and the Chamber of Secrets (2002)
	 W