In [None]:
# Colab-only step: mount Google Drive under /content/gdrive.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
# Load the anime metadata and the user ratings from the mounted Drive folder.
import pandas as pd
from pathlib import Path

# Single place to edit if the dataset folder moves.
DATA_DIR = Path('/content/gdrive/My Drive/Ý tưởng cho môn AI (Phúc + Thành)')

movies_df = pd.read_csv(DATA_DIR / 'anime.csv')
ratings_df = pd.read_csv(DATA_DIR / 'rating_complete.csv')

In [None]:
# Report the (rows, columns) shape of both tables.
print(
    'The dimensions of movies dataframe are:', movies_df.shape,
    '\nThe dimensions of ratings dataframe are:', ratings_df.shape,
)

The dimensions of movies dataframe are: (17562, 35) 
The dimensions of ratings dataframe are: (57633278, 3)


In [None]:
# Preview the first rows of the anime metadata table.
movies_df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,50229.0,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,...,2182.0,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,...,312.0,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0


In [None]:
# Preview the first rows of the ratings table.
ratings_df.head()

Unnamed: 0,user_id,anime_id,rating
0,0,430,9
1,0,1004,5
2,0,3010,7
3,0,570,7
4,0,2762,9


In [None]:
# Lookup table MAL_ID -> title, used later to print readable names.
movie_names = movies_df.set_index('MAL_ID')['Name'].to_dict()

# Shift user ids by one. The shift mutates ratings_df in place, so it is
# guarded: re-running this cell must not shift the ids a second time.
# NOTE(review): assumes the raw ids are non-negative and start at 0 - confirm.
if ratings_df['user_id'].min() == 0:
    ratings_df['user_id'] += 1

# Size of the (users x items) rating matrix and how sparsely it is filled.
n_users = ratings_df['user_id'].nunique()
n_items = ratings_df['anime_id'].nunique()
n_ratings = len(ratings_df)
n_cells = n_users * n_items

print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_cells, 'elements.')
print('----------')
print("Number of ratings:", n_ratings)
print("Therefore: ", n_ratings / n_cells * 100, '% of the matrix is filled.')

Number of unique users: 310059
Number of unique movies: 16872
The full rating matrix will have: 5231315448 elements.
----------
Number of ratings: 57633278
Therefore:  1.1016976241039709 % of the matrix is filled.


In [None]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Dot-product matrix factorization model.

    Every user and every item owns an ``n_factors``-dimensional embedding; the
    score of a (user, item) pair is the dot product of the two embeddings.

    Args:
        n_users: Number of rows in the user embedding table.
        n_items: Number of rows in the item embedding table.
        n_factors: Length of each embedding vector.
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # Embedding layers act as lookup tables: index -> latent vector.
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Start from small positive values in [0, 0.05).
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Score a batch of (user, item) index pairs.

        Args:
            data: Integer tensor of shape (batch, 2); column 0 holds user
                indices and column 1 holds item indices.

        Returns:
            Tensor of shape (batch,) with one dot-product score per pair.
        """
        users, items = data[:, 0], data[:, 1]
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Score user/item indices given as two separate arguments.

        ``forward`` takes a single (batch, 2) tensor, so the two arguments are
        packed into that layout here instead of being passed through as-is
        (which raised ``TypeError``).

        Args:
            user: User index or indices (int, sequence or tensor).
            item: Item index or indices (int, sequence or tensor). ``user``
                and ``item`` are broadcast against each other, so one user
                can be scored against many items.

        Returns:
            1-D tensor of scores, one per broadcast (user, item) pair.
        """
        device = self.user_factors.weight.device
        users = torch.as_tensor(user, dtype=torch.long).to(device)
        items = torch.as_tensor(item, dtype=torch.long).to(device)
        users, items = torch.broadcast_tensors(users, items)
        pairs = torch.stack((users.reshape(-1), items.reshape(-1)), dim=1)
        return self.forward(pairs)

In [None]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

# Note: This isn't good practice in an MLOps sense, but it is acceptable here because the data is already loaded in memory.
class Loader(Dataset):
    """Torch dataset of (user index, item index) -> rating samples.

    Raw user and anime ids are re-indexed to contiguous 0-based integers (in
    order of first appearance) so they can be used directly as embedding rows.

    Args:
        ratings: DataFrame with ``user_id``, ``anime_id`` and ``rating``
            columns. Defaults to the notebook-level ``ratings_df``.

    Attributes (set in ``__init__``):
        userid2idx / movieid2idx: raw id -> contiguous index.
        idx2userid / idx2movieid: contiguous index -> raw id.
        ratings: copy of the input frame with both id columns re-indexed.
        x: integer tensor of shape (n_samples, 2): [user index, item index].
        y: tensor of shape (n_samples,) holding the ratings.
    """

    def __init__(self, ratings=None):
        if ratings is None:
            # Backward-compatible default: use the frame loaded by the notebook.
            ratings = ratings_df

        # Distinct raw ids, in order of first appearance.
        users = ratings['user_id'].unique()
        movies = ratings['anime_id'].unique()

        # raw id -> contiguous index
        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.movieid2idx = {o: i for i, o in enumerate(movies)}

        # contiguous index -> raw id
        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2movieid = {i: o for o, i in self.movieid2idx.items()}

        # Re-index on a copy so the caller's frame is left untouched.
        # Series.map with a dict avoids one Python lambda call per row.
        self.ratings = ratings.copy()
        self.ratings['anime_id'] = ratings['anime_id'].map(self.movieid2idx)
        self.ratings['user_id'] = ratings['user_id'].map(self.userid2idx)

        # Select the feature columns by name: the model reads column 0 as the
        # user and column 1 as the item, so the order must not depend on how
        # the CSV happened to be laid out.
        self.x = torch.tensor(self.ratings[['user_id', 'anime_id']].values)
        self.y = torch.tensor(self.ratings['rating'].values)

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of rating samples."""
        return len(self.ratings)

In [None]:
# --- Training configuration ---------------------------------------------
num_epochs = 2
cuda = torch.cuda.is_available()
print("Is running on GPU:", cuda)

# --- Model ----------------------------------------------------------------
model = MatrixFactorization(n_users, n_items, n_factors=2)
print(model)

# Show the freshly initialised trainable weights.
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)

# Move the model to the GPU when one is available.
if cuda:
    model = model.cuda()

# --- Objective and optimizer ---------------------------------------------
loss_fn = torch.nn.MSELoss()  # mean squared error
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)  # Adam optimizer

# --- Training data --------------------------------------------------------
train_set = Loader()
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(310059, 2)
  (item_factors): Embedding(16872, 2)
)
user_factors.weight tensor([[0.0314, 0.0499],
        [0.0440, 0.0272],
        [0.0318, 0.0196],
        ...,
        [0.0008, 0.0331],
        [0.0286, 0.0224],
        [0.0053, 0.0163]])
item_factors.weight tensor([[0.0011, 0.0035],
        [0.0069, 0.0108],
        [0.0048, 0.0078],
        ...,
        [0.0442, 0.0370],
        [0.0267, 0.0252],
        [0.0393, 0.0278]])


In [None]:
# Display the DataLoader object (sanity check that it was created).
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7ff3ba5e5660>

In [None]:
# Train for num_epochs passes over the data and report the mean batch loss.
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        # Only the device transfer depends on CUDA; the optimisation step below
        # must run on CPU as well (otherwise nothing trains and `losses` stays
        # empty, making the average below divide by zero).
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        # The model already returns a 1-D (batch,) tensor, so no squeeze is
        # needed; squeezing would turn a batch of one into a 0-d tensor that no
        # longer matches the target's shape.
        outputs = model(x)
        loss = loss_fn(outputs, y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/2 [00:00<?, ?it/s]

iter #0 Loss: 5.328684061229936
iter #1 Loss: 1.8180454130634836


In [None]:
# After training, the embedding tables hold the tuned latent factors.
# Print every trainable parameter; keep the first one in `uw` and the most
# recent of the remaining ones in `iw`.
c, uw, iw = 0, 0, 0
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c:
        iw = param.data
    else:
        uw, c = param.data, c + 1

user_factors.weight tensor([[0.6887, 0.7088],
        [0.8120, 0.7950],
        [0.8489, 0.8368],
        ...,
        [0.8446, 0.8768],
        [0.9536, 0.9459],
        [0.8689, 0.8798]], device='cuda:0')
item_factors.weight tensor([[4.6391, 4.6156],
        [4.5724, 4.6352],
        [4.3698, 4.3770],
        ...,
        [0.1078, 0.1006],
        [0.0902, 0.0887],
        [0.1028, 0.0913]], device='cuda:0')


In [None]:
# Copy the learned item embeddings to host memory as a NumPy array.
trained_movie_embeddings = model.item_factors.weight.detach().cpu().numpy()

# Number of rows = number of unique movies that received an embedding.
trained_movie_embeddings.shape[0]

16872

In [None]:
from sklearn.cluster import KMeans

# Group the movies by their learned embedding vectors.
kmeans = KMeans(n_clusters=422, random_state=0)
kmeans.fit(trained_movie_embeddings);



In [None]:
# It can be seen here that the movies that are in the same cluster tend to have
# similar genres. Also note that the algorithm is unfamiliar with the movie name
# and only obtained the relationships by looking at the numbers representing how
# users have responded to the movie selections.

# Count the ratings per anime once, instead of filtering the full ratings frame
# again for every single movie.
rating_counts = ratings_df['anime_id'].value_counts()

for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    movs = []
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        # Embedding row -> original anime id -> (title, number of ratings).
        movid = train_set.idx2movieid[movidx]
        movs.append((movie_names[movid], rating_counts[movid]))
    # Show the eight most-rated titles of the cluster.
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:8]:
        print("\t", mov[0])

Cluster #0
	 Abunai Sisters: Koko & Mika
	 Hi Gekiga Ukiyoe Senya Ichiya
	 Son O-gong gwa Byeoldeul-ui Jeonjaeng
	 Shuugou no Kangae
	 Din Dong
	 Mouretsu 50 Ototarou
	 Nekoronde Mite ne.
	 Mechakko Dotakon
Cluster #1
	 Coppelion
	 Pokemon Best Wishes! Season 2
	 Ikkitousen: Dragon Destiny Specials
	 Yarichin☆Bitch-bu
	 Otome wa Boku ni Koishiteru Special
	 Amaenaide yo!!: Yasumanaide yo!!
	 Sexfriend
	 D.C.III: Da Capo III
Cluster #2
	 Issho ni Sleeping: Sleeping with Hinako
	 Hanoka
	 Fashion
	 Nude Batter Tetsuo
	 Sign
	 Commercial War
	 Mukidashi no Mitsuko
	 Ame no Bus Stop-hen
Cluster #3
	 Yahari Ore no Seishun Love Comedy wa Machigatteiru.
	 Dragon Ball
	 Nanatsu no Taizai: Imashime no Fukkatsu
	 Akagami no Shirayuki-hime 2nd Season
	 White Album 2
	 Love Live! The School Idol Movie
	 Flip Flappers
	 One Piece Film: Gold
Cluster #4
	 Isekai Cheat Magician
	 Omiai Aite wa Oshiego, Tsuyoki na, Mondaiji.
	 Wagamama High Spec
	 Shounen Maid Kuuro-kun: Tenshi no Uta
	 Jingai-san no Y