<a href="https://colab.research.google.com/github/Olavo-B/node2vec_MALdataset/blob/main/Node2vec__MAL_Datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Using node2vec for anime recommendations with My Anime List database**

**Authors**: Olavo Barros and Caio Von Rondow







## Preparing environment

In [None]:
import os
from collections import defaultdict
import math
import networkx as nx
import random
from tqdm import tqdm
from zipfile import ZipFile
from urllib.request import urlretrieve
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from community import community_louvain

## Getting and preparing data

Datasets used: [Anime Recommendation Database 2020](https://www.kaggle.com/hernan4444/anime-recommendation-database-2020)

### Downloading the dataset from Kaggle

In [None]:
!mkdir ~/.kaggle #create the .kaggle folder in your root directory
!echo '{"username":"olavoalvesbarros","key":"e1bf9c7933835b43262deffc6575bf85"}' > ~/.kaggle/kaggle.json #write kaggle API credentials to kaggle.json
!chmod 600 ~/.kaggle/kaggle.json  # set permissions
!pip install kaggle #install the kaggle library



In [None]:
# Download the dataset archive with the Kaggle CLI (uses the credentials stored in ~/.kaggle/kaggle.json).
# NOTE(review): the output saved with this cell is a KeyError('content-length') traceback raised
# inside the kaggle package, so the download did not complete in the recorded run - TODO confirm
# with a current kaggle version.
!kaggle datasets download -d hernan4444/anime-recommendation-database-2020

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.10/dist-packages/kaggle/cli.py", line 70, in main
    out = args.func(**command_args)
  File "/usr/local/lib/python3.10/dist-packages/kaggle/api/kaggle_api_extended.py", line 1493, in dataset_download_cli
    self.dataset_download_files(dataset,
  File "/usr/local/lib/python3.10/dist-packages/kaggle/api/kaggle_api_extended.py", line 1447, in dataset_download_files
    self.download_file(response, outfile, quiet, not force)
  File "/usr/local/lib/python3.10/dist-packages/kaggle/api/kaggle_api_extended.py", line 1834, in download_file
    size = int(response.headers['Content-Length'])
  File "/usr/local/lib/python3.10/dist-packages/urllib3/_collections.py", line 258, in __getitem__
    val = self._container[key.lower()]
KeyError: 'content-length'


In [None]:
# Extract the two CSV files used by this notebook from the downloaded archive.
#  - rating_complete.csv: only considers animes that the user has watched
#    completely (watching_status==2) and gave a score (score!=0).
#  - anime.csv: general information of every anime (17.562 different anime)
#    like genre, stats, studio, etc.
# The archive is opened once, inside a `with` block, so the file handle is
# closed even if an extraction fails.
with ZipFile('/content/anime-recommendation-database-2020.zip', 'r') as archive:
    for member in ('rating_complete.csv', 'anime.csv'):
        archive.extract(member)

BadZipFile: ignored

### Creating dataframes with exported data

In [None]:
# Load the extracted CSV files: anime metadata first, user ratings second.
animes, ratings = (
    pd.read_csv(csv_name) for csv_name in ('anime.csv', 'rating_complete.csv')
)

In [None]:
# Show the ratings dataframe as the cell output.
ratings

### Treating data

In [None]:
# Exclude the genre we are not interested in from the ratings.
# The `Genres` field may list several genres for one anime, so "Hentai" is
# matched as a substring: the previous equality test only removed the animes
# whose whole `Genres` value was exactly 'Hentai'.
excluded_animes = animes[animes.Genres.str.contains('Hentai', na=False)]
ratings = ratings[~ratings.anime_id.isin(excluded_animes.MAL_ID)]

# Count how many animes each user rated and keep only the users with exactly
# RATINGS_PER_USER ratings (185, the mean number of ratings per user).
# Done due to the high number of users and of animes rated by them (the most
# active user has 15k ratings).
RATINGS_PER_USER = 185
ratings_per_user = ratings['user_id'].value_counts()
selected_users = ratings_per_user[ratings_per_user == RATINGS_PER_USER].index
ratings = ratings[ratings.user_id.isin(selected_users)]

In [None]:
# Report the (rows, columns) shape of both dataframes.
print(f'Animes data shape:  {animes.shape}')
print(f'Rating data shape:  {ratings.shape}')

In [None]:
# Access functions

def getAnimeName(anime_id):
    """Return the name of the anime whose MAL_ID equals `anime_id`.

    Looks the id up in the module-level `animes` dataframe and returns the
    first matching `Name`. Raises IndexError when no row matches.
    """
    matching_names = animes.loc[animes.MAL_ID == anime_id, "Name"]
    return matching_names.tolist()[0]
def getAnimeId(anime_name):
    """Return the MAL_ID of the anime whose Name equals `anime_name`.

    Looks the name up in the module-level `animes` dataframe and returns the
    first matching `MAL_ID`. Raises IndexError when no row matches.
    """
    matching_ids = animes.loc[animes.Name == anime_name, "MAL_ID"]
    return matching_ids.tolist()[0]

## Creating graph

- Nodes: animes with rating `>= min_rating`
- Edges: anime `x` and anime `y` are linked when the same users rated both of them

### Defining edge weights and nodes

In [None]:
# Ratings below this value are ignored when building the graph
# (alternative considered: int(animes.Score.mean())).
min_rating = 6
# item_frequency[a]      -> how many times anime `a` was rated
# pair_frequency[(x, y)] -> how many times animes x and y (x <= y) were rated
#                           by the same user
item_frequency = defaultdict(int)
pair_frequency = defaultdict(int)

# Keep only the ratings that reach min_rating, then split them per user.
rated_animes = ratings[ratings.rating >= min_rating]
animes_grouped_by_users = list(rated_animes.groupby("user_id"))
for _, user_ratings in tqdm(
    animes_grouped_by_users,
    position=0,
    leave=True,
    desc="Compute anime rating frequencies",
):
    # Animes rated by the current user.
    current_animes = list(user_ratings["anime_id"])

    for position, first in enumerate(current_animes):
        item_frequency[first] += 1
        # Pair the anime with every anime that comes after it in the list;
        # the key is ordered so (a, b) and (b, a) share one counter.
        for second in current_animes[position + 1:]:
            key = (first, second) if first <= second else (second, first)
            pair_frequency[key] += 1

The edge weights are based on [pointwise mutual information](https://en.wikipedia.org/wiki/Pointwise_mutual_information), which is defined as:


\begin{align}
        \mathbf{pmi} (\mathbf{x;y}) &\equiv \log_{2}\left(\frac{p(x,y)}{p(x)p(y)}\right) \\
        &\equiv \log_{2}p(x,y) - ( \log_{2}p(x) + \log_{2}p(y) ) \\
        &\equiv \log_{2}\left(\frac{xy}{D}\right) - ( \log_{2}\left(\frac{x}{D}\right) + \log_{2}\left(\frac{y}{D}\right) ) \\
        &\equiv \log_{2}(xy) - \log_{2}x - \log_{2}y + \log_{2}D
    \end{align}

Where:

  * `xy` is how many users rated both anime `x` and anime `y` with >= `min_rating`.
  * `x` is how many users rated anime `x` >= `min_rating`.
  * `y` is how many users rated anime `y` >= `min_rating`.
  * `D` is the total number of anime ratings >= `min_rating`.


`min_weight` is used to reduce the node degrees: an edge is created only if its weight is `>= min_weight`.


In [None]:
# Edges whose weight is below this threshold are not added to the graph.
min_weight = 20
# D holds the logarithm of the total number of counted ratings
# (the "+ log D" term of the PMI).
D = math.log(sum(item_frequency.values()))

# Undirected graph of animes; add_edge creates the endpoint nodes as needed.
animes_graph = nx.Graph()
for (x, y), xy_frequency in tqdm(
    pair_frequency.items(), position=0, leave=True, desc="Creating the movie graph"
):
    x_frequency = item_frequency[x]
    y_frequency = item_frequency[y]
    pmi = math.log(xy_frequency) - math.log(x_frequency) - math.log(y_frequency) + D
    # The edge weight is the PMI scaled by the co-rating count.
    weight = pmi * xy_frequency
    if weight < min_weight:
        continue
    animes_graph.add_edge(x, y, weight=weight)

In [None]:
# Size of the graph built in the previous cell.
print(f"Total number of graph nodes: {animes_graph.number_of_nodes()}")
print(f"Total number of graph edges: {animes_graph.number_of_edges()}")

In [None]:
# Degree of every node, then the mean degree rounded to two decimals.
degrees = [animes_graph.degree[node] for node in animes_graph.nodes]
average_degree = sum(degrees) / len(degrees)

print("Average node degree:", round(average_degree, 2))

### Creating a vocabulary

In [None]:
# Token 0 is the "NA" placeholder; the graph nodes follow in node order.
vocabulary = ["NA", *animes_graph.nodes]
# Reverse mapping: token (anime id) -> position in `vocabulary`.
vocabulary_lookup = dict(zip(vocabulary, range(len(vocabulary))))

## Random walk

![Example of bias factor for return (red) and in-out (yellow) edges. (Image by linkedin.com/in/remy-liu-a24780213/)](https://miro.medium.com/max/700/1*hdhB2HLkkA9toVjRVbScBw.png)

Each walk can be understood as a sentence, by analogy with Word2Vec.

In [None]:
def next_step(graph, previous, current, p, q):
    """Randomly pick the node to visit after `current` in a biased walk.

    Every neighbor of `current` gets the weight of its edge to `current`,
    adjusted as follows:
      * divided by `p` when the neighbor is `previous` (going back);
      * unchanged when the neighbor is connected to `previous`;
      * divided by `q` otherwise (moving away).
    The adjusted weights are normalized into probabilities and one neighbor
    is drawn with `np.random.choice`.

    Args:
        graph: graph exposing `neighbors`, `has_edge` and
            `graph[u][v]["weight"]`.
        previous: node visited before `current`, or None on the first step.
        current: node the walk is currently on.
        p: return parameter.
        q: in-out parameter.

    Returns:
        The selected neighbor of `current`.
    """
    candidates = list(graph.neighbors(current))

    def biased_weight(candidate):
        edge_weight = graph[current][candidate]["weight"]
        if candidate == previous:
            return edge_weight / p
        if graph.has_edge(candidate, previous):
            return edge_weight
        return edge_weight / q

    biased_weights = [biased_weight(candidate) for candidate in candidates]
    total = sum(biased_weights)
    probabilities = [weight / total for weight in biased_weights]
    chosen = np.random.choice(candidates, size=1, p=probabilities)[0]
    return chosen


def random_walk(graph, num_walks, num_steps, p, q):
    """Generate biased random walks starting from every node of `graph`.

    For each of the `num_walks` iterations the nodes are shuffled and one
    walk of `num_steps` nodes is started from each of them; every step is
    chosen by `next_step`. Node ids are converted to token ids with the
    module-level `vocabulary_lookup` before the walk is stored.

    Args:
        graph: graph to walk on.
        num_walks: number of iterations over all the nodes.
        num_steps: number of nodes in each walk.
        p: return parameter forwarded to `next_step`.
        q: in-out parameter forwarded to `next_step`.

    Returns:
        A list of walks, each one a list of token ids.
    """
    all_walks = []
    start_nodes = list(graph.nodes())
    for iteration in range(1, num_walks + 1):
        random.shuffle(start_nodes)

        progress = tqdm(
            start_nodes,
            position=0,
            leave=True,
            desc=f"Random walks iteration {iteration} of {num_walks}",
        )
        for start in progress:
            path = [start]
            # No previous node exists before the first step.
            previous = None
            for _ in range(num_steps - 1):
                current = path[-1]
                path.append(next_step(graph, previous, current, p, q))
                previous = current
            # Store the walk as token ids instead of node ids.
            all_walks.append([vocabulary_lookup[token] for token in path])

    return all_walks


In [None]:
# Seed both random generators used by the walks (random.shuffle in
# random_walk and np.random.choice in next_step) so that re-running the
# notebook generates the same walks.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Random walk return parameter.
p = 0.5
# Random walk in-out parameter.
# Small q value -> getting out the neighborhood
# Large q value -> getting in the neighborhood
q = 3
# Number of iterations of random walks.
num_walks = 5
# Number of steps of each random walk.
num_steps = 10
walks = random_walk(animes_graph, num_walks, num_steps, p, q)

print("Number of walks generated:", len(walks))

## Skip-Gram

![Resumo do Skip-Gram](http://mccormickml.com/assets/word2vec/skip_gram_net_arch.png)

The main goal of building a skip-gram model is to **learn the hidden layer (the node embedding)**, which encodes the context of each node of a given graph G. This makes it possible to measure the similarity between nodes, since similar nodes have similar contexts (just as synonymous words do in Word2Vec).

### Creating the model

In [None]:
def generate_examples(sequences, window_size, num_negative_samples, vocabulary_size):
    """Build weighted skip-gram training examples from the walks.

    Every sequence is passed to `keras.preprocessing.sequence.skipgrams`,
    which returns (pair, label) examples; label 1 marks a positive pair and
    0 a negative one. Each pair is stored with its smaller token first,
    pairs of a token with itself are dropped, and identical
    (target, context, label) entries are merged while counting how many
    times they occurred.
    For more: https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/skipgrams

    Args:
        sequences: iterable of walks (lists of token ids).
        window_size: forwarded to `skipgrams`.
        num_negative_samples: forwarded to `skipgrams` as `negative_samples`.
        vocabulary_size: forwarded to `skipgrams`.

    Returns:
        Four aligned numpy arrays: targets, contexts, labels and weights,
        where the weight is the number of occurrences of the example.
    """
    occurrence_counts = defaultdict(int)
    for walk in tqdm(
        sequences,
        position=0,
        leave=True,
        desc="Generating postive and negative examples",
    ):
        pairs, pair_labels = keras.preprocessing.sequence.skipgrams(
            walk,
            vocabulary_size=vocabulary_size,
            window_size=window_size,
            negative_samples=num_negative_samples,
        )
        for (first, second), pair_label in zip(pairs, pair_labels):
            if first == second:
                continue
            # Order the pair so (a, b) and (b, a) are counted together.
            low, high = (first, second) if first < second else (second, first)
            occurrence_counts[(low, high, pair_label)] += 1

    examples = list(occurrence_counts.items())
    targets = np.array([entry[0] for entry, _ in examples])
    contexts = np.array([entry[1] for entry, _ in examples])
    labels = np.array([entry[2] for entry, _ in examples])
    weights = np.array([count for _, count in examples])
    return targets, contexts, labels, weights


# num_negative_samples -> number of negative samples generated per positive
# sample (forwarded to skipgrams as `negative_samples`).
num_negative_samples = 4
# The window size is set to num_steps, the number of steps of each walk.
targets, contexts, labels, weights = generate_examples(
    sequences=walks,
    window_size=num_steps,
    num_negative_samples=num_negative_samples,
    vocabulary_size=len(vocabulary),
)

In [None]:
# Shapes of the four aligned example arrays.
for array_name, array in (
    ("Targets", targets),
    ("Contexts", contexts),
    ("Labels", labels),
    ("Weights", weights),
):
    print(f"{array_name} shape: {array.shape}")

In [None]:
# Number of examples in each training batch.
# For background on batch size: https://stats.stackexchange.com/questions/153531/what-is-batch-size-in-neural-network
batch_size = 1024


def create_dataset(targets, contexts, labels, weights, batch_size):
    """Wrap the training examples in a `tf.data.Dataset` for the model.

    Each element is a tuple `(inputs, label, weight)` where `inputs` is a
    dict with the keys "target" and "context". The dataset is shuffled with
    a buffer of `2 * batch_size` elements, split into batches of
    `batch_size` (the last incomplete batch is dropped) and prefetched.

    Args:
        targets: target token ids.
        contexts: context token ids.
        labels: labels of the examples.
        weights: weights of the examples.
        batch_size: number of examples per batch.

    Returns:
        The batched `tf.data.Dataset`.
    """
    features = {"target": targets, "context": contexts}
    return (
        tf.data.Dataset.from_tensor_slices((features, labels, weights))
        .shuffle(buffer_size=batch_size * 2)
        .batch(batch_size, drop_remainder=True)
        .prefetch(tf.data.AUTOTUNE)
    )


# Build the batched training dataset from the generated skip-gram examples.
dataset = create_dataset(
    targets=targets,
    contexts=contexts,
    labels=labels,
    weights=weights,
    batch_size=batch_size,
)

In [None]:
# Training hyperparameters.
learning_rate = 0.0001  # passed to the Adam optimizer
embedding_dim = 300  # size of each embedding vector (Embedding output_dim)
num_epochs = 15  # number of epochs passed to model.fit

In [None]:
def create_model(vocabulary_size, embedding_dim):
    """Build the skip-gram model.

    The model takes two scalar int32 inputs, "target" and "context", looks
    both up in one shared embedding layer named "item_embeddings" and
    outputs their (non-normalized) dot product as a logit.

    Args:
        vocabulary_size: number of rows of the embedding matrix.
        embedding_dim: size of each embedding vector.

    Returns:
        The (uncompiled) `keras.Model`.
    """
    inputs = {
        input_name: layers.Input(name=input_name, shape=(), dtype="int32")
        for input_name in ("target", "context")
    }
    # One embedding table shared by targets and contexts.
    embed_item = layers.Embedding(
        input_dim=vocabulary_size,
        output_dim=embedding_dim,
        embeddings_initializer="he_normal",
        embeddings_regularizer=keras.regularizers.l2(1e-6),
        name="item_embeddings",
    )
    target_embeddings = embed_item(inputs["target"])
    context_embeddings = embed_item(inputs["context"])
    # Dot similarity between the target and the context embeddings.
    dot_similarity = layers.Dot(axes=1, normalize=False, name="dot_similarity")
    logits = dot_similarity([target_embeddings, context_embeddings])
    return keras.Model(inputs=inputs, outputs=logits)

### Training the model:

In [None]:
# Build the model with one embedding row per vocabulary token and compile it
# with Adam. The model outputs the raw dot product (a logit), hence
# from_logits=True in the binary cross-entropy loss.
model = create_model(len(vocabulary), embedding_dim)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate),
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
)

In [None]:
# Draw the model graph, including tensor shapes, dtypes and layer names.
keras.utils.plot_model(
    model, show_shapes=True, show_dtype=True, show_layer_names=True,
)

In [None]:
# Train the model for num_epochs epochs; `history` keeps the loss of each epoch.
history = model.fit(dataset, epochs=num_epochs)

In [None]:
# Training loss of each epoch.
fig, ax = plt.subplots()
ax.plot(history.history["loss"])
ax.set_ylabel("loss")
ax.set_xlabel("epoch")
plt.show()

In [None]:
# Getting the hidden layer of the skip-gram model: the weight matrix of the
# "item_embeddings" layer, with one row per vocabulary token (row 0 belongs
# to the "NA" placeholder).
movie_embeddings = model.get_layer("item_embeddings").get_weights()[0]
print("Embeddings shape:", movie_embeddings.shape)

## Visualization

In [None]:
import io

# Export the embedding of every anime (embeddings.tsv) and its title
# (metadata.tsv), one anime per line and in the same order in both files.
# Token 0 of the vocabulary is the "NA" placeholder, so the anime stored at
# vocabulary[token_id] owns embedding row token_id. Enumerating from 1 keeps
# vectors and titles aligned; starting at 0 wrote every title next to the
# vector of the previous token.
# The `with` block closes both files even if a lookup or a write fails.
with io.open("embeddings.tsv", "w", encoding="utf-8") as out_v, io.open(
    "metadata.tsv", "w", encoding="utf-8"
) as out_m:
    for token_id, anime_id in enumerate(vocabulary[1:], start=1):
        movie_title = getAnimeName(anime_id)
        vector = movie_embeddings[token_id]
        out_v.write("\t".join(str(x) for x in vector) + "\n")
        out_m.write(movie_title + "\n")

In [None]:
# Titles used as queries. getAnimeId looks each title up by exact equality
# with animes.Name, and the anime must be in the vocabulary (i.e. a node of
# the graph) for its embedding to be found.
query_movies = [
    "Gosick",
    "K-On!",
    "Kimi no Na wa.",
    "Sword Art Online",
    "Dragon Ball Z",
]

In [None]:
# Embedding of every query title: title -> anime id -> token id -> vector.
query_embeddings = np.array(
    [
        movie_embeddings[vocabulary_lookup[getAnimeId(movie_title)]]
        for movie_title in query_movies
    ]
)
# Length of one embedding vector, shown as the cell output.
len(query_embeddings[-1])

## Predictions and similarity values of the top k = 10

In [None]:
# Cosine similarity between every query embedding and every embedding:
# each ROW must be scaled to unit length, hence axis=1. Without `axis`,
# l2_normalize divides the whole matrix by one global norm, so the product
# is only a rescaled dot product and favours embeddings with a large norm.
similarities = tf.linalg.matmul(
    tf.math.l2_normalize(query_embeddings, axis=1),
    tf.math.l2_normalize(movie_embeddings, axis=1),
    transpose_b=True,
)

# For every query: the 10 highest similarities and the token ids they belong to.
values, indices = tf.math.top_k(similarities, k=10)
indices = indices.numpy().tolist()

values

In [None]:
# Print, under every query title, the titles of its most similar animes.
for idx, title in enumerate(query_movies):
    print(title)
    print("-" * len(title))
    for token in indices[idx]:
        # Token 0 is the "NA" placeholder of the vocabulary, not an anime:
        # getAnimeName would raise IndexError for it, so it is skipped.
        if token == 0:
            continue
        similar_title = getAnimeName(vocabulary[token])
        print(f"- {similar_title}")
    print()

## **Source**

https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/graph/ipynb/node2vec_movielens.ipynb#scrollTo=GLh4uGvnOKSN