# Graph embeddings and recommendation

This notebook uses graph embedding techniques to:

- Represent movies, actors, genres, etc. as vectors
- Use those vectors for similarity-based movie recommendation
- Optionally visualize or cluster the embedded nodes

In [10]:
import networkx as nx
import pandas as pd
import numpy as np
from karateclub import Node2Vec
from sklearn.metrics.pairwise import cosine_similarity
import random


In [11]:
# Read the full movie knowledge graph from GraphML.
G = nx.read_graphml("../data/movie_knowledge_graph.graphml")

# Keep only the (source, target) pairs: edge attributes are not copied, and
# repeated edges between the same pair collapse into one in a plain DiGraph.
G_simple = nx.DiGraph()
G_simple.add_edges_from(G.edges())



In [12]:
# Drop edge direction. The arguments are the defaults written out: keep an
# undirected edge for every directed one, and return an independent copy
# rather than a view.
G_undirected = G_simple.to_undirected(reciprocal=False, as_view=False)




In [13]:
# Sample up to 5,000 nodes and keep the subgraph they induce.
# A dedicated, seeded generator makes the sample (and therefore everything
# derived from it below) repeatable across kernel restarts, without touching
# the global `random` state. The min() guard avoids the ValueError that
# random.sample raises when the graph has fewer nodes than requested.
SAMPLE_SIZE = 5000
SAMPLE_SEED = 42

all_nodes = list(G_undirected.nodes())
sample_nodes = random.Random(SAMPLE_SEED).sample(all_nodes, min(SAMPLE_SIZE, len(all_nodes)))
G_sub = G_undirected.subgraph(sample_nodes).copy()


In [14]:
# Build a two-way mapping between the original node labels and the integer
# ids 0..N-1, then rewrite the sampled graph to use the integer ids.

id2node = dict(enumerate(G_sub.nodes()))
node2id = {label: idx for idx, label in id2node.items()}
G_sub_mapped = nx.relabel_nodes(G_sub, node2id)


In [15]:
# Fit Node2Vec on the integer-labelled sample.
# Hyperparameters are gathered in one place so they are easy to tune.

node2vec_params = dict(
    dimensions=32,
    walk_number=3,
    walk_length=20,
    workers=1,
)
model = Node2Vec(**node2vec_params)
model.fit(G_sub_mapped)


In [16]:
# Map each original node label to its embedding vector.
# Row `idx` of the embedding matrix is stored under the label that was
# relabelled to integer id `idx`.

embedding_array = model.get_embedding()
node_count = G_sub_mapped.number_of_nodes()
embedding_dict = {id2node[idx]: embedding_array[idx] for idx in range(node_count)}


In [17]:
#Define the recommender function

def get_similar_movies(movie_title, top_n=5):
    """Return the embedded movies most similar to the given movie.

    The title is matched case-insensitively against the ``title`` attribute of
    nodes in ``G`` whose ``type`` attribute is ``'movie'``; the first match
    wins. Candidates are all other movie nodes present in ``embedding_dict``,
    ranked by cosine similarity to the query movie's vector.

    Parameters
    ----------
    movie_title : str
        Title of the movie to look up.
    top_n : int, default 5
        Number of rows to keep after sorting.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` and ``similarity``, sorted by descending similarity.
        An empty DataFrame is returned (after printing a message) when the
        movie is not in the graph, is not in the embedding sample, or when no
        other embedded movie exists to compare against.
    """
    wanted = movie_title.lower()

    # Find movie node in original graph
    movie_node = None
    for node, attrs in G.nodes(data=True):
        # str(... or '') tolerates a missing or non-string title attribute
        if attrs.get('type') == 'movie' and str(attrs.get('title') or '').lower() == wanted:
            movie_node = node
            break

    # Compare against None explicitly so that a falsy node id is not mistaken
    # for "not found".
    if movie_node is None or movie_node not in embedding_dict:
        print("Movie not found or not in embedding sample.")
        return pd.DataFrame()

    other_movies = [
        n for n in embedding_dict
        if n != movie_node and G.nodes[n].get('type') == 'movie'
    ]
    if not other_movies:
        # An empty candidate list would make cosine_similarity raise.
        print("No other movies in embedding sample to compare against.")
        return pd.DataFrame()

    movie_vec = embedding_dict[movie_node].reshape(1, -1)
    other_vecs = [embedding_dict[n] for n in other_movies]
    sim_scores = cosine_similarity(movie_vec, other_vecs).flatten()

    df = pd.DataFrame({
        'node': other_movies,
        'similarity': sim_scores
    })
    df['title'] = df['node'].apply(lambda n: G.nodes[n].get('title'))
    return df.sort_values(by='similarity', ascending=False).head(top_n)[['title', 'similarity']]


In [18]:
# Try the recommender on a known title

get_similar_movies(movie_title="Inception", top_n=5)


Unnamed: 0,title,similarity
1016,Under Siege 2: Dark Territory,0.740849
235,The Walk,0.735049
1660,Blue Crush,0.70196
1150,Memento,0.694863
1567,Niagara,0.678289


In [20]:
# Visualize embeddings with TSNE

from sklearn.manifold import TSNE
import plotly.express as px

# Collect every embedded movie node with its vector and a hover label.
movie_nodes = [n for n in embedding_dict if G.nodes[n].get('type') == 'movie']
movie_vecs = [embedding_dict[n] for n in movie_nodes]
# Fall back to the node id when a movie has no title instead of raising
# KeyError; this matches the .get() lookups used by the recommender.
movie_labels = [G.nodes[n].get('title', n) for n in movie_nodes]

# Project the embedding vectors to 2-D; random_state fixes the t-SNE layout.
embedding_2d = TSNE(n_components=2, random_state=42).fit_transform(np.array(movie_vecs))
df_vis = pd.DataFrame(embedding_2d, columns=["x", "y"])
df_vis["title"] = movie_labels

px.scatter(df_vis, x="x", y="y", hover_name="title", title="Movie Embedding Space").show()


In [27]:
import pickle
import os

# Resolve the output location to an absolute path (relative to the current
# working directory) and make sure its parent folder exists.
save_path = os.path.abspath("../data/embedding_dict.pkl")
target_dir = os.path.dirname(save_path)
os.makedirs(target_dir, exist_ok=True)

# Serialize the label -> vector mapping with pickle.
with open(save_path, "wb") as sink:
    pickle.dump(embedding_dict, sink)

print(f"embedding_dict.pkl saved to {save_path}")


embedding_dict.pkl saved to /Users/sirishapadmasekhar/Desktop/tmdb-recommender-graphml/data/embedding_dict.pkl


In [4]:
import networkx as nx
# Reload the knowledge graph from the same GraphML file read at the top of
# the notebook.
graphml_path = "../data/movie_knowledge_graph.graphml"
G = nx.read_graphml(graphml_path)


In [6]:
# Ids of every node in the full graph whose "type" attribute is "movie".
movie_nodes = [node_id for node_id in G.nodes if G.nodes[node_id].get("type") == "movie"]


In [7]:
from urllib.parse import quote

from rdflib import Graph, URIRef, Literal, Namespace, RDF

# Export a small sample of movie nodes as RDF triples: each movie is typed as
# ex:Movie and, when the attributes are present, gets ex:title and
# ex:directedBy literals.
rdf = Graph()
EX = Namespace("http://example.org/")

# Characters left unescaped in the node id. They are all legal inside an IRI
# path, so ids that were already valid keep exactly the same URI as before.
URI_SAFE_CHARS = "/:@!$&'()*+,;="

for node in movie_nodes[:100]:  # Sample 100 for simplicity
    node_data = G.nodes[node]
    title = node_data.get("title")
    director = node_data.get("director")

    # Percent-encode the node id: characters such as spaces, quotes or braces
    # are not legal in an IRI and make rdflib refuse to serialize the term
    # as Turtle.
    node_slug = quote(str(node), safe=URI_SAFE_CHARS)
    movie_uri = URIRef(f"http://example.org/movie/{node_slug}")
    rdf.add((movie_uri, RDF.type, EX.Movie))

    if title:
        rdf.add((movie_uri, EX.title, Literal(title)))
    if director:
        rdf.add((movie_uri, EX.directedBy, Literal(director)))

# Save as RDF Turtle format
rdf.serialize(destination="../data/sample_movie_graph.ttl", format="turtle")


<Graph identifier=N3da9686a83a6468c86a85ca2abb427ca (<class 'rdflib.graph.Graph'>)>