# 03 - Knowledge Graph Building

In this notebook, we convert the TMDB movie dataset into a structured knowledge graph using NetworkX.


In [1]:
import ast

import networkx as nx
import pandas as pd

In [2]:
# Load cleaned dataset
df = pd.read_csv('../data/cleaned_tmdb_movies.csv')


def _parse_list_cell(value):
    """Turn one CSV cell holding a stringified Python list into a real list.

    `ast.literal_eval` only accepts Python literals, so unlike `eval` it
    cannot execute code embedded in the data file.

    Parameters
    ----------
    value : str, list, tuple, or a missing value (e.g. NaN)
        The raw cell as read by pandas.

    Returns
    -------
    list
        The parsed list. A cell that is already a list/tuple is returned as a
        list, and any other non-string value (a missing cell) becomes an empty
        list so downstream loops simply skip it.

    Raises
    ------
    ValueError, SyntaxError
        If a string cell is not a valid Python literal.
    """
    if isinstance(value, str):
        return ast.literal_eval(value)
    if isinstance(value, (list, tuple)):
        return list(value)
    return []


# The list-valued columns are stored as text in the CSV; parse them back.
for list_column in ('genre_names', 'top_actors'):
    df[list_column] = df[list_column].apply(_parse_list_cell)


In [3]:
# Create the (initially empty) knowledge graph that the following cells populate.

G = nx.MultiDiGraph()  # Directed graph; this class allows parallel edges between the same pair of nodes


In [4]:
# Add Nodes and Edges
#
# Edge directions created by this cell:
#   movie    --has_genre--> genre
#   actor    --acted_in-->  movie
#   director --directed-->  movie


def _link_once(graph, source, target, relation):
    """Add a `relation` edge from `source` to `target` unless one already exists.

    `G` is a MultiDiGraph, so an unconditional `add_edge` appends a parallel
    duplicate every time this cell is re-run (or when a row is repeated).
    Genre, actor and director node ids carry distinct prefixes, so every
    (source, target) pair built in this cell has exactly one relation type
    and skipping an existing edge loses no information.
    """
    if not graph.has_edge(source, target):
        graph.add_edge(source, target, relation=relation)


for _, row in df.iterrows():
    movie_id = row['id']
    title = row['title_x']

    # Add movie node (re-adding an existing node only refreshes its attributes)
    G.add_node(movie_id, type='movie', title=title)

    # Add genres
    for genre in row['genre_names']:
        genre_id = f"genre_{genre}"
        G.add_node(genre_id, type='genre', name=genre)
        _link_once(G, movie_id, genre_id, 'has_genre')

    # Add actors
    for actor in row['top_actors']:
        actor_id = f"actor_{actor}"
        G.add_node(actor_id, type='actor', name=actor)
        _link_once(G, actor_id, movie_id, 'acted_in')

    # Add director
    director = row['director']
    # A missing director is read from the CSV as NaN, which is truthy, so a
    # plain `if director:` would create a bogus "director_nan" node.
    if pd.notna(director) and str(director).strip():
        director_id = f"director_{director}"
        G.add_node(director_id, type='director', name=director)
        _link_once(G, director_id, movie_id, 'directed')


In [5]:
# Save the Graph: write G to disk in GraphML format under ../data.

nx.write_graphml(G, "../data/movie_knowledge_graph.graphml")


# Visualization 

In [42]:
import pandas as pd
import networkx as nx
import plotly.graph_objects as go
import numpy as np

# --- Target Director ---
target_director = "Christopher Nolan"
director_id = f"director_{target_director}"

# --- Build subgraph ---
# 'directed' edges point director -> movie, so the director's out-neighbours
# are the movies they directed.
movie_nodes = list(G.neighbors(director_id))
subgraph_nodes = [director_id] + movie_nodes

for movie_id in movie_nodes:
    # On a directed graph `neighbors()` only returns successors. 'acted_in'
    # edges point actor -> movie, so actors must be read from the movie's
    # predecessors; 'has_genre' edges point movie -> genre, so genres are
    # successors.
    actors = [n for n in G.predecessors(movie_id) if G.nodes[n]['type'] == 'actor']
    genres = [n for n in G.successors(movie_id) if G.nodes[n]['type'] == 'genre']
    subgraph_nodes += actors[:1]  # Only the first actor linked to the movie
    subgraph_nodes += genres      # All genres

# De-duplicate while keeping first-seen order (genres/actors are shared across movies).
subgraph_nodes = list(dict.fromkeys(subgraph_nodes))
subG = G.subgraph(subgraph_nodes)


In [43]:
# --- Radial layout ---
# The director sits at the origin, the director's movies are spaced evenly on
# a circle of radius `radius`, and every remaining node (genres / actors) is
# scattered uniformly inside [-2, 2] x [-2, 2].
LAYOUT_SEED = 42  # fixed seed so the scatter is the same on every run
layout_rng = np.random.default_rng(LAYOUT_SEED)

pos = {}
radius = 1
# max(..., 1) avoids a ZeroDivisionError when the director has no movies.
angle_step = 2 * np.pi / max(len(movie_nodes), 1)

pos[director_id] = (0, 0)
for i, movie_id in enumerate(movie_nodes):
    angle = i * angle_step
    pos[movie_id] = (radius * np.cos(angle), radius * np.sin(angle))

# Visit the unplaced nodes in a stable (sorted) order so that, together with
# the seeded generator, each node receives the same coordinates on every run.
for node in sorted((n for n in subG.nodes() if n not in pos), key=str):
    pos[node] = (layout_rng.uniform(-2, 2), layout_rng.uniform(-2, 2))

# --- Node attributes ---
# Marker colour, marker size and whether the label is drawn permanently,
# keyed by node type. Unknown types fall back to `default_style`.
node_styles = {
    'movie': {'color': 'skyblue', 'size': 12, 'show_label': True},
    'director': {'color': 'lightgreen', 'size': 20, 'show_label': True},
    'genre': {'color': 'violet', 'size': 10, 'show_label': False},
    'actor': {'color': 'orange', 'size': 10, 'show_label': False},
}
default_style = {'color': 'gray', 'size': 8, 'show_label': False}

node_x, node_y = [], []
node_text = []   # for hover
label_text = []  # for visible labels
node_color = []
node_sizes = []

for node in subG.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)

    ntype = subG.nodes[node]['type']
    label = subG.nodes[node].get('name') or subG.nodes[node].get('title') or node

    if ntype == 'movie':
        # Look the movie up by its id (the node key the graph was built with)
        # rather than by title, since two movies may share a title.
        movie_rows = df[df['id'] == node]
        if not movie_rows.empty:
            release = pd.to_datetime(movie_rows['release_date'].values[0], errors='coerce')
            # An unparseable / missing date becomes NaT; show that explicitly
            # instead of rendering "nan".
            year = release.year if pd.notna(release) else 'Unknown'
            rating = movie_rows['vote_average'].values[0]
            runtime = movie_rows['runtime'].values[0]
            hover = f"MOVIE: {label}<br>Year: {year}<br>Rating: {rating}/10<br>Runtime: {runtime} min"
        else:
            hover = f"MOVIE: {label}"
    elif ntype == 'director':
        hover = f"DIRECTOR: {label}<br>Directed {len(movie_nodes)} movies"
    else:
        # Genres, actors and any unknown type: "<TYPE>: <label>"
        hover = f"{ntype.upper()}: {label}"

    style = node_styles.get(ntype, default_style)
    label_text.append(label if style['show_label'] else "")
    node_color.append(style['color'])
    node_sizes.append(style['size'])
    node_text.append(hover)

# --- Edges ---
# Each edge contributes (x0, x1, None): the None breaks the line so that all
# edges can be drawn by a single Scatter trace.
edge_x, edge_y = [], []
for edge in subG.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=1, color='gray'),
    hoverinfo='none',
    mode='lines'
)

# --- Node trace ---
node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    text=label_text,         # Only show director/movie text
    hovertext=node_text,     # Hover details for all nodes
    hoverinfo='text',
    textposition="top center",
    marker=dict(
        size=node_sizes,
        color=node_color,
        line_width=1
    )
)

In [44]:
# --- Layout and Show ---
# Assemble the figure from the edge and node traces, then apply the
# presentation settings in a separate step.
fig = go.Figure(data=[edge_trace, node_trace])

fig.update_layout(
    title={
        'text': f"Knowledge Graph: {target_director}",
        'font': {'size': 18},
    },
    showlegend=False,
    hovermode='closest',
    margin={'b': 20, 'l': 5, 'r': 5, 't': 60},
    # No grid or zero lines on either axis.
    xaxis={'showgrid': False, 'zeroline': False},
    yaxis={'showgrid': False, 'zeroline': False},
    plot_bgcolor='white',
)

fig.show()