Imports & Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import pickle
import requests
import os

seed = 42

download = False	# Flag to download or create the graph from local file
graph_url = "https://github.com/SorenLKuhl/imdb_network/raw/refs/heads/main/imdb_movie_network.pkl"

# Download backboning script
bb_url = "https://raw.githubusercontent.com/SorenLKuhl/RockBandWikiData/refs/heads/main/backboning.txt"
bb_response = requests.get(bb_url)
bb_response.raise_for_status()
with open("backboning.py", 'w') as f:
    f.write(bb_response.text)

import backboning

We load the tsv containing the movies of interest.

In [45]:
if download:
	print("Do nothing for now - download graph from GitHub later")
else:
	path = "../datasets/final_filtered_movies_with_reviews.tsv"
	movies_df = pd.read_csv(path, sep="\t")
	print(f"Number of movies: {len(movies_df)}")
	print(movies_df.head())

Number of movies: 2638
      tconst                     primaryTitle  averageRating  numVotes  \
0  tt0012349                          The Kid            8.2    143593   
1  tt0013442  Nosferatu: A Symphony of Horror            7.8    121536   
2  tt0015864                    The Gold Rush            8.1    126386   
3  tt0017136                       Metropolis            8.3    197727   
4  tt0017925                      The General            8.1    104523   

                           originalTitle  \
0                                The Kid   
1  Nosferatu, eine Symphonie des Grauens   
2                          The Gold Rush   
3                             Metropolis   
4                            The General   

                                              actors  \
0  Jackie Coogan, Carl Miller, Edna Purviance, Er...   
1  Fritz Arno Wagner, Gustav Botz, Henrik Galeen,...   
2  Roland Totheroh, Henry Bergman, Georgia Hale, ...   
3  Alfred Abel, Fritz Rasp, Erwin Biswanger

Now we create an undirected networkx graph with the movies as nodes and actors as links. We save the accompanying data as attributes on the nodes.

In [None]:
if download:
	# Download the pickled graph from GitHub
	response = requests.get(graph_url)
	response.raise_for_status()  # Ensure we got a successful response
	G = pickle.loads(response.content)
else:
	G: nx.Graph = nx.Graph()	# Undirected as links are actors in common

	# Create mapping of actors to movies
	actor_to_movies = {}

	for idx, row in movies_df.iterrows():

		# Add movie as node with attributes
		id = row['tconst']
		title = row['primaryTitle']
		original_title = row['originalTitle']
		avg_rating = row['averageRating']
		num_ratings = row['numVotes']
		actors = row['actors'].split(', ')
		reviews = row['top10_reviews'].split('|||') if 'top10_reviews' in row and pd.notna(row['top10_reviews']) else []
		genres = row['genres'].split(', ') if 'genres' in row and pd.notna(row['genres']) else []
		G.add_node(id, title=title, original_title=original_title, avg_rating=avg_rating, num_ratings=num_ratings, reviews=reviews, genres=genres)

		# Map actors to movies
		for actor in actors:
			if actor not in actor_to_movies:
				actor_to_movies[actor] = []
			actor_to_movies[actor].append(id)

	# Add edges based on shared actors
	for actor, movies in actor_to_movies.items():
		for i in range(len(movies)):
			for j in range(i + 1, len(movies)):
				# check if edge already exists, if so, append actor to list
				if G.has_edge(movies[i], movies[j]):
					G[movies[i]][movies[j]]['actors'].append(actor)
					G[movies[i]][movies[j]]['weight'] = len(G[movies[i]][movies[j]]['actors'])
				else:
					G.add_edge(movies[i], movies[j], actors=[actor], weight=1)

# Display basic info about the graph - should have num of node = num of movies
print(G)

Graph with 2638 nodes and 72826 edges


To ensure the mapping works as intended, we check the movies that are linked by Leonardo DiCaprio to see if the list is as expected.

In [47]:
test_name = "Leonardo DiCaprio"
linked_movies = set()
for movie1, movie2, data in G.edges(data=True):
	if test_name in data['actors']:
		linked_movies.add(G.nodes[movie1]['title'])
		linked_movies.add(G.nodes[movie2]['title'])
print(f"Movies linked by {test_name}: {linked_movies}")

Movies linked by Leonardo DiCaprio: {'The Wolf of Wall Street', 'Titanic', 'Inception', 'The Departed'}


We add genres from updated file

In [51]:
if not download:
	path = "../datasets/final_filtered_movies_with_genres.tsv"
	movies_df = pd.read_csv(path, sep="\t")
	print(f"Number of movies: {len(movies_df)}")
	print(movies_df.head())

Number of movies: 2638
      tconst                     primaryTitle  averageRating  numVotes  \
0  tt0012349                          The Kid            8.2    143593   
1  tt0013442  Nosferatu: A Symphony of Horror            7.8    121536   
2  tt0015864                    The Gold Rush            8.1    126386   
3  tt0017136                       Metropolis            8.3    197727   
4  tt0017925                      The General            8.1    104523   

                           originalTitle  \
0                                The Kid   
1  Nosferatu, eine Symphonie des Grauens   
2                          The Gold Rush   
3                             Metropolis   
4                            The General   

                                              actors                   genres  
0  Jackie Coogan, Carl Miller, Edna Purviance, Er...      Comedy,Drama,Family  
1  Fritz Arno Wagner, Gustav Botz, Henrik Galeen,...           Fantasy,Horror  
2  Roland Totheroh, Henry B

In [53]:
if not download:
    for idx, row in movies_df.iterrows():
        id = row["tconst"]
        genres = row["genres"].split(', ') if 'genres' in row and pd.notna(row['genres']) else []
        G.nodes[id]["genres"] = genres

# test genres 
for movie in list(G.nodes)[:5]:
    print(f"Movie: {G.nodes[movie]['title']}, Genres: {G.nodes[movie]['genres']}")

Movie: The Kid, Genres: ['Comedy,Drama,Family']
Movie: Nosferatu: A Symphony of Horror, Genres: ['Fantasy,Horror']
Movie: The Gold Rush, Genres: ['Adventure,Comedy,Drama']
Movie: Metropolis, Genres: ['Drama,Sci-Fi']
Movie: The General, Genres: ['Action,Adventure,Comedy']


Pickle G to upload to GitHub for easy download from other devices.

In [43]:
if not download:
	pickle_path = "imdb_movie_network.pkl"
	with open(pickle_path, 'wb') as f:
		pickle.dump(G, f)

We see that only 4 movies are found, which makes sense as the "actors" field of the data is created from the actor's "known for" movies on imdb which is max 4.

We now extract the network backbone:

In [None]:
# We use the weight attribute (number of shared actors) to extract the backbone of the network


KeyboardInterrupt: 