In [2]:
import pandas as pd
import numpy as np
import os
import urllib.parse
import urllib.request
import re
import time
import networkx as nx
import matplotlib.pyplot as plt

# Load MovieLens data
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')
genome_scores = pd.read_csv('genome-scores.csv')
genome_tags = pd.read_csv('genome-tags.csv')

filliste = [
    ("links", links),
    ("movies", movies),
    ("ratings", ratings),
    ("tags", tags),
    ("genome_scores", genome_scores),
    ("genome_tags", genome_tags)
]

# Make folders for caching
os.makedirs("directors", exist_ok=True)
os.makedirs("actors", exist_ok=True)


In [10]:
def imdb_id_to_tt(imdb_id):
    """
    Convert imdbId from links.csv (int or string) to proper tt-code, e.g. 114709 -> 'tt0114709'.
    """
    s = str(imdb_id)
    # ensure 7 digits
    return f"tt{s.zfill(7)}"


def fetch_imdb_html(tt_id, delay=0.5):
    """
    Fetch HTML for a given tt-id from IMDB with a browser-like User-Agent.
    delay: sleep to be nicer to the server (seconds).
    """
    url = f"https://www.imdb.com/title/{tt_id}/"
    req = urllib.request.Request(
        url,
        headers={
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            ),
            "Accept-Language": "en-US,en;q=0.9",
        }
    )
    with urllib.request.urlopen(req) as resp:
        html = resp.read().decode("utf-8", errors="ignore")
    time.sleep(delay)
    return html


def parse_directors_actors_from_html(html):
    """
    Use the <meta name="description" ...> trick to get director(s) and actors.
    Returns (directors, actors) as lists of strings.
    """
    # Meta description
    meta_match = re.search(
        r'<meta name="description" content="([^"]+)"',
        html
    )
    if not meta_match:
        return [], []

    description = meta_match.group(1)

    # Directors: 'Directed by ... .'
    dir_match = re.search(r'Directed by (.*?)\.', description)
    directors = []
    if dir_match:
        directors_raw = dir_match.group(1)
        directors = [d.strip() for d in re.split(r',| and ', directors_raw) if d.strip()]

    # Actors: 'With ... .' (after the 'Directed by' sentence)
    act_match = re.search(r'With (.*?)(?:\.|;)', description)
    actors = []
    if act_match:
        actors_raw = act_match.group(1)
        actors = [a.strip() for a in re.split(r',| and ', actors_raw) if a.strip()]

    return directors, actors


def get_directors_actors_for_movie(movie_id, imdb_id, use_cache=True):
    """
    High-level helper:
    - if cached files exist in directors/actors, read them
    - else scrape IMDB and write cache
    Returns (directors, actors).
    """
    dir_path = f"directors/movie_{movie_id}_directors.txt"
    act_path = f"actors/movie_{movie_id}_actors.txt"

    # Try cache first
    if use_cache and os.path.exists(dir_path) and os.path.exists(act_path):
        with open(dir_path, encoding="utf-8") as f:
            directors = [line.strip() for line in f if line.strip()]
        with open(act_path, encoding="utf-8") as f:
            actors = [line.strip() for line in f if line.strip()]
        return directors, actors

    # Otherwise, scrape
    tt_id = imdb_id_to_tt(imdb_id)
    try:
        html = fetch_imdb_html(tt_id)
        directors, actors = parse_directors_actors_from_html(html)
    except Exception as e:
        print(f"Failed to fetch/parse for movieId {movie_id}, imdbId {imdb_id}: {e}")
        return [], []

    # Save cache
    with open(dir_path, "w", encoding="utf-8") as f:
        for d in directors:
            f.write(d + "\n")

    with open(act_path, "w", encoding="utf-8") as f:
        for a in actors:
            f.write(a + "\n")

    return directors, actors


In [11]:
# Average rating for each movie
movie_avg_rating = ratings.groupby("movieId")["rating"].mean()
movie_avg_rating.head()


movieId
1    3.893508
2    3.278179
3    3.171271
4    2.868395
5    3.076957
Name: rating, dtype: float64

In [12]:
# Merge movies and links to get imdbId + title together
movies_links = movies.merge(links, on="movieId", how="inner")

# ⚠️ Limit for testing to avoid hammering IMDB
# Remove .head(N) to go bigger later (but be careful)
N_MOVIES = 86537   # e.g. first 200 movies; adjust as you dare
subset = movies_links.head(N_MOVIES)

G = nx.MultiGraph()   # actors as nodes, movies as edges

for _, row in subset.iterrows():
    movie_id = int(row["movieId"])
    title = row["title"]
    imdb_id = row["imdbId"]

    if pd.isna(imdb_id):
        continue

    avg_rating = float(movie_avg_rating.get(movie_id, np.nan))

    # Get directors & actors for this movie via IMDB scraping
    directors, actors = get_directors_actors_for_movie(movie_id, imdb_id, use_cache=True)

    if not actors:
        # If no actors found, skip
        continue

    # Add actor nodes
    for actor in actors:
        if actor not in G:
            G.add_node(actor, node_type="actor", name=actor)

    # For each unordered pair of actors, add an edge representing this movie
    for i in range(len(actors)):
        for j in range(i + 1, len(actors)):
            a1 = actors[i]
            a2 = actors[j]
            G.add_edge(
                a1,
                a2,
                movieId=movie_id,
                title=title,
                avg_rating=avg_rating,
                directors=directors
            )

print("Graph built.")
print("Number of actor nodes:", G.number_of_nodes())
print("Number of edges (movie-based):", G.number_of_edges())


Failed to fetch/parse for movieId 720, imdbId 118114: HTTP Error 404: Not Found


KeyboardInterrupt: 

In [3]:
def plot_top_k_actors(G, k=10, figsize=(10, 8)):
    # All nodes are actors in this graph, but we'll still be explicit
    actor_degrees = [(n, G.degree(n)) for n in G.nodes()]
    actor_degrees_sorted = sorted(actor_degrees, key=lambda x: x[1], reverse=True)

    top_actors = [n for n, deg in actor_degrees_sorted[:k]]

    # Get all neighbors of the top actors
    neighbors = set()
    for a in top_actors:
        neighbors.update(G.neighbors(a))

    # Induced subgraph on top actors + neighbors
    sub_nodes = set(top_actors) | neighbors
    H = G.subgraph(sub_nodes).copy()

    # Layout
    plt.figure(figsize=figsize)
    pos = nx.spring_layout(H, seed=42)

    # Node sizes: bigger for top actors
    sizes = []
    colors = []
    for n in H.nodes():
        if n in top_actors:
            sizes.append(400)
            colors.append("tab:red")
        else:
            sizes.append(150)
            colors.append("tab:blue")

    # Draw
    nx.draw_networkx_nodes(H, pos, node_size=sizes, node_color=colors, alpha=0.8)
    nx.draw_networkx_edges(H, pos, alpha=0.4)
    # Label only top actors (to avoid total clutter)
    labels = {n: n for n in top_actors}
    nx.draw_networkx_labels(H, pos, labels=labels, font_size=9)

    plt.axis("off")
    plt.title(f"Top {k} actors by degree and their connections")
    plt.show()

# Example: plot top 10
plot_top_k_actors(G, k=10)


NameError: name 'G' is not defined

In [15]:
actor_a = "Tom Hanks"
actor_b = "Tim Allen"

if G.has_edge(actor_a, actor_b):
    print(f"Edges between {actor_a} and {actor_b}:")
    for key, data in G[actor_a][actor_b].items():
        print(f"  - Movie: {data['title']}, avg rating: {data['avg_rating']}, directors: {data['directors']}")
else:
    print("No edge between them in this subset.")


Edges between Tom Hanks and Tim Allen:
  - Movie: Toy Story (1995), avg rating: 3.8935076093890357, directors: ['John Lasseter']


In [17]:
import networkx as nx
import numpy as np

def analyze_actor_network(G_multi, top_k=20):
    """
    G_multi: your MultiGraph with actors as nodes and movie-edges.
    We'll create a simple Graph G_simple for metrics that don't support MultiGraph.
    """
    # Simple projection: multiple edges between same actors collapsed into one
    G = nx.Graph(G_multi)

    print("===== BASIC SIZE METRICS =====")
    print(f"Number of nodes (actors): {G.number_of_nodes():,}")
    print(f"Number of edges (actor–actor pairs, ignoring multi-edges): {G.number_of_edges():,}")
    print(f"Number of edges in original MultiGraph (with movie multiplicity): {G_multi.number_of_edges():,}")
    
    # Degree sequence
    degrees = [deg for _, deg in G.degree()]
    avg_degree = np.mean(degrees)
    max_degree = np.max(degrees)
    print("\n===== DEGREE METRICS =====")
    print(f"Average degree: {avg_degree:.3f}")
    print(f"Max degree: {max_degree}")
    
    # Top-k actors by degree
    print(f"\n===== TOP {top_k} ACTORS BY DEGREE =====")
    top_nodes = sorted(G.degree(), key=lambda x: x[1], reverse=True)[:top_k]
    for name, deg in top_nodes:
        print(f"  {name:30s} deg = {deg}")
    
    # Connected components on simple graph
    print("\n===== CONNECTED COMPONENTS =====")
    comps = sorted(nx.connected_components(G), key=len, reverse=True)
    giant = G.subgraph(comps[0]).copy()
    print(f"Number of connected components: {len(comps)}")
    print(f"Size of giant component: {giant.number_of_nodes():,} nodes, {giant.number_of_edges():,} edges")
    print(f"Fraction of nodes in giant component: {giant.number_of_nodes()/G.number_of_nodes():.2f}")
    
    # Average clustering coefficient (needs simple graph)
    print("\n===== CLUSTERING =====")
    avg_clustering = nx.average_clustering(G)
    print(f"Average clustering coefficient: {avg_clustering:.4f}")
    
    # Degree assortativity (simple graph)
    print("\n===== ASSORTATIVITY =====")
    assort = nx.degree_assortativity_coefficient(G)
    print(f"Degree assortativity: {assort:.4f}")
    
    # Shortest-path metrics (giant component only)
    print("\n===== SHORTEST PATH METRICS (Giant Component) =====")
    try:
        apl = nx.average_shortest_path_length(giant)
        print(f"Average shortest path length: {apl:.3f}")
    except Exception as e:
        print("Could not compute exact average shortest path length:", e)
        print("Consider approximations or sampling nodes instead.")
    
    # Betweenness centrality on giant component
    print("\n===== CENTRALITY MEASURES =====")
    print("Computing betweenness centrality on the giant component...")
    if giant.number_of_nodes() > 5000:
        print("WARNING: Graph is large. Using k=500 approximation for speed.")
        bc = nx.betweenness_centrality(giant, k=500, seed=42)
    else:
        bc = nx.betweenness_centrality(giant)
    
    top_bc = sorted(bc.items(), key=lambda x: x[1], reverse=True)[:top_k]
    print(f"\nTop {top_k} by betweenness centrality:")
    for node, score in top_bc:
        print(f"  {node:30s} BC = {score:.6f}")
    
    # Degree centrality (on simple graph)
    print("\n===== DEGREE CENTRALITY =====")
    dc = nx.degree_centrality(G)
    top_dc = sorted(dc.items(), key=lambda x: x[1], reverse=True)[:top_k]
    for node, score in top_dc:
        print(f"  {node:30s} DC = {score:.6f}")
    
    print("\n===== DONE =====")


# Run it on your MultiGraph G
analyze_actor_network(G)


===== BASIC SIZE METRICS =====
Number of nodes (actors): 2,597
Number of edges (actor–actor pairs, ignoring multi-edges): 5,509
Number of edges in original MultiGraph (with movie multiplicity): 5,532

===== DEGREE METRICS =====
Average degree: 4.243
Max degree: 25

===== TOP 20 ACTORS BY DEGREE =====
  Cary Grant                     deg = 25
  Gene Hackman                   deg = 24
  Harvey Keitel                  deg = 21
  Robert Duvall                  deg = 21
  Sam Neill                      deg = 21
  Audrey Hepburn                 deg = 21
  Chazz Palminteri               deg = 20
  Hugh Grant                     deg = 20
  Robin Williams                 deg = 18
  Val Kilmer                     deg = 18
  Anthony Hopkins                deg = 18
  Jim Carrey                     deg = 18
  Sylvester Stallone             deg = 18
  Antonio Banderas               deg = 18
  Christian Slater               deg = 18
  Halle Berry                    deg = 18
  Tommy Lee Jones         