### This notebook extracts each movie's metadata from the IMDb website.
### The metadata includes the directors and actors.

In [None]:
!pip install -q PyMovieDb==0.0.9 lxml==5.3.0 lxml_html_clean==0.4.1

In [None]:
from urllib.request import urlretrieve
from zipfile import ZipFile
from PyMovieDb import IMDB
import pandas as pd
import json
import os

In [None]:
# Download "ml-latest-small" and unpack it into the current working directory
urlretrieve("https://files.grouplens.org/datasets/movielens/ml-latest-small.zip", "ml-latest-small.zip")
# Use a context manager so the archive handle is closed deterministically
# before the zip file is deleted from disk.
with ZipFile("ml-latest-small.zip", "r") as archive:
    archive.extractall()
os.remove("ml-latest-small.zip")

#### NOTE: This is a time-consuming process. The currently extracted metadata is available in the 'ml-latest-small' folder.
#### NOTE: Metadata could not be extracted for every movie; some IMDb lookups fail.
#### You can skip running the cells below.

In [None]:
# Extract the metadata using the IMDb ids provided in "links.csv" and the PyMovieDb library
links_df = pd.read_csv("./ml-latest-small/links.csv", dtype={"imdbId": str})

imdb = IMDB()

# Parallel lists: position i in each list describes the same movie.
movies = []
directors = []
actors = []

# IMDb ids whose lookup returned a payload with status 404.
list_404 = set()


def _join_names(people):
    """Return the names in *people* joined by "|", skipping unnamed entries.

    *people* is the list of dicts stored under the "director" / "actor" key of
    the parsed response. A missing list (None) or an entry without a usable
    name is skipped so that ``str.join`` never receives a non-string, which
    would otherwise abort the whole extraction run with a TypeError.
    """
    return "|".join(item["name"] for item in (people or []) if item.get("name"))


for index, row in links_df.iterrows():
    # Report progress before the lookup so that a failed lookup on a
    # multiple-of-1000 row does not suppress the progress message.
    if index % 1000 == 0:
        print(index)

    movie_id = int(row["movieId"])
    # imdbId is read as str (see dtype above) -- presumably to keep leading zeros.
    imdb_id = "tt" + row["imdbId"]

    # get_by_id returns a JSON string; decode it into a dict.
    res = json.loads(imdb.get_by_id(imdb_id))

    # A payload carrying status 404 means no result was found for this id.
    if res.get("status") == 404:
        list_404.add(imdb_id)
        continue

    movies.append(movie_id)
    directors.append(_join_names(res.get("director")))
    actors.append(_join_names(res.get("actor")))

print("Number of failures:", len(list_404))

In [None]:
# Persist the collected metadata as a CSV file inside the dataset folder.
# The three lists are parallel, so each one becomes a column of the table.
metadata_columns = {
    "movieId": movies,
    "directors": directors,
    "actors": actors,
}
movies_info = pd.DataFrame(metadata_columns)
# The DataFrame index is written too (to_csv default), as an unnamed first column.
movies_info.to_csv("./ml-latest-small/movies_metadata.csv")