In [1]:
# declare a list tasks whose products you want to use as inputs
upstream = None

#pip install duckdb pandas numpy matplotlib requests python-dotenv jupysql duckdb-engine scikit-learn fastapi


In [2]:
# Parameters
product = {"nb": "/Users/thiago/workspace/duckDB/8.fastapi/movie_rec_system/products/extract-pipeline.ipynb", "data": "/Users/thiago/workspace/duckDB/8.fastapi/movie_rec_system/movies_data.duckdb"}


In [3]:
# flake8: noqa
import duckdb
import requests
import os
from dotenv import load_dotenv

In [4]:
def init_duck_db_movies(duckdb_file_path, res):
    """
    Create table for movies API call in DuckDB
    If the table exists, new data is inserted

    Parameters
    ----------
    duckdb_file_path : str
        Path to the DuckDB database file
    res : requests object
        API call results
    """
    try:
        conn = duckdb.connect(duckdb_file_path, read_only=False)

        tables = conn.execute("SHOW TABLES;").fetchall()
        if ("movies",) not in tables:
            conn.execute(
                """
                CREATE TABLE movies (
                    genre_ids INT[],
                    id INTEGER,
                    original_language VARCHAR,
                    overview VARCHAR,
                    popularity DOUBLE,
                    release_date TIMESTAMP,
                    title VARCHAR,
                    vote_average DOUBLE,
                    vote_count INTEGER
                );
            """
            )

        for movie in res["results"]:
            genre_ids_str = ",".join(map(str, movie["genre_ids"]))
            conn.execute(
                f"""
                INSERT INTO movies VALUES (ARRAY[{genre_ids_str}], {movie['id']},
                '{movie['original_language']}',
                '{movie['overview'].replace("'", "''")}',
                {movie['popularity']},
                '{movie['release_date']}',
                '{movie['title'].replace("'", "''")}',
                {movie['vote_average']},
                {movie['vote_count']});
            """
            )

        conn.close()

    except Exception as e:
        print(e)

In [5]:
def init_duck_db_genres(duckdb_file_path, genres_data):
    """
    Create table for genres API call in DuckDB
    If the table exists, new data is inserted

    Parameters
    ----------
    duckdb_file_path : str
        Path to the DuckDB database file
    genres_data : list
        List of genres

    """
    try:
        conn = duckdb.connect(duckdb_file_path, read_only=False)

        tables = conn.execute("SHOW TABLES;").fetchall()
        if ("genres",) not in tables:
            conn.execute(
                """
                CREATE TABLE genres (
                    id INTEGER,
                    name VARCHAR
                );
            """
            )

        for genre in genres_data:
            conn.execute(
                f"""
                INSERT INTO genres VALUES ({genre['id']},
                '{genre['name']}');
            """
            )

        conn.close()

    except Exception as e:
        print(e)

In [6]:
def drop_existing_movies_table(duckdb_file_path):
    """
    Drops existing movies tables

    Parameters
    ----------
    duckdb_file_path : str
        Path to the DuckDB database file
    """
    try:
        conn = duckdb.connect(duckdb_file_path, read_only=False)

        movies_table_exists = conn.execute(
            "SELECT 1 FROM information_schema.tables WHERE table_name = 'movies'"
        ).fetchone()

        if movies_table_exists:
            conn.execute("DROP TABLE movies;")
            print("Table 'movies' dropped.")
        else:
            print("Table 'movies' does not yet exist. Creating 'movies' now.")

        conn.close()

    except Exception as e:
        print(e)

In [7]:
def drop_existing_genres_table(duckdb_file_path):
    """
    Drops existing genres table

    Parameters
    ----------
    duckdb_file_path : str
        Path to the DuckDB database file
    """
    try:
        conn = duckdb.connect(duckdb_file_path, read_only=False)

        genres_table_exists = conn.execute(
            "SELECT 1 FROM information_schema.tables WHERE table_name = 'genres'"
        ).fetchone()

        if genres_table_exists:
            conn.execute("DROP TABLE genres;")
            print("Table 'genres' dropped.")
        else:
            print("Table 'genres' does not yet exist. Creating 'genres' now.")

        conn.close()

    except Exception as e:
        print(e)

In [8]:
def get_movies(lang, freq, duckdb_file_path, api_key):
    """
    Inserts API call results into DuckDB

    Parameters
    ----------
    lang : str
        Language of movies
    freq : int
        Amount of movies to extract
    duckdb_file_path : str
        Path to the DuckDB database file
    api_key : str
        API key for The Movie Database
    """
    url = "https://api.themoviedb.org/3/movie/popular?api_key={api_key}&with_original_language={lang}".format(  # noqa E501
        api_key=api_key, lang=lang
    )
    movies = 0
    page = 1
    progress = 0

    drop_existing_movies_table(duckdb_file_path)

    while movies < freq:
        try:
            res = requests.get(url + "&page=" + str(page))
        except requests.exceptions.RequestException as e:
            print("An error occurred during the request:", e)
            break
        if res.status_code != 200:
            print("error")
            return []

        res = res.json()

        if "errors" in res.keys():
            print("api error !!!")
            return movies

        movies += len(res["results"])

        init_duck_db_movies(duckdb_file_path, res)

        if progress != round(movies / freq * 100):
            progress = round(movies / freq * 100)
            if progress % 5 == 0:
                print(progress, end="%, ")

        page = page + 1
    return movies

In [9]:
def get_genres(lang, duckdb_file_path, api_key):
    """
    Inserts API call results into DuckDB

    Parameters
    ----------
    lang : str
        Language of movies
    freq : int
        Amount of movies to extract
    duckdb_file_path : str
        Path to the DuckDB database file
    api_key : str
        API key for The Movie Database
    """
    url = "https://api.themoviedb.org/3/genre/movie/list?api_key={api_key}&with_original_language={lang}".format(  # noqa E501
        api_key=api_key, lang=lang
    )

    drop_existing_genres_table(duckdb_file_path)

    try:
        res = requests.get(url)
    except requests.exceptions.RequestException as e:
        print("An error occurred during the request:", e)
        return []

    if res.status_code != 200:
        print("error")
        return []

    res = res.json()

    if "errors" in res.keys():
        print("api error !!!")
        return []

    genres_data = res["genres"]
    init_duck_db_genres(duckdb_file_path, genres_data)

    return len(genres_data)

In [10]:
if __name__ == "__main__":
    # Parameter to get 500 English movies
    language_count = {
        "en": 1000,
    }

    load_dotenv(".env")
    api_key = os.getenv("API_KEY")

    for key in language_count:
        # print(key,language_count[key])
        print("Downloading", key, end=": ")
        movies = get_movies(
            key, language_count[key], "movie_rec_system/movies_data.duckdb", api_key
        )  # noqa E501
        print("Total movies found:", movies)
        genres = get_genres(key, "movie_rec_system/movies_data.duckdb", api_key)
        print("Total genres found:", genres)

Downloading en: Table 'movies' does not yet exist. Creating 'movies' now.


10%, 

20%, 

30%, 

Conversion Error: invalid timestamp field format: "", expected format is (YYYY-MM-DD HH:MM:SS[.US][±HH[:MM[:SS]]| ZONE])

LINE 6:                 '',
                        ^


40%, 

50%, 

60%, 

70%, 

80%, 

Conversion Error: invalid timestamp field format: "", expected format is (YYYY-MM-DD HH:MM:SS[.US][±HH[:MM[:SS]]| ZONE])

LINE 6:                 '',
                        ^
90%, 

Conversion Error: invalid timestamp field format: "", expected format is (YYYY-MM-DD HH:MM:SS[.US][±HH[:MM[:SS]]| ZONE])

LINE 6:                 '',
                        ^
100%, Total movies found: 1000
Table 'genres' does not yet exist. Creating 'genres' now.


Total genres found: 19
