In [1]:
import os
import shutil

import duckdb

In [2]:

# Cap DuckDB's memory usage at 16 GB before running the heavy COPY/transform cells.
duckdb.sql("SET memory_limit = '16GB';")

In [3]:

# A partitioned COPY refuses to write into a directory that already contains
# files, so remove whatever a previous run left behind. This keeps the cell
# re-runnable (Restart & Run All) and avoids stale partitions surviving when
# the set of titles changes. ignore_errors=True makes the first run, where the
# directory does not exist yet, a no-op.
shutil.rmtree('data/transformed/revenues_per_day', ignore_errors=True)

# Convert the revenues CSV to Parquet, partitioned by title. The partition key
# is a copy of `title` with '/' replaced by '_' -- presumably because the
# partition value is used as part of a directory name on disk.
duckdb.sql("""
    COPY 
        (SELECT *, replace(title, '/', '_') AS p_title FROM 'data/landing_zone/revenues_per_day.csv') 
    TO 'data/transformed/revenues_per_day' 
    (FORMAT PARQUET, PARTITION_BY (p_title))"""
    )


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [4]:
from datetime import datetime
from datetime import date

# Names of all scalar functions DuckDB currently reports. The registration
# guards further down consult this list so that re-running the cell does not
# try to create a UDF whose name is already taken.
_scalar_function_rows = duckdb.sql("""
SELECT function_name
FROM duckdb_functions()
WHERE function_type = 'scalar'
""").fetchall()
functions = [row[0] for row in _scalar_function_rows]


def split_to_list(s: str) -> list[str]:
    """Split a comma-separated string into its whitespace-trimmed pieces.

    Args:
        s: Text such as ``"Drama, Crime , Thriller"``.

    Returns:
        One entry per comma-delimited field, each stripped of surrounding
        whitespace. Empty fields are kept as empty strings.
    """
    pieces = s.split(",")
    return list(map(str.strip, pieces))
# Expose split_to_list to SQL, unless a scalar function with that name is
# already known to DuckDB (e.g. this cell ran earlier in the same kernel).
if "split_to_list" not in functions:
    duckdb.create_function(
        "split_to_list",
        split_to_list,
    )


def date_transform(d: str) -> str:
    """Convert a date like ``"25 Dec 2019"`` to ISO form ``"2019-12-25"``.

    Args:
        d: Date text in ``%d %b %Y`` form (day, abbreviated month name,
            four-digit year). ``%b`` is matched against the current locale's
            month abbreviations.

    Returns:
        The date formatted as ``YYYY-MM-DD``, or ``None`` when ``d`` is not a
        string or does not match the expected format.

    Note:
        The return annotation is deliberately left as plain ``str`` even
        though ``None`` can be returned: the function is registered as a
        DuckDB UDF without explicit types, so the annotations are presumably
        what DuckDB uses to derive the SQL signature.
    """
    try:
        date_object = datetime.strptime(d, "%d %b %Y")
        return date_object.strftime("%Y-%m-%d")
    except (ValueError, TypeError):
        # ValueError: text does not match the format (or cannot be formatted);
        # TypeError: input is not a string. Catching only these lets
        # KeyboardInterrupt/SystemExit and genuine programming errors
        # propagate instead of being silently turned into a missing date.
        return None
# Expose date_transform to SQL, unless a scalar function with that name is
# already known to DuckDB (e.g. this cell ran earlier in the same kernel).
if "date_transform" not in functions:
    duckdb.create_function(
        "date_transform",
        date_transform,
    )

def runtime_transform(s: str) -> int:
    """Parse a runtime such as ``"142 min"`` into a number of minutes.

    Args:
        s: Runtime text; every occurrence of ``" min"`` is removed before the
            remainder is converted with ``int``.

    Returns:
        The runtime as an ``int``, or ``None`` when ``s`` is not a string or
        the remaining text is not a valid integer (e.g. ``"N/A"``).

    Note:
        The return annotation is deliberately left as plain ``int`` even
        though ``None`` can be returned: the function is registered as a
        DuckDB UDF without explicit types, so the annotations are presumably
        what DuckDB uses to derive the SQL signature.
    """
    try:
        return int(s.replace(' min', ''))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: input is not a str; ValueError: the text
        # is not an integer. Catching only these lets KeyboardInterrupt,
        # SystemExit and unrelated bugs propagate instead of being hidden.
        return None
# Expose runtime_transform to SQL, unless a scalar function with that name is
# already known to DuckDB (e.g. this cell ran earlier in the same kernel).
if "runtime_transform" not in functions:
    duckdb.create_function(
        "runtime_transform",
        runtime_transform,
    )



# Typed/cleaned view over the raw movie details JSON:
#   * Released is passed through the date_transform UDF and TRY_CAST to DATE.
#   * Runtime is passed through the runtime_transform UDF.
#   * Director, Genre, Writer, Actors, Language and Country are comma-separated
#     strings turned into sorted lists via the split_to_list UDF + list_sort.
#   * Metascore, imdbRating, imdbVotes and BoxOffice are TRY_CAST to numeric
#     types after stripping ',' (and '$' for BoxOffice).
# Title, Year, Rated and Plot are selected unchanged.
# The three UDFs must already be registered with DuckDB before this runs.
movies_details = duckdb.sql("""SELECT 
                            Title,
                            Year,
                            TRY_CAST(date_transform(Released) AS DATE) as Released,
                            runtime_transform(Runtime) as Runtime,
                            Rated,
                            list_sort(split_to_list(Director)) AS Director,
                            list_sort(split_to_list(Genre)) AS Genre,
                            list_sort(split_to_list(Writer)) AS Writer,
                            list_sort(split_to_list(Actors)) AS Actors,
                            Plot,
                            list_sort(split_to_list(Language)) AS Language,
                            list_sort(split_to_list(Country)) AS Country,
                            TRY_CAST(Metascore AS INT) AS Metascore,
                            TRY_CAST(imdbRating AS DOUBLE) AS imdbRating,
                            TRY_CAST(replace(imdbVotes, ',', '') AS INT) AS imdbVotes,
                            TRY_CAST(replace(replace(BoxOffice, ',', ''), '$','') as INT) as BoxOffice
                            FROM 'data/landing_zone/movies_details.json'""")


# Persist the cleaned movie details as Parquet. `movies_details` inside the SQL
# text is the Python variable assigned above, referenced by name -- presumably
# resolved through DuckDB's lookup of Python objects in scope (TODO confirm).
duckdb.sql("""
    COPY movies_details 
    TO 'data/transformed/movies_details' 
    (FORMAT PARQUET)""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))