In [2]:
import pandas as pd
from pathlib import Path
from typing import List, Tuple

In [5]:
# Define the required parameters: time range, actor list file, output folder and file names
def get_parameters() -> Tuple[str, str, Path, Path, Path, int, Path, bool]:
    """
    Define global parameters and make sure the output folder exists.

    Returns:
        An 8-tuple, in this order:

        0. start_date (str): first day of the date range.
        1. end_date (str): last day of the date range.
        2. actor_file (Path): CSV file holding the actor names.
        3. output_dir (Path): folder that receives the batch Parquet files.
        4. excel_file (Path): Excel file mapping Parquet files to actors.
        5. actors_per_batch (int): number of actors written per Parquet file.
        6. united_file (Path): target file for the optional united dataset.
        7. make_united_dataset (bool): whether to build the united dataset.
    """
    start_date = "1791-01-01"
    end_date = "2026-01-01"
    actor_file = Path("Actors.csv")
    output_dir = Path("parquet_output")
    excel_file = Path("data_parquet_name_actor_map.xlsx")
    actors_per_batch = 50

    # United dataset control: switch on/off here. False is recommended
    # because building the united dataset consumes a lot of RAM.
    make_united_dataset = False
    united_file = Path("actor_day_dataset_united.parquet")

    # Side effect: the output folder is created on every call if missing.
    output_dir.mkdir(parents=True, exist_ok=True)
    return (
        start_date,
        end_date,
        actor_file,
        output_dir,
        excel_file,
        actors_per_batch,
        united_file,
        make_united_dataset,
    )


##read actors from file
def read_actors_from_csv(actor_file: Path, actor_col: str = "Actor_Name") -> List[str]:
    """
    Load the actor names stored in one column of a CSV file.

    Args:
        actor_file: path of the CSV file to read.
        actor_col: name of the column that holds the actor names.

    Returns:
        The non-missing values of that column, each converted to ``str``,
        in file order.

    Raises:
        ValueError: if the CSV has no column named ``actor_col``.
    """
    table = pd.read_csv(actor_file)

    if actor_col in table.columns:
        names = table[actor_col].dropna()
        return names.astype(str).tolist()

    raise ValueError(f"Column '{actor_col}' not found in {actor_file}")

#generate time range
def generate_daily_dates(start_date: str, end_date: str) -> pd.DatetimeIndex:
    """
    Build the index of every calendar day from ``start_date`` to ``end_date``.

    Both endpoints are passed straight to ``pandas.date_range`` with a daily
    frequency.
    """
    daily_index = pd.date_range(start_date, end_date, freq="D")
    return daily_index


# create records - time - day month year - actor
def create_actor_day_block(actor: str, dates: pd.DatetimeIndex) -> pd.DataFrame:
    """
    Build the per-day table for a single actor.

    The result has one row per entry of ``dates`` and the columns
    ``date``, ``actor``, ``day``, ``month`` and ``year`` (in that order);
    the last three are taken from the ``date`` column.
    """
    block = pd.DataFrame({"date": dates, "actor": actor})

    # Derive the calendar parts from the stored date column.
    calendar = block["date"].dt
    for part in ("day", "month", "year"):
        block[part] = getattr(calendar, part)

    return block


#generate dataset
def build_dataset_parquet():
    """
    Main pipeline: write one Parquet file per batch of actors plus an Excel map.

    Steps:
      1. Read the settings from ``get_parameters()``.
      2. Load the actor names and build the daily date range.
      3. For every slice of ``actors_per_batch`` consecutive actors,
         concatenate their actor/day blocks and write them to
         ``actor_batch_NNNN.parquet`` in the output folder. The last batch
         may hold fewer actors.
      4. Save an Excel sheet mapping each Parquet file name to the
         comma-separated actors it contains.
      5. If ``make_united_dataset`` is enabled, merge all batch files with
         ``combine_parquet_to_united_file``.

    Raises:
        ValueError: if ``actors_per_batch`` is smaller than 1.
    """
    (
        start_date,
        end_date,
        actor_file,
        output_dir,
        excel_file,
        actors_per_batch,
        united_file,
        make_united_dataset,
    ) = get_parameters()

    if actors_per_batch < 1:
        raise ValueError(f"actors_per_batch must be >= 1, got {actors_per_batch}")

    actors = read_actors_from_csv(actor_file)
    dates = generate_daily_dates(start_date, end_date)

    # One entry per written Parquet file: file name -> actors it contains.
    mapping = []

    # Walk the actor list in slices so full batches and the final, possibly
    # shorter, batch go through exactly the same code path.
    batch_starts = range(0, len(actors), actors_per_batch)
    for batch_id, first in enumerate(batch_starts, start=1):
        batch_actors = actors[first:first + actors_per_batch]

        df_out = pd.concat(
            [create_actor_day_block(actor, dates) for actor in batch_actors],
            ignore_index=True,
        )
        file_name = f"actor_batch_{batch_id:04d}.parquet"
        out_file = output_dir / file_name
        df_out.to_parquet(out_file, engine="pyarrow", index=False)
        print(f"Saved {file_name} ({len(df_out):,} rows)")

        mapping.append({
            "parquet_file": file_name,
            "actors": ", ".join(batch_actors)
        })

    # Save the file-to-actors mapping as Excel.
    df_map = pd.DataFrame(mapping)
    df_map.to_excel(excel_file, index=False)
    print(f"✅ Saved Excel mapping: {excel_file}")

    # -------------------------
    # OPTIONAL: build united dataset
    # -------------------------
    if make_united_dataset:
        combine_parquet_to_united_file(output_dir, united_file)
        print("\nunited dataset saved")



#read the first and last subdataset to view head and tail of the united dataset    
def read_united_parquet_dataset_preview():
    """
    Print a preview of the dataset without loading all of it.

    Lists the ``*.parquet`` files in the output folder returned by
    ``get_parameters()``, sorted by path, then prints the head of the first
    file and the tail of the last file. If the folder holds no Parquet
    files, a short notice is printed and nothing else happens.
    """
    # Index 3 of the parameter tuple is the output folder.
    output_dir = get_parameters()[3]
    files = sorted(output_dir.glob("*.parquet"))

    # Guard against an empty folder; indexing files[0] would raise IndexError.
    if not files:
        print(f"\nNo parquet files found in {output_dir}")
        return

    print("\nDataset head:")
    print(pd.read_parquet(files[0]).head())

    print("\nDataset tail:")
    print(pd.read_parquet(files[-1]).tail())
    

# Merge all sub-datasets into one united dataset if needed. Not recommended: it consumes too much RAM.
def combine_parquet_to_united_file(output_dir: Path, united_file: Path):
    """
    Read all Parquet files in `output_dir`, combine into one DataFrame,
    and save as a single Parquet file (`united_file`).

    Every input file is loaded into memory before concatenation, so memory
    use grows with the total size of the inputs.

    Args:
        output_dir: folder that is searched (non-recursively) for
            ``*.parquet`` files; they are combined in sorted path order.
        united_file: path of the combined Parquet file to write. If it lies
            inside ``output_dir`` it is skipped as an input, so re-running
            does not feed an earlier result back into the new one.

    Returns:
        The combined DataFrame.

    Raises:
        ValueError: if there is no Parquet file to combine.
    """
    # Exclude the target itself from the inputs (compared by resolved path).
    target = Path(united_file).resolve()
    files = [
        f for f in sorted(output_dir.glob("*.parquet"))
        if f.resolve() != target
    ]
    if not files:
        raise ValueError(f"No parquet files found in {output_dir}")

    # Read all files and concatenate them into one frame.
    df_united = pd.concat(
        [pd.read_parquet(f) for f in files],
        ignore_index=True,
    )

    # Save as one Parquet file.
    df_united.to_parquet(united_file, engine="pyarrow", index=False)
    print(f"✅ Combined dataset saved to {united_file} ({len(df_united):,} rows)")

    # Preview
    print("\nDataset head:")
    print(df_united.head())
    print("\nDataset tail:")
    print(df_united.tail())

    return df_united


In [6]:
if __name__ == "__main__":

    # Generate the batch datasets
    build_dataset_parquet()

    # Read the first and last batch files to preview the head and tail of the whole dataset
    read_united_parquet_dataset_preview()



Saved actor_batch_0001.parquet (4,291,650 rows)
Saved actor_batch_0002.parquet (4,291,650 rows)
Saved actor_batch_0003.parquet (4,291,650 rows)
Saved actor_batch_0004.parquet (4,291,650 rows)
Saved actor_batch_0005.parquet (4,291,650 rows)
Saved actor_batch_0006.parquet (4,291,650 rows)
Saved actor_batch_0007.parquet (4,291,650 rows)
Saved actor_batch_0008.parquet (4,291,650 rows)
Saved actor_batch_0009.parquet (4,291,650 rows)
Saved actor_batch_0010.parquet (4,291,650 rows)
Saved actor_batch_0011.parquet (4,291,650 rows)
Saved actor_batch_0012.parquet (4,291,650 rows)
Saved actor_batch_0013.parquet (4,291,650 rows)
Saved actor_batch_0014.parquet (4,291,650 rows)
Saved actor_batch_0015.parquet (4,291,650 rows)
Saved actor_batch_0016.parquet (4,291,650 rows)
Saved actor_batch_0017.parquet (4,291,650 rows)
Saved actor_batch_0018.parquet (4,291,650 rows)
Saved actor_batch_0019.parquet (4,291,650 rows)
Saved actor_batch_0020.parquet (4,291,650 rows)
Saved actor_batch_0021.parquet (4,291,65