In [None]:
from datetime import date
import pandas as pd

# Notebook display settings: show every column, but cap rendered rows at 5.
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 5),
):
    pd.set_option(_option_name, _option_value)

# `catalog` is not defined in this notebook; it must already exist in the kernel namespace.
df_records_raw = catalog.load('raw/oai/records#parquet')

In [None]:
# Preview the raw records frame exactly as loaded from the catalog.
df_records_raw

In [None]:
def _pick_load_dt(df: pd.DataFrame):
    # Si hay una sola fecha en el batch, usala; si hay varias, quedate con la mÃ¡s reciente;
    # si no hay, hoy.
    if 'load_datetime' not in df.columns or df['load_datetime'].isna().all():
        return date.today()
    vals = df['load_datetime'].dropna()
    if vals.nunique() == 1:
        return vals.iloc[0]
    return pd.to_datetime(vals).max().date()


In [None]:
def oai_load_records(df_records_raw: pd.DataFrame, env: str = 'dev') -> tuple[pd.DataFrame, ...]:
    """Split the raw records frame into one table per entity.

    Parameters
    ----------
    df_records_raw
        Raw records. The columns read here are ``record_id``, ``col_id``,
        ``title``, ``date_issued``, ``extract_datetime``, ``set_id`` and the
        multi-valued columns ``creators``, ``types``, ``identifiers``,
        ``languages``, ``subjects``, ``publishers``, ``relations`` and
        ``rights``. The input frame is never modified.
    env
        When equal to ``'dev'`` (the default) only the first 1000 rows are
        processed; any other value processes the whole frame.

    Returns
    -------
    tuple of 10 DataFrames, in this order: records, creators, types,
    identifiers, languages, subjects, publishers, relations, rights, sets.
    Every table carries a ``load_datetime`` column holding the single value
    chosen by ``_pick_load_dt`` for the batch.
    """
    # The load stamp is derived from the full batch, before any dev truncation.
    load_dt = _pick_load_dt(df_records_raw)

    # No up-front copy of the whole frame: every output below is built from
    # `.loc[...].copy()`, so the caller's frame is never touched.
    source = df_records_raw.head(1000) if env == 'dev' else df_records_raw

    def _select(columns):
        # Independent copy of the requested columns.
        return source.loc[:, columns].copy()

    def _explode(column):
        # One output row per element of the list-like `column`.
        return (
            _select(['record_id', column, 'extract_datetime'])
            .explode(column, ignore_index=True)
            .assign(load_datetime=load_dt)
        )

    df_records = _select(
        ['record_id', 'col_id', 'title', 'date_issued', 'extract_datetime']
    ).assign(load_datetime=load_dt)

    multi_valued_columns = (
        'creators', 'types', 'identifiers', 'languages',
        'subjects', 'publishers', 'relations', 'rights',
    )
    exploded_tables = tuple(_explode(column) for column in multi_valued_columns)

    df_record_sets = _select(['record_id', 'set_id', 'extract_datetime'])
    set_ids = df_record_sets.pop('set_id')
    if set_ids.empty:
        # `Series.apply(pd.Series)` on an empty Series yields a Series, not a
        # DataFrame, and the column rename below would fail; an empty batch
        # simply contributes no `set_*` columns.
        sets_df = pd.DataFrame(index=set_ids.index)
    else:
        sets_df = set_ids.apply(pd.Series).rename(columns=lambda i: f'set_{i}')
    df_record_sets = pd.concat([df_record_sets, sets_df], axis=1)
    df_record_sets['load_datetime'] = load_dt

    return (df_records, *exploded_tables, df_record_sets)


In [None]:
# Run the split with the default env ('dev'), i.e. on the first 1000 raw rows only.
(
    df_records,
    df_record_creators,
    df_record_types,
    df_record_identifiers,
    df_record_languages,
    df_record_subjects,
    df_record_publishers,
    df_record_relations,
    df_record_rights,
    df_record_sets,
) = oai_load_records(df_records_raw)

In [None]:
# Record-level table: record_id, col_id, title, date_issued, extract_datetime, load_datetime.
df_records

In [None]:
# `creators` exploded to one row per element, keyed by record_id.
df_record_creators


In [None]:
# `types` exploded to one row per element, keyed by record_id.
df_record_types


In [None]:
# `identifiers` exploded to one row per element, keyed by record_id.
df_record_identifiers

In [None]:
# `languages` exploded to one row per element, keyed by record_id.
df_record_languages

In [None]:
# `subjects` exploded to one row per element, keyed by record_id.
df_record_subjects


In [None]:
# `publishers` exploded to one row per element, keyed by record_id.
df_record_publishers


In [None]:
# `relations` exploded to one row per element, keyed by record_id.
df_record_relations


In [None]:
# `rights` exploded to one row per element, keyed by record_id.
df_record_rights

In [None]:
# `set_id` expanded sideways into `set_<label>` columns, one row per record.
df_record_sets