In [1]:
import pandas as pd
import numpy as np
import os
import glob
import json
import datetime
from dateutil.parser import parse
import gc
from tqdm import tqdm
tqdm.pandas()

In [2]:
# Load the input table from the local HDF5 store. No key is passed, so
# pandas picks the dataset itself.
s6_df = pd.read_hdf(path_or_buf="s6_hdf.h5")

In [3]:
def parse_date(x):
    """Parse ``x`` with ``dateutil.parser.parse``, returning None on failure.

    Parameters
    ----------
    x : object
        Value handed to ``parse``; expected to be a date/time string.

    Returns
    -------
    datetime.datetime or None
        Whatever ``parse`` returns, or None when parsing raised an error.
    """
    try:
        return parse(x)
    except Exception:
        # Best-effort parsing: any ordinary failure maps to None. Catching
        # ``Exception`` instead of using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit propagate, so a long-running
        # ``apply`` over millions of rows can still be interrupted.
        return None
    
def find_earliest_date(dates):
    """Return the earliest parseable date among ``dates``.

    Parameters
    ----------
    dates : iterable of str, str, or None
        Date strings to parse with ``parse_date``. A single string is
        treated as a one-element collection rather than being iterated
        character by character. Non-iterable values (None, NaN, ...) are
        treated as "no dates".

    Returns
    -------
    datetime.datetime or None
        The minimum of the successfully parsed values, or None when the
        input is empty, not iterable, contains nothing parseable, or the
        parsed values cannot be compared with each other.
    """
    if isinstance(dates, str):
        # Iterating a bare string would parse each character separately.
        dates = [dates]

    try:
        parsed = [parse_date(date) for date in dates]
    except TypeError:
        # ``dates`` is not iterable (e.g. None or a float NaN).
        return None

    parsed = [value for value in parsed if value is not None]
    if not parsed:
        return None

    try:
        return min(parsed)
    except TypeError:
        # Raised when timezone-aware and naive datetimes are compared;
        # keep the historical behaviour of yielding None for such rows.
        return None
    
            
def find_latest_date(dates):
    """Return the latest parseable date among ``dates``.

    Parameters
    ----------
    dates : iterable of str, str, or None
        Date strings to parse with ``parse_date``. A single string is
        treated as a one-element collection rather than being iterated
        character by character. Non-iterable values (None, NaN, ...) are
        treated as "no dates".

    Returns
    -------
    datetime.datetime or None
        The maximum of the successfully parsed values, or None when the
        input is empty, not iterable, contains nothing parseable, or the
        parsed values cannot be compared with each other.
    """
    if isinstance(dates, str):
        # Iterating a bare string would parse each character separately.
        dates = [dates]

    try:
        parsed = [parse_date(date) for date in dates]
    except TypeError:
        # ``dates`` is not iterable (e.g. None or a float NaN).
        return None

    parsed = [value for value in parsed if value is not None]
    if not parsed:
        return None

    try:
        return max(parsed)
    except TypeError:
        # Raised when timezone-aware and naive datetimes are compared;
        # keep the historical behaviour of yielding None for such rows.
        return None
        

In [4]:
# Preview the first five rows of the freshly loaded frame.
s6_df.head(n=5)

Unnamed: 0,_id.$oid,ocid,id,contractPeriod_startDate,contractPeriod_endDate,parties_name,parties_contactPoint_name
0,62a913e7db2a665c4fce6223,ocds-ty10ed-03-06-2022-10:13:43,11,[2022-06-14T11:21:02Z],[2022-06-14T11:21:02Z],SESEA AGUASCALIENTES,Monica Lizbeth de la Cruz Ortega
1,62a913e7db2a665c4fce6223,ocds-ty10ed-03-06-2022-10:13:43,11,[2022-06-14T11:21:02Z],[2022-06-14T11:21:02Z],Nissan Torres Corzo,Grupo Torres Corzo Automotriz de Aguascaliente...
2,62a913e7db2a665c4fce6226,ocds-ty10ed-07-06-2022-10:37:52,14,[2022-06-14T11:21:02Z],[2022-06-14T11:21:02Z],SESEA AGUASCALIENTES,Monica Lizbeth de la Cruz Ortega
3,62a913e7db2a665c4fce6226,ocds-ty10ed-07-06-2022-10:37:52,14,[2022-06-14T11:21:02Z],[2022-06-14T11:21:02Z],Grant Limpieza y mantenimiento,Grant Limpieza y mantenimiento
4,62a913e7db2a665c4fce6225,ocds-ty10ed-06-06-2022-08:06:10,13,[2022-06-14T11:21:02Z],[2022-06-14T11:21:02Z],SESEA AGUASCALIENTES,Monica Lizbeth de la Cruz Ortega


In [5]:
# Collapse each row's list of start/end date strings into a single value:
# the earliest start and the latest end. ``progress_apply`` is the tqdm-backed
# ``apply`` registered by ``tqdm.pandas()``; the recorded run took roughly
# 13.5 minutes per column on ~4.08M rows.
s6_df["earliest_contractPeriod_startDate"] = (
    s6_df["contractPeriod_startDate"].progress_apply(find_earliest_date)
)
s6_df["latest_contractPeriod_endDate"] = (
    s6_df["contractPeriod_endDate"].progress_apply(find_latest_date)
)

  0%|          | 0/4077798 [00:00<?, ?it/s]

100%|██████████| 4077798/4077798 [13:34<00:00, 5006.26it/s] 
100%|██████████| 4077798/4077798 [13:35<00:00, 5001.17it/s] 


In [10]:
# The raw list-valued date columns are no longer needed once the
# earliest/latest columns exist. ``errors="ignore"`` keeps this cell
# idempotent: re-running it after the columns are already gone is a no-op
# instead of a KeyError.
s6_df = s6_df.drop(
    columns=["contractPeriod_startDate", "contractPeriod_endDate"],
    errors="ignore",
)

In [14]:
# Normalise both derived columns to UTC and keep only the calendar date.
# ``pd.to_datetime`` is applied to the whole column at once rather than
# element by element through ``progress_apply``, which avoids millions of
# Python-level calls while producing the same UTC timestamps.
for date_col in ("earliest_contractPeriod_startDate", "latest_contractPeriod_endDate"):
    s6_df[date_col] = pd.to_datetime(s6_df[date_col], utc=True).dt.date

In [15]:
# Display the frame after the raw date columns were replaced by the derived
# earliest-start / latest-end date columns.
s6_df

Unnamed: 0,_id.$oid,ocid,id,parties_name,parties_contactPoint_name,earliest_contractPeriod_startDate,latest_contractPeriod_endDate
0,62a913e7db2a665c4fce6223,ocds-ty10ed-03-06-2022-10:13:43,11,SESEA AGUASCALIENTES,Monica Lizbeth de la Cruz Ortega,2022-06-14,2022-06-14
1,62a913e7db2a665c4fce6223,ocds-ty10ed-03-06-2022-10:13:43,11,Nissan Torres Corzo,Grupo Torres Corzo Automotriz de Aguascaliente...,2022-06-14,2022-06-14
2,62a913e7db2a665c4fce6226,ocds-ty10ed-07-06-2022-10:37:52,14,SESEA AGUASCALIENTES,Monica Lizbeth de la Cruz Ortega,2022-06-14,2022-06-14
3,62a913e7db2a665c4fce6226,ocds-ty10ed-07-06-2022-10:37:52,14,Grant Limpieza y mantenimiento,Grant Limpieza y mantenimiento,2022-06-14,2022-06-14
4,62a913e7db2a665c4fce6225,ocds-ty10ed-06-06-2022-08:06:10,13,SESEA AGUASCALIENTES,Monica Lizbeth de la Cruz Ortega,2022-06-14,2022-06-14
...,...,...,...,...,...,...,...
4077793,650873de6726d47394a0b0ca,ocds-j3etk6-LS-IVAI-004-2021,ocds-j3etk6-LS-IVAI-004-2021.v721.implementation,OFIX,ARMANDO AGUIRRE VALDÉS,2021-07-09,2021-08-25
4077794,650873de6726d47394a0b0ca,ocds-j3etk6-LS-IVAI-004-2021,ocds-j3etk6-LS-IVAI-004-2021.v721.implementation,TREVIÑO COMPUTACIÓN,GRACIELA AMANDA GARCIA TREVIÑO,2021-07-09,2021-08-25
4077795,650873de6726d47394a0b0ca,ocds-j3etk6-LS-IVAI-004-2021,ocds-j3etk6-LS-IVAI-004-2021.v721.implementation,EQUIPOS DE OFICINA DE VERACRUZ S.A. DE C.V.,EDGAR RUIZ LÓPEZ,2021-07-09,2021-08-25
4077796,650873de6726d47394a0b0ca,ocds-j3etk6-LS-IVAI-004-2021,ocds-j3etk6-LS-IVAI-004-2021.v721.implementation,TOTAL COPIERS,TIRZO ANTONIO TRONCO MORALES,2021-07-09,2021-08-25


In [16]:
# Persist the processed frame to a new HDF5 file under the key "s6_df".
# The recorded run emitted a PyTables warning that the object-dtype columns
# are pickled because they cannot be mapped directly to C types.
s6_df.to_hdf(path_or_buf="s6_hdf_dates.h5", key="s6_df")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_values] [items->Index(['_id.$oid', 'ocid', 'id', 'parties_name', 'parties_contactPoint_name',
       'earliest_contractPeriod_startDate', 'latest_contractPeriod_endDate'],
      dtype='object')]

  s6_df.to_hdf("s6_hdf_dates.h5", key = "s6_df")
