In [1]:
import pandas as pd
import numpy as np
import os
import glob
import json
import datetime
from dateutil.parser import parse
import gc

In [2]:
# Load the release records from the JSON export into a DataFrame.
s6_df = pd.read_json(path_or_buf="ver_releases.json")

In [3]:
# List the column labels of the frame loaded from ver_releases.json.
s6_df.columns

Index(['_id', 'ocid', 'id', 'date', 'tag', 'initiationType', 'parties',
       'planning', 'tender', 'buyer', 'awards', 'contracts', 'language'],
      dtype='object')

In [4]:
# Pull the '$oid' value out of each nested '_id' mapping into its own column.
s6_df['_id.$oid'] = s6_df['_id'].map(lambda oid_doc: oid_doc.get('$oid'))

In [5]:
# Keep only the identifier columns plus the nested "parties" / "awards" payloads.
keep_cols = ['_id.$oid', "ocid", "id", "parties", "awards"]
# Take an explicit copy: new columns are assigned onto this frame afterwards,
# and assigning onto a bare column selection can emit SettingWithCopyWarning.
s6_df = s6_df[keep_cols].copy()

In [6]:
# Confirm that only the columns listed in keep_cols remain.
s6_df.columns

Index(['_id.$oid', 'ocid', 'id', 'parties', 'awards'], dtype='object')

In [7]:
# Preview the first rows of the trimmed frame.
s6_df.head()

Unnamed: 0,_id.$oid,ocid,id,parties,awards
0,650873de6726d47394a0b0a4,ocds-qdg43v-RM-181,ocds-qdg43v-RM-181.v34.implementation,"[{'name': 'SESEA', 'id': 'MX-RFC-SES171122F80'...","[{'id': 'Marco', 'status': 'active', 'date': '..."
1,650873de6726d47394a0b0a5,ocds-qdg43v-SESEAVER-SE-DA-RMySG-409-2022,ocds-qdg43v-SESEAVER-SE-DA-RMySG-409-2022.v28....,"[{'name': 'SESEA', 'id': 'MX-RFC-SES171122F80'...","[{'id': 'SESEAVER/SE/DA/RMySG/409/2022', 'titl..."
2,650873de6726d47394a0b0a6,ocds-qdg43v-SESEAVER-SE-DA-RMySG-360-2022,ocds-qdg43v-SESEAVER-SE-DA-RMySG-360-2022.v20....,"[{'name': 'SEA', 'id': 'MX-RFC-SES171122F80', ...","[{'id': 'SESEAVER/SE/DA/RMySG/360/2022', 'titl..."
3,650873de6726d47394a0b0ac,ocds-qdg43v-SESEAVER-SE-DA-RMySG-408-2022,ocds-qdg43v-SESEAVER-SE-DA-RMySG-408-2022.v16....,"[{'name': 'SESEA', 'id': 'MX-RFC-SES171122F80'...","[{'id': 'SESEAVER/SE/DA/RMySG/408/2022', 'titl..."
4,650873de6726d47394a0b0a7,ocds-qdg43v-SESEAVER-SE-DA-RMySG-414-2022,ocds-qdg43v-SESEAVER-SE-DA-RMySG-414-2022.v27....,"[{'name': 'SEA', 'id': 'MX-RFC-SES171122F80', ...","[{'id': 'SESEAVER/SE/DA/RMySG/414/2022', 'titl..."


In [8]:
def parse_date(x):
    """Best-effort date parsing.

    Parameters
    ----------
    x : object
        Value handed to ``parse`` (imported from ``dateutil.parser``);
        expected to be a date string, but any value is accepted.

    Returns
    -------
    object or None
        Whatever ``parse`` returns for ``x``, or ``None`` when ``parse``
        raises an ordinary exception (e.g. ``x`` is ``None`` or is not a
        recognisable date).
    """
    try:
        return parse(x)
    except Exception:
        # Deliberately broad so that any malformed value maps to None, but
        # not a bare ``except``: KeyboardInterrupt / SystemExit must still
        # propagate so a long-running apply can be interrupted.
        return None

In [9]:
def find_start_date(awards):
    """Return the earliest contract start and the latest contract end of a release.

    Parameters
    ----------
    awards : list of dict
        Award objects; each may carry a ``contractPeriod`` mapping with
        ``startDate`` and/or ``endDate`` values. A value that is not a list
        or tuple (e.g. a missing value) is treated as "no awards".

    Returns
    -------
    tuple
        ``(earliest_start, latest_end)``. Each element is the value returned
        by ``parse_date`` for the winning award, or ``np.datetime64('NaT')``
        when no award provided a parseable value for that side of the period.
    """
    if not isinstance(awards, (list, tuple)):
        awards = []

    start_dates = []
    end_dates = []
    for award in awards:
        # ``or {}`` also covers an award whose "contractPeriod" is explicitly null.
        period = award.get("contractPeriod") or {}

        start_date = parse_date(period.get("startDate"))
        if start_date is not None:
            start_dates.append(start_date)

        end_date = parse_date(period.get("endDate"))
        if end_date is not None:
            end_dates.append(end_date)

    earliest_start = min(start_dates) if start_dates else np.datetime64('NaT')
    # Decide on end_dates (not start_dates): the two lists can differ in
    # length when an award has only one side of the period filled in.
    oldest_end = max(end_dates) if end_dates else np.datetime64('NaT')

    return earliest_start, oldest_end
    

In [10]:
# One (earliest start, latest end) tuple per row, computed from its "awards" list.
res = s6_df["awards"].apply(find_start_date)

In [11]:
# Unzip the per-row (start, end) tuples into two parallel columns.
(
    s6_df["contractPeriod_startDate"],
    s6_df["contractPeriod_endDate"],
) = zip(*res)
# The nested "awards" payload is not used again once the dates are extracted.
s6_df = s6_df.drop(columns="awards")

In [12]:
# Expand the "parties" lists so that every list element gets its own row.
s6_df = s6_df.explode(column="parties")

In [13]:
# Preview the frame after the explode on "parties" (the index labels repeat).
s6_df.head()

Unnamed: 0,_id.$oid,ocid,id,parties,contractPeriod_startDate,contractPeriod_endDate
0,650873de6726d47394a0b0a4,ocds-qdg43v-RM-181,ocds-qdg43v-RM-181.v34.implementation,"{'name': 'SESEA', 'id': 'MX-RFC-SES171122F80',...",2022-12-14 00:00:00-06:00,2022-12-14 00:00:00-06:00
0,650873de6726d47394a0b0a4,ocds-qdg43v-RM-181,ocds-qdg43v-RM-181.v34.implementation,"{'name': 'Marco Gonzalo Hernández Aburto', 'id...",2022-12-14 00:00:00-06:00,2022-12-14 00:00:00-06:00
0,650873de6726d47394a0b0a4,ocds-qdg43v-RM-181,ocds-qdg43v-RM-181.v34.implementation,"{'name': 'Vinculación Interinstitucional ', 'i...",2022-12-14 00:00:00-06:00,2022-12-14 00:00:00-06:00
0,650873de6726d47394a0b0a4,ocds-qdg43v-RM-181,ocds-qdg43v-RM-181.v34.implementation,"{'name': 'Recursos Materiales', 'id': 'MX-RFC-...",2022-12-14 00:00:00-06:00,2022-12-14 00:00:00-06:00
0,650873de6726d47394a0b0a4,ocds-qdg43v-RM-181,ocds-qdg43v-RM-181.v34.implementation,"{'name': 'José Antonio Lara Apodaca', 'id': 'M...",2022-12-14 00:00:00-06:00,2022-12-14 00:00:00-06:00


In [14]:
def extract_parties_names(parties):
    """Extract the party name and its contact-point name from one party entry.

    Parameters
    ----------
    parties : dict
        A single party mapping; may contain ``"name"`` and a
        ``"contactPoint"`` mapping with its own ``"name"``. Non-dict values
        (e.g. the missing value ``explode`` yields for an empty list) are
        accepted and produce ``(None, None)``.

    Returns
    -------
    tuple
        ``(name, contact_name)``; either element is ``None`` when absent.
    """
    if not isinstance(parties, dict):
        return None, None

    name = parties.get("name")
    # "contactPoint" may be missing or explicitly null; only read it when it is a mapping.
    contact_point = parties.get("contactPoint")
    contact = contact_point.get("name") if isinstance(contact_point, dict) else None
    return name, contact

In [15]:
# One (party name, contact-point name) tuple per exploded row.
res_contact = s6_df["parties"].apply(extract_parties_names)
(
    s6_df["parties_name"],
    s6_df["parties_contactPoint_name"],
) = zip(*res_contact)
# The nested "parties" payload is dropped once its names are extracted.
s6_df = s6_df.drop(columns="parties")

In [16]:
# Replace the index with a fresh 0..n-1 range, discarding the old labels.
s6_df = s6_df.reset_index(drop=True)

In [17]:
# Preview the frame with the party name / contact-point name columns.
s6_df.head()

Unnamed: 0,_id.$oid,ocid,id,contractPeriod_startDate,contractPeriod_endDate,parties_name,parties_contactPoint_name
0,650873de6726d47394a0b0a4,ocds-qdg43v-RM-181,ocds-qdg43v-RM-181.v34.implementation,2022-12-14 00:00:00-06:00,2022-12-14 00:00:00-06:00,SESEA,
1,650873de6726d47394a0b0a4,ocds-qdg43v-RM-181,ocds-qdg43v-RM-181.v34.implementation,2022-12-14 00:00:00-06:00,2022-12-14 00:00:00-06:00,Marco Gonzalo Hernández Aburto,Marco Gonzalo Hernández Aburto
2,650873de6726d47394a0b0a4,ocds-qdg43v-RM-181,ocds-qdg43v-RM-181.v34.implementation,2022-12-14 00:00:00-06:00,2022-12-14 00:00:00-06:00,Vinculación Interinstitucional,Laura Fernández León
3,650873de6726d47394a0b0a4,ocds-qdg43v-RM-181,ocds-qdg43v-RM-181.v34.implementation,2022-12-14 00:00:00-06:00,2022-12-14 00:00:00-06:00,Recursos Materiales,Raymundo Ojeda Morales
4,650873de6726d47394a0b0a4,ocds-qdg43v-RM-181,ocds-qdg43v-RM-181.v34.implementation,2022-12-14 00:00:00-06:00,2022-12-14 00:00:00-06:00,José Antonio Lara Apodaca,José Antonio Lara Apodaca


In [18]:
# Discard rows where either end of the contract period is missing.
s6_df = s6_df.dropna(
    subset=["contractPeriod_startDate", "contractPeriod_endDate"],
    how="any",
)

In [19]:
# Normalise both contract-period columns to UTC and keep only the calendar date.
# pd.to_datetime is applied to the whole column at once instead of once per
# element through apply(lambda ...).
# NOTE(review): the date is taken after the UTC conversion, so a late-evening
# local timestamp can land on the following day — confirm this is intended.
for date_col in ("contractPeriod_startDate", "contractPeriod_endDate"):
    s6_df[date_col] = pd.to_datetime(s6_df[date_col], utc=True).dt.date


In [20]:
# Preview the frame after the contract-period columns were reduced to dates.
s6_df.head()

Unnamed: 0,_id.$oid,ocid,id,contractPeriod_startDate,contractPeriod_endDate,parties_name,parties_contactPoint_name
0,650873de6726d47394a0b0a4,ocds-qdg43v-RM-181,ocds-qdg43v-RM-181.v34.implementation,2022-12-14,2022-12-14,SESEA,
1,650873de6726d47394a0b0a4,ocds-qdg43v-RM-181,ocds-qdg43v-RM-181.v34.implementation,2022-12-14,2022-12-14,Marco Gonzalo Hernández Aburto,Marco Gonzalo Hernández Aburto
2,650873de6726d47394a0b0a4,ocds-qdg43v-RM-181,ocds-qdg43v-RM-181.v34.implementation,2022-12-14,2022-12-14,Vinculación Interinstitucional,Laura Fernández León
3,650873de6726d47394a0b0a4,ocds-qdg43v-RM-181,ocds-qdg43v-RM-181.v34.implementation,2022-12-14,2022-12-14,Recursos Materiales,Raymundo Ojeda Morales
4,650873de6726d47394a0b0a4,ocds-qdg43v-RM-181,ocds-qdg43v-RM-181.v34.implementation,2022-12-14,2022-12-14,José Antonio Lara Apodaca,José Antonio Lara Apodaca


In [21]:
# Persist the flattened frame as a Parquet file.
s6_df.to_parquet(path="s6_small.parquet")