# Data extraction 

In this notebook I extract the data from my Zenodo database, keeping only the exact documents that ParlaMint uses.

In [1]:
pip install congreso

Note: you may need to restart the kernel to use updated packages.


In [2]:
from congreso import congreso as c 
import pandas as pd
from matplotlib import pyplot as plt

# Term identifiers (Roman numerals) to load; the same strings are used below
# as keys into `t` (e.g. t["X"], t["XIV"]).
terms = ["X", "XIV", "XIII", "XII", "XI"]
# Load the data for the listed terms via the congreso package.
# NOTE(review): presumably `t` maps each term identifier to that term's
# documents -- confirm against the congreso documentation.
t = c.load_jsons(terms)

Now that the data is extracted, we just need a few simple functions to filter the documents in congreso.db. These filters select by year, diario de sesiones, section and, finally, organ.

In [3]:
def filter_DS_documents(term_dictionary):
    """Return the documents whose ``"encabezado"`` field equals ``"DS"``.

    Args:
        term_dictionary: Iterable of document dicts; every document must
            provide an ``"encabezado"`` key.

    Returns:
        A new list holding the matching documents in their original order.
    """
    return [entry for entry in term_dictionary if entry["encabezado"] == "DS"]



In [4]:
def filter_docs(docs, year):
    """Select one year's "DS" documents for the Congress "Pleno", sorted by date.

    The documents are narrowed step by step:

    1. date range ``<year>0101`` .. ``<year>1231`` (via the congreso package),
    2. ``"encabezado"`` equal to ``"DS"`` (via ``filter_DS_documents``),
    3. ``"secc"`` equal to ``"CONGRESO"`` or ``"CONGRESO DE LOS DIPUTADOS"``,
    4. ``"orga"`` equal to ``"Pleno"``.

    Args:
        docs: Document collection accepted by
            ``c.get_documents_interval_dates``.
        year: Year used to build the date bounds (e.g. ``"2015"``).

    Returns:
        A list of the remaining documents ordered by their ``"fecha"`` value.
    """
    first_day, last_day = f"{year}0101", f"{year}1231"

    # Date range first, then keep only the "DS" documents.
    in_year = c.get_documents_interval_dates(docs, first_day, last_day)
    ds_docs = filter_DS_documents(in_year)

    # Section filter.
    accepted_sections = ("CONGRESO", "CONGRESO DE LOS DIPUTADOS")
    section_docs = [entry for entry in ds_docs if entry["secc"] in accepted_sections]

    # Organ filter.
    pleno_docs = [entry for entry in section_docs if entry["orga"] == "Pleno"]

    # Stable in-place sort by date; ties keep their incoming order.
    pleno_docs.sort(key=lambda entry: entry["fecha"])
    return pleno_docs


Finally, we can now get our filtered documents.

In [5]:
# Build one filtered document list per calendar year (d_<year>).
# Years drawn from a single term are passed straight to filter_docs().
d_2015 = filter_docs(t["X"], "2015")

# 2016 is assembled from three terms (X, XI, XII): each term is first narrowed
# to the 2016 date range, the pieces are concatenated, and the combined
# collection is then run through filter_docs().
d_2016_X = c.get_documents_interval_dates(t["X"], "20160101", "20161231")
d_2016_XI = c.get_documents_interval_dates(t["XI"], "20160101", "20161231")
d_2016_XII = c.get_documents_interval_dates(t["XII"], "20160101", "20161231")
# The concatenation order is XI, XII, X. filter_docs() sorts by "fecha" with a
# stable sort, so this order decides how same-date documents are arranged.
docs_2016 = d_2016_XI + d_2016_XII + d_2016_X
d_2016 = filter_docs(docs_2016, "2016")

d_2017 = filter_docs(t["XII"], "2017")

d_2018 = filter_docs(t["XII"], "2018")

# 2019 is likewise assembled from three terms (XII, XIII, XIV).
d_2019_XII = c.get_documents_interval_dates(t["XII"], "20190101", "20191231")
d_2019_XIII = c.get_documents_interval_dates(t["XIII"], "20190101", "20191231")
d_2019_XIV = c.get_documents_interval_dates(t["XIV"], "20190101", "20191231")
docs_2019 = d_2019_XII + d_2019_XIII + d_2019_XIV
d_2019 = filter_docs(docs_2019, "2019")

d_2020 = filter_docs(t["XIV"], "2020")

d_2021 = filter_docs(t["XIV"], "2021")

d_2022 = filter_docs(t["XIV"], "2022")

No documents between 2016-01-01 and 2016-12-31 for the term X


In [6]:
# Report the number of filtered documents per year, one line each.
# Every entry pairs the printed label with the document list it describes.
yearly_reports = (
    ("Docs 2015 filtered =", d_2015),
    ("Docs 2016 filtered =", d_2016),
    ("Docs 2017 filtered =", d_2017),
    ("Docs 2018 filtered =", d_2018),
    ("Docs 2019 filtered =", d_2019),
    ("Docs 2020 filtered =", d_2020),
    ("Docs 2021 filtered =", d_2021),
    ("Docs 2022 filtered =", d_2022),
)

for report_label, yearly_docs in yearly_reports:
    print(report_label, c.num_docs_term(yearly_docs))


Docs 2015 filtered = 57
Docs 2016 filtered = 36
Docs 2017 filtered = 70
Docs 2018 filtered = 72
Docs 2019 filtered = 19
Docs 2020 filtered = 68
Docs 2021 filtered = 76
Docs 2022 filtered = 84


Now I save the data in JSON format.

In [16]:
import json
import os

# Map each year to its filtered documents; one JSON file is written per entry.
dicts_by_year = {
    2015: d_2015,
    2016: d_2016,
    2017: d_2017,
    2018: d_2018,
    2019: d_2019,
    2020: d_2020,
    2021: d_2021,
    2022: d_2022
}

# Create the output directory up front: open(..., "w") does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
output_dir = "clean-data"
os.makedirs(output_dir, exist_ok=True)

for year, doc_dict in dicts_by_year.items():
    new_file = f"{output_dir}/d_{year}.json"
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    with open(new_file, "w", encoding="utf-8") as f:
        json.dump(doc_dict, f, ensure_ascii=False, indent=4)
    print(f"Saved: {new_file}")



Saved: clean-data/d_2015.json
Saved: clean-data/d_2016.json
Saved: clean-data/d_2017.json
Saved: clean-data/d_2018.json
Saved: clean-data/d_2019.json
Saved: clean-data/d_2020.json
Saved: clean-data/d_2021.json
Saved: clean-data/d_2022.json


In [15]:
# Spot-check: show the "fecha" of the first 2019 document. filter_docs() sorts
# its result by "fecha", so this is the lowest date value in d_2019.
print(d_2019[0]["fecha"])

20190122
