<a href="https://colab.research.google.com/github/SocialxChange/Visualizations/blob/master/Tools/PadronElectoral.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Descarga Padrón Electoral

Para el 2013 [este repositorio](https://github.com/aifuenzalida/Servel-Extract) extrae los datos de los PDF.

El siguiente script descarga el padrón siguiento [el repo aifuenzalida/padron-servel](https://github.com/aifuenzalida/padron-servel/blob/master/servel_download.py)

In [1]:
import re
import os
import requests
import shutil
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import urllib.request
from pathlib import Path


# Variables
url_padron = "https://www.servel.cl/padron-electoral-definitivo-y-nomina-de-inhabilitados-3/"
output_folder = "output"


# Open and parse page
request = requests.get(url_padron)
page = BeautifulSoup(request.text, 'lxml')


# Find all PDF files links
main_content = page.find(class_='tab-content')
links = main_content.find_all('a', href=re.compile("pdf"))

full_links = []

for link in links:
    full_link = urljoin(url_padron, link['href'])
    full_links.append(full_link)

print("Found " + str(full_links.count) + " links")

Found <built-in method count of list object at 0x7f027bca1690> links


Esta descarga toma 20 minutos:

In [2]:
cwd = os.getcwd()
output_path = cwd + '/' + output_folder
Path(output_path).mkdir(exist_ok=True)

for full_link in full_links:
    print("Downloading " + full_link)
    local_filename = full_link.split('/')[-1]

    with requests.get(full_link, stream=True) as r:
        with open(output_path + '/' + local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)

print("End")

Downloading https://cdn.servel.cl/padron/A15101.pdf
Downloading https://cdn.servel.cl/padron/A15102.pdf
Downloading https://cdn.servel.cl/padron/A15202.pdf
Downloading https://cdn.servel.cl/padron/A15201.pdf
Downloading https://cdn.servel.cl/padron/A01107.pdf
Downloading https://cdn.servel.cl/padron/A01402.pdf
Downloading https://cdn.servel.cl/padron/A01403.pdf
Downloading https://cdn.servel.cl/padron/A01404.pdf
Downloading https://cdn.servel.cl/padron/A01101.pdf
Downloading https://cdn.servel.cl/padron/A01405.pdf
Downloading https://cdn.servel.cl/padron/A01401.pdf
Downloading https://cdn.servel.cl/padron/A02101.pdf
Downloading https://cdn.servel.cl/padron/A02201.pdf
Downloading https://cdn.servel.cl/padron/A02302.pdf
Downloading https://cdn.servel.cl/padron/A02102.pdf
Downloading https://cdn.servel.cl/padron/A02202.pdf
Downloading https://cdn.servel.cl/padron/A02203.pdf
Downloading https://cdn.servel.cl/padron/A02103.pdf
Downloading https://cdn.servel.cl/padron/A02104.pdf
Downloading 

In [5]:
import fitz
import pandas

doc = fitz.open('output/A06303.pdf') # Archivo origen
padron = []

for page in doc:
    print("Parsing Page " + str(page.number) + "/" + str(len(doc)))
    
    dic = page.getText("dict")

    # Campos están antecedidos por un dos puntos y espacio
    # ": DEL LIBERTADOR GENERAL BERNARDO O'HIGGINS"
    region = dic['blocks'][153]['lines'][1]['spans'][0]['text'][2:]
    provincia = dic['blocks'][154]['lines'][1]['spans'][0]['text'][2:]
    comuna = dic['blocks'][155]['lines'][1]['spans'][0]['text'][2:]

    # print(region, provincia, comuna)


    # Antes del bloque 156 se repite marca de agua SERVEL
    for block in dic['blocks'][157:]:
        # dependiendo de cuantas lineas tiene el bloque (parrafo) es como se interpreta el orden de los campos
        if len(block['lines']) == 6:
            nombre = block['lines'][0]['spans'][0]['text']
            ci = block['lines'][1]['spans'][0]['text']
            genero_direccion = block['lines'][2]['spans'][0]['text']
            gd_index = genero_direccion.find(' ')
            genero = genero_direccion[:gd_index]
            direccion = genero_direccion[gd_index+1:]
            circunscripcion = block['lines'][3]['spans'][0]['text']
            mesa = block['lines'][4]['spans'][0]['text']
            pueblo_indigena = block['lines'][5]['spans'][0]['text']

        else:
            nombre = block['lines'][0]['spans'][0]['text']
            ci = block['lines'][1]['spans'][0]['text']
            # ej: ' CONVENTO VIEJO 171 CALLE CONVENTO VIEJO CALLE CONVENTO VIEJO 171 CHIMBARONGO'
            genero_direccion = block['lines'][2]['spans'][0]['text']
            genero_index = genero_direccion.find(' ')
            genero = genero_direccion[:genero_index]
            direccion = genero_direccion[genero_index+1:]
            circunscripcion = block['lines'][3]['spans'][0]['text']
            mesa = block['lines'][4]['spans'][0]['text']
            pueblo_indigena = ""
        #print(nombre, ci, genero, direccion, circunscripcion, mesa, sep=',')

        padron.append({
            'Nombre': nombre,
            'CI': ci,
            'Genero': genero,
            'Direccion': direccion,
            'Circunscripcion': circunscripcion,
            'Mesa': mesa,
            'Region': region,
            'Provincia': provincia,
            'Comuna': comuna
            'Indigena': pueblo_indigena
        })
        
    #print('End page')

padron_df = pandas.DataFrame(padron)
padron_df.to_csv('./output_csv/A06303.csv', index=False) # CSV resultante

print('End')

SyntaxError: ignored