In [1]:
"""

Documentation notes:

- By a laws' subject, I mean P1269 (facets of).


"""

"\n\nDocumentation notes:\n\n- By a laws' subject, I mean P1269 (facets of).\n\n\n"

In [2]:
import collections
import re
import time

import requests
from lxml import etree

In [3]:
# XML namespace prefixes passed to lxml when searching the SRU response.
# NOTE(review): the URI conventionally bound to `xsi` is
# 'http://www.w3.org/2001/XMLSchema-instance'; confirm that the value below
# is intentional before relying on that prefix.
ns = dict(
    srw_dc='info:srw/schema/1/dc-schema',
    dc='http://purl.org/dc/elements/1.1/',
    srw='http://www.loc.gov/zing/srw/',
    xsi='http://www.w3.org/2001/XMLSchema',
)

In [4]:
"""
Law class that will store all of the scraped data.
"""
class law:
    """
    Record holding everything scraped for a single law.

    Every attribute starts out empty and is filled in by the parsing
    functions. All attributes are strings except `subject`, which is a
    list of strings.
    """
    def __init__(self):
        self.tipoDocumento = ""   # type of document
        self.date = ""
        self.urn = ""
        self.localidade = ""      # locality
        self.autoridade = ""      # authority
        self.title = ""
        self.description = ""
        self.identifier = ""
        self.subject = []         # list of subject strings

    def print_self(self):
        """
        Serializes the law as one '*'-separated line ending in a newline.

        The eight scalar fields come first, in the order tipoDocumento,
        date, urn, localidade, autoridade, title, description, identifier,
        followed by every entry of `subject`.

        Returns:
            The serialized line as a string.
        """
        scalar_fields = (
            self.tipoDocumento, self.date, self.urn, self.localidade,
            self.autoridade, self.title, self.description, self.identifier,
        )
        # A field may have been overwritten with None (an XML element with no
        # text); write it as an empty column instead of raising TypeError.
        columns = "*".join("" if value is None else str(value) for value in scalar_fields)
        subjects = "*".join(str(item) for item in (self.subject or []))
        return columns + "*" + subjects + "\n"

"""
Lexicon class that will store all subjects covered by laws, as well
as how many laws include it.
"""
class lexicon:
    """
    Pairs a subject term with the number of laws that include it.
    """
    def __init__(self, word, count):
        # Store both constructor arguments unchanged.
        self.word, self.count = word, count
        

In [5]:
"""
Extracts the relevant information from the current item. Creates a law
object and stores that data on it.
"""
def get_values(x):
    """
    Builds a law object from one record element.

    Walks `x` and all of its descendants, comparing each element's tag with
    the module-level tag constants (tD, date, urn, localidade, autoridade,
    title, description, ID, subject) and copying the element text onto the
    matching attribute of a new law.

    NOTE(review): those tag constants are not defined in the visible part of
    this notebook; they must exist before this function is called.

    Elements without text are skipped, so the attribute keeps its empty
    default. The subject text is split on " , " / " . " separators; each
    piece is stripped, a trailing " ." is removed, empty pieces are dropped
    and duplicates are removed while keeping first-seen order.

    Args:
        x: element exposing iter(), whose items expose .tag and .text.

    Returns:
        The populated law object.
    """
    new_law = law()
    for node in x.iter():
        tag = node.tag
        text = node.text
        if text is None:
            # Nothing to store; leave the default value in place.
            continue
        if tag == tD:
            # The attribute is tipoDocumento (what print_self writes out),
            # not `tipo`.
            new_law.tipoDocumento = text
        elif tag == date:
            new_law.date = text
        elif tag == urn:
            new_law.urn = text
        elif tag == localidade:
            new_law.localidade = text
        elif tag == autoridade:
            new_law.autoridade = text
        elif tag == title:
            new_law.title = text
        elif tag == description:
            new_law.description = text
        elif tag == ID:
            new_law.identifier = text
        elif tag == subject:
            cleaned = []
            for part in re.split(r'\s[,.]\s', text):
                part = part.strip()
                # Only a *trailing* " ." is a leftover terminator.
                if part.endswith(" ."):
                    part = part[:-2].rstrip()
                if part:
                    cleaned.append(part)
            # De-duplicate after cleaning so "a" and "a ." collapse together.
            new_law.subject = list(dict.fromkeys(cleaned))
    return new_law

In [6]:
# Names of the law attributes that are filled straight from XML elements
# whose local tag name is identical to the attribute name.
attributes = ("tipoDocumento", "date", "urn", "localidade", "autoridade", "title", "description", "identifier", "subject")


def get_values_test(x):
    """
    Builds a law object from one record element, matching by local tag name.

    Walks `x` and all of its descendants. The namespace part of each tag
    ("{uri}name") is dropped, and when the remaining name is listed in
    `attributes` the element text is stored on the attribute of that name.

    Elements without text are skipped, so the attribute keeps its empty
    default. The subject text is split on " , " / " . " separators; each
    piece is stripped, a trailing " ." is removed, empty pieces are dropped
    and duplicates are removed while keeping first-seen order.

    Args:
        x: element exposing iter(), whose items expose .tag and .text.

    Returns:
        The populated law object.
    """
    new_law = law()
    for node in x.iter():
        tag = node.tag
        # lxml's iter() can also yield comments and processing instructions,
        # whose .tag is not a string; they carry no field data.
        if not isinstance(tag, str):
            continue
        # "{namespace}local" -> "local"; tags without a namespace are kept.
        tag = tag.rsplit("}", 1)[-1]
        text = node.text
        if tag not in attributes or text is None:
            continue
        if tag != "subject":
            setattr(new_law, tag, text)
            continue
        cleaned = []
        for part in re.split(r'\s[,.]\s', text):
            part = part.strip()
            # Only a *trailing* " ." is a leftover terminator.
            if part.endswith(" ."):
                part = part[:-2].rstrip()
            if part:
                cleaned.append(part)
        # De-duplicate after cleaning so "a" and "a ." collapse together.
        setattr(new_law, tag, list(dict.fromkeys(cleaned)))
    return new_law

In [7]:
"""
Gets the lexicon of subjects of all laws. Some laws list the
exact same topic under their subject twice. Those repetitions are discarded
when each law is parsed, so a law counts a given subject at most once.
"""
def get_lexicon(laws):
    """
    Counts how many times each subject occurs across `laws`.

    Args:
        laws: iterable of objects exposing a `subject` list.

    Returns:
        A list of (subject, count) tuples ordered by count, highest first.
    """
    tally = collections.Counter(
        topic for entry in laws for topic in entry.subject
    )
    # most_common() with no argument returns every item, sorted by count
    # in descending order.
    return tally.most_common()


"""
Prints the lexicon into file_name_lexicon.txt, in the format:
total_of_occurrences*word
"""
def print_lexicon(lexicon, file_name):
    """
    Writes the lexicon to `file_name + "_lexicon.txt"`.

    Each output line has the form count*term.

    Args:
        lexicon: iterable of (term, count) pairs.
        file_name: output path without the "_lexicon.txt" suffix.
    """
    # The context manager closes the file even if a write fails.
    with open(file_name + "_lexicon.txt", "w") as file_lex:
        for key, value in lexicon:
            file_lex.write(str(value) + "*" + key + "\n")
    
    
"""    
Prints all data scraped from the laws into file_name.txt,
separated by *, in the format:
Type_of_document*date*urn*locality*authority*title*description*identifier*subjects
"""
def print_scraped_info(laws, file_name):
    """
    Writes a header line followed by one line per law to `file_name + ".txt"`.

    Args:
        laws: iterable of objects whose print_self() returns the
            already-terminated line to write.
        file_name: output path without the ".txt" suffix.
    """
    # The context manager closes the file even if a write fails.
    with open(file_name + ".txt", "w") as out:
        out.write("tipo de documento*data*urn*localidade*autoridade*" + \
                  "título*descricao*identifier*assuntos->\n")
        for entry in laws:
            out.write(entry.print_self())
        
        
"""    
Scrapes all laws in url_base, in the range [1, 500 * n]. Returns a list
of objects with each law's attributes in laws[], and prints the result to
file_name.txt. Entries are separated by *. If lexicon_flag is set, a
file_name_lexicon.txt is also generated with the lexicon of subjects in
descending order of how often each subject has appeared.
"""
def scrape_site (url_base, laws, file_name, n, lexicon_flag = False):
    """
    Downloads `n` result pages and collects one law object per record.

    Page i is requested at `url_base + str(i * 500 + 1)`, so `url_base`
    must end where the start-record number goes. Every <srw_dc:dc> element
    of a response is parsed with get_values_test and appended to `laws`
    (the list is modified in place). Afterwards all laws are written with
    print_scraped_info and, when `lexicon_flag` is true, the subject
    lexicon is written with print_lexicon.

    Args:
        url_base: query URL up to and including "startRecord=".
        laws: list that receives the parsed laws.
        file_name: base name for the output file(s).
        n: number of pages to request.
        lexicon_flag: also write the subject lexicon when true.

    Returns:
        The same `laws` list that was passed in.

    Raises:
        requests.HTTPError: when a page answers with an error status.
        requests.Timeout: when a page does not answer in time.
    """
    # Step between start records; the notebook's URLs ask for
    # maximumRecords=500, which this value has to match.
    records_per_page = 500
    request_timeout = 60   # seconds; without it a stalled server hangs forever
    pause = 2              # seconds between consecutive requests

    for page in range(n):
        url = url_base + str(page * records_per_page + 1)
        response = requests.get(url, timeout=request_timeout)
        # Fail loudly instead of trying to parse an error page as a result.
        response.raise_for_status()
        tree = etree.fromstring(response.content)

        # Each <srw_dc:dc> element is one record.
        for record in tree.findall(".//srw_dc:dc", namespaces=ns):
            laws.append(get_values_test(record))

        # Being polite: wait between requests, but not after the last one.
        if page < n - 1:
            time.sleep(pause)

    if lexicon_flag:
        print_lexicon(get_lexicon(laws), file_name)

    print_scraped_info(laws, file_name)

    return laws


In [8]:
# Url to scrape all federal laws (leis federais)
#url = "https://www.lexml.gov.br/busca/SRU?operation=searchRetrieve&query=urn+=%22lei+federal%22&maximumRecords=500&startRecord=" 

# Url to scrape all law-decrees (decretos-lei)
url = "https://www.lexml.gov.br/busca/SRU?operation=searchRetrieve&query=urn+=%22federal+decreto.lei%22&maximumRecords=500&startRecord="

# Fetch two pages starting from an empty list, and also write the lexicon.
laws = scrape_site(
    url_base=url,
    laws=[],
    file_name="leis2222",
    n=2,
    lexicon_flag=True,
)

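The two generated lexicon files can be combined on the command line with the command below. Note that it only removes identical lines and does not add up the counts of a term that appears in both files; `merge_lexicon_files` below does that.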

> sort file1.txt file2.txt | uniq > exit_file

In [74]:
"""
Receives a list of strings with previously generated lexicon file names.
Generates a file named merged_lexicon_files.txt in the current folder,
with the sum of how many times each term has appeared in each lexicon file.
"""
def merge_lexicon_files (list_of_files):
    """
    Adds up the per-term counts of several lexicon files.

    Every input line is expected to look like count*term. The totals are
    written to merged_lexicon_files.txt in the current working directory,
    one count*term line per term, highest total first.

    Args:
        list_of_files: paths of the lexicon files to merge.

    Returns:
        A list of (term, total) tuples in the order they were written.

    Raises:
        ValueError: when a non-blank line has no '*' or its count is not
            an integer.
    """
    totals = {}
    for file_name in list_of_files:
        # The context manager closes each input file once it has been read.
        with open(file_name) as lexicon_file:
            for line in lexicon_file:
                if not line.strip():
                    # Tolerate blank lines, e.g. a trailing one.
                    continue
                # Split on the first '*' only, so a term that itself
                # contains '*' is kept whole.
                value, key = line.split("*", 1)
                key = key.strip()
                totals[key] = totals.get(key, 0) + int(value)

    merged = sorted(totals.items(), key = lambda item: item[1], reverse = True)

    with open("merged_lexicon_files.txt", "w") as merged_file:
        for key, total in merged:
            merged_file.write(str(total) + "*" + key + "\n")

    return merged
            
# Merge the two previously generated lexicon files (paths are relative to the
# current working directory); the merged totals are written to
# merged_lexicon_files.txt.
lex = merge_lexicon_files(["decretos-lei-complete_lexicon.txt", "leis-complete_lexicon.txt"])