In [2]:
"""
LEXML SCRAPER

Scrapes the information regarding laws available on https://www.lexml.gov.br/. 
By setting lexicon_flag, you can also generate an extra file with all of the laws' subjects
and how frequently they appeared. (law's subjects = P1269 (facets of))

To scrape different types of law, change the keywords in the search URL. 

The output files are .txt separated by *. 
"""

"\nLEXML SCRAPER\n\nScrapes the information regarding laws available on https://www.lexml.gov.br/. \nBy using a lexicon_flag, you can also generate an extra file with all law's subjects\nand how frequently they appareted. (law's subjects = P1269 (facets of))\n\nTo scrape different types of law, change the keywords in the search URL. \n\nThe output files are .txt separated by *. \n"

In [3]:
from lxml import etree
import re
import requests
import collections
import time

# XML namespace prefixes, passed as `namespaces=` when searching the parsed
# response tree.
ns = dict(
    srw_dc='info:srw/schema/1/dc-schema',
    dc='http://purl.org/dc/elements/1.1/',
    srw='http://www.loc.gov/zing/srw/',
    xsi='http://www.w3.org/2001/XMLSchema',
)

In [4]:
"""
Law class that will store all of the scraped data.
"""
class law:
    """
    Holds the fields scraped for a single law record.

    Every scalar field starts as an empty string and `subject` starts as an
    empty list; the scraper overwrites them with whatever it finds.
    """

    def __init__(self):
        self.tipoDocumento = ""
        self.date = ""
        self.urn = ""
        self.localidade = ""
        self.autoridade = ""
        self.title = ""
        self.description = ""
        self.identifier = ""
        self.subject = []

    def print_self(self):
        """
        Returns the record as one "*"-separated line terminated by "\\n", in
        the order: tipoDocumento, date, urn, localidade, autoridade, title,
        description, identifier, followed by every subject.

        Fields that are None (e.g. assigned from an XML element that had no
        text) are written as empty strings instead of raising a TypeError.
        """
        scalar_fields = (
            self.tipoDocumento, self.date, self.urn, self.localidade,
            self.autoridade, self.title, self.description, self.identifier,
        )
        columns = ["" if value is None else str(value) for value in scalar_fields]
        subjects = ["" if s is None else str(s) for s in (self.subject or [])]
        # The separator before the subjects is always emitted, even when the
        # subject list is empty, so the column count stays stable.
        return "*".join(columns) + "*" + "*".join(subjects) + "\n"

"""
Used for keeping track of how many times each law subject was used.
"""
class lexicon:
    """Pairs a law subject (`word`) with the number of times it was used (`count`)."""

    def __init__(self, word, count):
        self.word, self.count = word, count
        

In [7]:
"""
Receives an x item, extracts all attributes in it and saves them to a 
new_law object, which is returned.
"""
def _split_subjects(text):
    """
    Splits a raw subject string on " , " / " . " separators and returns the
    unique, non-empty subjects in their order of first appearance.
    """
    if not text:
        return []
    subjects = []
    for piece in re.split(r'\s[,.]\s', text):
        piece = piece.strip()
        # The last subject usually keeps a trailing " ." because the final dot
        # is not followed by whitespace; drop it only when it is at the end.
        if piece.endswith(" ."):
            piece = piece[:-2].rstrip()
        if piece:
            subjects.append(piece)
    # De-duplicate AFTER the cleanup (so "X" and "X ." collapse) and keep the
    # original order so the output is deterministic between runs.
    return list(dict.fromkeys(subjects))


def get_values(x):
    """
    Receives an x item (an XML element), walks over it and all of its
    descendants, and copies the text of every element whose local tag name is
    listed in `attributes` into a new law object, which is returned.

    - "subject" text is split into a list of unique subjects.
    - Any other attribute is stored as text with newlines removed.
    - Elements without text are skipped, so the law keeps its default value.

    The input element is not modified.
    """
    new_law = law()
    for element in x.iter():
        tag = element.tag
        # lxml comment / processing-instruction nodes expose a non-string tag.
        if not isinstance(tag, str):
            continue
        # Strip the "{namespace}" prefix to get the local tag name.
        if "}" in tag:
            tag = tag.split("}")[1]
        if tag not in attributes:
            continue
        text = element.text
        if text is None:
            continue
        if tag == "subject":
            setattr(new_law, tag, _split_subjects(text))
        else:
            setattr(new_law, tag, text.replace("\n", ""))
    return new_law

"""
Receives a list of laws. Returns a lexicon containing all of the laws'
subjects and how frequently they were used.
Duplicate subjects within the same law were already removed by get_values,
so each law counts a given subject at most once.
"""
def get_lexicon(laws):
    """
    Counts how often each subject occurs across all of the given laws.

    Returns a list of (subject, count) tuples ordered from the most to the
    least frequent; subjects with equal counts keep the order in which they
    were first seen.
    """
    subject_counts = collections.Counter(
        subject for entry in laws for subject in entry.subject
    )
    return subject_counts.most_common()


"""
Receives a lexicon list and prints its contents into file_name_lexicon.txt.
"""
def print_lexicon(lexicon, file_name):
    """
    Writes the lexicon to "<file_name>_lexicon.txt", one "count*subject" line
    per entry, in the order given.

    lexicon   -- iterable of (subject, count) pairs
    file_name -- output path without the "_lexicon.txt" suffix

    The file is written as UTF-8 (subjects contain accented characters) and
    is closed even if writing fails.
    """
    with open(file_name + "_lexicon.txt", "w", encoding="utf-8") as file_lex:
        for key, value in lexicon:
            file_lex.write(str(value) + "*" + key + "\n")
    
    
"""    
Prints all data scraped from the laws into file_name.txt, 
separated by *, in the format:
Type_of_document*date*urn*locality*authority*title*description*identifier*subjects
"""
def print_scraped_info(laws, file_name):
    """
    Writes a header line followed by one line per law to "<file_name>.txt".

    laws      -- iterable of objects whose print_self() returns the
                 "*"-separated line (including the trailing newline)
    file_name -- output path without the ".txt" extension

    The file is written as UTF-8 (the header itself contains "título") and
    is closed even if writing fails.
    """
    header = ("tipo de documento*data*urn*localidade*autoridade*"
              "título*descricao*identifier*assuntos->\n")
    with open(file_name + ".txt", "w", encoding="utf-8") as out_file:
        out_file.write(header)
        for entry in laws:
            out_file.write(entry.print_self())

In [8]:
# XML attributes that will be extracted
attributes = ("tipoDocumento", "date", "urn", "localidade", "autoridade", "title", "description", "identifier", "subject")

# Records requested per page. The same value drives both the maximumRecords
# query parameter and the startRecord offset of each page, so they cannot
# drift apart.
PAGE_SIZE = 500

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 60

# federal laws (leis federais)
search_term = "%22federal+decreto.lei%22" #"%22lei+federal%22"
base_url = ("https://www.lexml.gov.br/busca/SRU?operation=searchRetrieve&query=urn+=" + search_term
            + "&maximumRecords=" + str(PAGE_SIZE) + "&startRecord=")


file_name = "decretos_lei"
# Number of pages to download
n = 1
lexicon_flag = True

if __name__ == "__main__":
    laws = []
    for page in range(n):
        # startRecord is 1-based
        url = base_url + str(page * PAGE_SIZE + 1)
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        response.raise_for_status()
        tree = etree.fromstring(response.content)

        # Each <srw_dc:dc> element is one record
        for record in tree.findall(".//srw_dc:dc", namespaces=ns):
            laws.append(get_values(record))

        # Being polite: pause between requests (not needed after the last one)
        if page < n - 1:
            time.sleep(2)

    if lexicon_flag:
        # Named so that it does not rebind the `lexicon` class.
        subject_lexicon = get_lexicon(laws)
        print_lexicon(subject_lexicon, file_name)

    print_scraped_info(laws, file_name)

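The two generated lexicon files can be combined on the command line using the command below. Note that this only removes identical lines; to sum the counts of a term that appears in both files, use merge_lexicon_files in the next cell.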

> sort file1.txt file2.txt | uniq > exit_file

In [None]:
"""
Receives a list of strings with previously generated lexicon file names. 
Generates a file named merged_lexicon_files.txt in the current folder, 
with the sum of how many times each term has appeared across the lexicon files.
"""
def merge_lexicon_files(list_of_files):
    """
    Sums the counts of each term across several lexicon files and writes the
    result to "merged_lexicon_files.txt" in the current working directory.

    list_of_files -- paths of files made of "count*term" lines

    Returns the merged lexicon as a list of (term, total) tuples sorted from
    the most to the least frequent (the same content that is written out).

    Files are read and written as UTF-8 and are always closed. Blank lines are
    skipped, and only the first "*" of a line is treated as the separator, so
    terms that contain "*" themselves are preserved.
    """
    totals = {}
    for path in list_of_files:
        with open(path, encoding="utf-8") as lexicon_file:
            for line in lexicon_file:
                if not line.strip():
                    continue
                value, key = line.split("*", 1)
                key = key.strip()
                totals[key] = totals.get(key, 0) + int(value)

    merged = sorted(totals.items(), key=lambda item: item[1], reverse=True)

    with open("merged_lexicon_files.txt", "w", encoding="utf-8") as out_file:
        for term, total in merged:
            out_file.write(str(total) + "*" + term + "\n")

    return merged
            
# Merge the lexicon files of two earlier runs; both files are opened by
# relative name, so they must already exist in the current working directory.
lex = merge_lexicon_files(["decretos-lei-complete_lexicon.txt", "leis-complete_lexicon.txt"])

In [3]:
# Print the version of the Python interpreter running this notebook.
from platform import python_version

print(python_version())

3.8.5
