# TPC3: Processador de Pessoas listadas nos Róis de Confessados

Construa um ou vários programas Python para processar o texto 'processos.txt' (procurar o ficheiro no Bb) com o intuito de calcular frequências de alguns elementos (a ideia é utilizar arrays associativos, dicionários em Python, para o efeito) conforme solicitado a seguir:

    a) Calcula a frequência de processos por ano (primeiro elemento da data);

    b) Calcula a frequência de nomes próprios (o primeiro em cada nome) e apelidos (o ultimo em cada nome) por séculos e apresenta os 5 mais usados;

    c) Calcula a frequência dos vários tipos de relação: irmão, sobrinho, etc.;

    d) Converta os 20 primeiros registos num novo ficheiro de output mas em formato Json.


In [141]:
import json

# From TPC1
def show_dist(data, name):
    # header
        table=f"""{name}:
*---------------------------*
|    Value    |  Frequency  |
*---------------------------*"""
        # values
        data.sort(key=lambda x: x[1], reverse=True)
        for value, freq in data:
            table += f"""
| {value.center(11," ")} |  {freq:.7f}  |"""
        # end
        table += f"""
*---------------------------*"""
        print(table)

# From TPC1
def show_dist_top5(data, name):
    # header
        table=f"""{name}:
*---------------------------*
|    Value    |  Frequency  |
*---------------------------*"""
        # values
        data.sort(key=lambda x: x[1], reverse=True)
        i=0
        for value, freq in data:
            table += f"""
| {value.center(11," ")} |  {freq:.7f}  |"""
            i+=1
            if i==5:
                break
        # end
        table += f"""
*---------------------------*"""
        print(table)


def parse_family(family):
    family=family.split('.')
    if len(family)>=4 and family[-3]==' Proc':
        return {'name':family[-4].split(',')[:-1],
                'relation':family[-4].split(',')[-1], 
                'process':family[-2]}
    return ''

def parse_data(data_file):
    with open(data_file, 'r') as f:
        data = f.read().splitlines()
    data = [x.split('::') for x in data if x]
    data = [{'id': x[0], 'date': {'year': x[1].split('-')[0],'month': x[1].split('-')[1],'day': x[1].split('-')[2]}, 'name': x[2], 'father': x[3], 'mother': x[4], 
             'family': [parse_family(y) for y in x[5].split('  ') if y]} for x in data]
    return data
    
dictionary = parse_data('processos.txt')

print(dictionary[3])


{'id': '576', 'date': {'year': '1896', 'month': '11', 'day': '28'}, 'name': 'Abel Augusto Oliveira', 'father': 'Francisco Jose Oliveira', 'mother': 'Antonia Rosa Rebelo', 'family': [{'name': ['Jose Antonio Oliveira'], 'relation': 'Irmao', 'process': '5020'}]}


### A) Calcula a frequência de processos por ano (primeiro elemento da data)

In [142]:
def year_distribution(dictionary):
    years = {}
    total = 0
    dictionary.sort(key=lambda x: x['date']['year'])
    for x in dictionary:
        if x['date']['year'] in years:
            years[x['date']['year']] += 1
        else:
            years[x['date']['year']] = 1
        total += 1
    
    years = {k: v/total*100 for k, v in years.items()}
    
    return years

by_year=year_distribution(dictionary)
show_dist([(k,v) for k,v in by_year.items()], 'Distribution by Year')

Distribution by Year:
*---------------------------*
|    Value    |  Frequency  |
*---------------------------*
|     1732    |  5.8526927  |
|     1730    |  3.1678986  |
|     1733    |  3.1362196  |
|     1777    |  3.0147835  |
|     1691    |  2.8616684  |
|     1778    |  2.8141499  |
|     1731    |  2.3759240  |
|     1734    |  2.3442450  |
|     1785    |  2.1040127  |
|     1787    |  1.9614572  |
|     1689    |  1.6578669  |
|     1807    |  1.5892291  |
|     1773    |  1.2724393  |
|     1722    |  1.2064414  |
|     1728    |  1.1615628  |
|     1788    |  1.0982049  |
|     1719    |  1.0216473  |
|     1761    |  1.0058078  |
|     1784    |  0.9530095  |
|     1704    |  0.9450898  |
|     1786    |  0.9292503  |
|     1816    |  0.9292503  |
|     1762    |  0.9028511  |
|     1714    |  0.8368532  |
|     1821    |  0.8236536  |
|     1808    |  0.8210137  |
|     1783    |  0.8157339  |
|     1754    |  0.7998944  |
|     1755    |  0.7998944  |
|     1822    |  0

### B) Calcula a frequência de nomes próprios (o primeiro em cada nome) e apelidos (o ultimo em cada nome) por séculos e apresenta os 5 mais usados

In [143]:
def fname_by_years(dictionary):
    fnames = {}
    for x in dictionary:
        sec=str(int(x['date']['year'][:2])+1)
        name=x['name'].split(' ')[0]
        if sec in fnames:
            if name in fnames[sec]:
                fnames[sec][name]+=1
            else:
                fnames[sec][name]=1
        else:
            fnames[sec]={}
            fnames[sec][name]=1
    
    for x in fnames:
        for y in fnames[x]:
            fnames[x][y]=fnames[x][y]/sum(fnames[x].values())*100

    return fnames

names_by_years=fname_by_years(dictionary)

for x in names_by_years:
    show_dist_top5([(k,v) for k,v in names_by_years[x].items()], f'Names in Sec.{x}')

Names in Sec.17:
*---------------------------*
|    Value    |  Frequency  |
*---------------------------*
|    Manuel   |  32.1907085  |
|     Joao    |  15.8104089  |
|  Francisco  |  14.4469299  |
|   Domingos  |  12.9170857  |
|    Miguel   |  12.0120221  |
*---------------------------*
Names in Sec.18:
*---------------------------*
|    Value    |  Frequency  |
*---------------------------*
|    Manuel   |  43.3518284  |
|     Jose    |  22.1248776  |
|     Joao    |  20.7699101  |
|     Luis    |  16.0850063  |
|   Jeronimo  |  14.0442179  |
*---------------------------*
Names in Sec.19:
*---------------------------*
|    Value    |  Frequency  |
*---------------------------*
|    Manuel   |  45.6201591  |
|     Jose    |  37.1737635  |
|     Joao    |  17.2382486  |
|   Antonio   |  17.0564217  |
|  Francisco  |  10.3751540  |
*---------------------------*
Names in Sec.20:
*---------------------------*
|    Value    |  Frequency  |
*---------------------------*
|    Manuel   |  

In [144]:
def lname_by_years(dictionary):
    lnames = {}
    for x in dictionary:
        sec=str(int(x['date']['year'][:2])+1)
        name=x['name'].split(' ')[-1]
        if sec in lnames:
            if name in lnames[sec]:
                lnames[sec][name]+=1
            else:
                lnames[sec][name]=1
        else:
            lnames[sec]={}
            lnames[sec][name]=1
    
    for x in lnames:
        for y in lnames[x]:
            lnames[x][y]=lnames[x][y]/sum(lnames[x].values())*100

    return lnames

lnames_by_years=lname_by_years(dictionary)

for x in names_by_years:
    show_dist_top5([(k,v) for k,v in lnames_by_years[x].items()], f'Last Names in Sec.{x}')

Last Names in Sec.17:
*---------------------------*
|    Value    |  Frequency  |
*---------------------------*
|   Pereira   |  6.4902146  |
|    Araujo   |  5.2503588  |
|   Ribeiro   |  5.1187402  |
|    Silva    |  4.6346163  |
|    Costa    |  4.1256429  |
*---------------------------*
Last Names in Sec.18:
*---------------------------*
|    Value    |  Frequency  |
*---------------------------*
|   Carvalho  |  6.3998981  |
|    Sousa    |  5.8051617  |
|   Pereira   |  4.9656835  |
|    Silva    |  4.9032771  |
|   Antonio   |  4.6435318  |
*---------------------------*
Last Names in Sec.19:
*---------------------------*
|    Value    |  Frequency  |
*---------------------------*
|    Junior   |  7.6091515  |
|    Costa    |  5.7736320  |
|   Oliveira  |  5.3225898  |
|    Silva    |  4.4760921  |
|   Pereira   |  4.1718265  |
*---------------------------*
Last Names in Sec.20:
*---------------------------*
|    Value    |  Frequency  |
*---------------------------*
|    Silva  

### C) Calcula a frequência dos vários tipos de relação: irmão, sobrinho, etc.;

In [145]:
relatives = {}

for x in dictionary:
    for y in x['family']:
        if y:
            relat=y['relation']
            if relat.endswith('s'):
                relat=relat[:-1]
            if relat in relatives:
                relatives[relat]+=len(y['name'])
            else:
                relatives[relat]=len(y['name'])

relatives = {k: v/sum(relatives.values())*100 for k, v in relatives.items()}

show_dist([(k,v) for k,v in relatives.items()], 'Distribution by Relatives')

Distribution by Relatives:
*---------------------------*
|    Value    |  Frequency  |
*---------------------------*
|    Irmao    |  55.1505472  |
| Tio Materno |  8.2953638  |
| Tio Paterno |  8.0792898  |
| Sobrinho Materno |  7.4263704  |
| Sobrinho Paterno |  7.2149937  |
|    Primo    |  2.8089624  |
| Irmao Paterno |  2.0527033  |
|     Pai     |  2.0339142  |
|    Filho    |  1.5735826  |
| Primo Materno |  0.9159660  |
| Primo Paterno |  0.7092865  |
| Sobrinho Neto Materno |  0.6482221  |
| Tio Avo Materno |  0.6482221  |
| Tio Avo Paterno |  0.4274508  |
| Sobrinho Neto Paterno |  0.4133590  |
| Sobrinhos Materno |  0.3945700  |
| Sobrinhos Paterno |  0.2442576  |
| Irmao Materno |  0.2066795  |
| Neto Materno |  0.1831932  |
| Avo Materno |  0.1691014  |
| Irmaos Paterno |  0.0845507  |
| Tios Materno |  0.0845507  |
| Neto Paterno |  0.0375781  |
| Avo Paterno |  0.0375781  |
| Tios Paterno |  0.0281836  |
| Sobrinhos Netos Materno |  0.0234863  |
| Irmaos Materno |  0.018

### D) Converta os 20 primeiros registos num novo ficheiro de output mas em formato Json.

In [150]:
dictionary.sort(key=lambda x: x['name'])

f20={}
f20['Registos']=[]
for i in range(20):
    f20['Registos'].append(dictionary[i])

with open('output.json', 'w') as f:
    f.write(json.dumps(f20, indent=4))

print('All done!')

All done!
