In [1]:
# Load IPython's autoreload extension and set it to mode 2, so that all modules
# are reloaded every time before executing the Python code typed. This lets
# edits made to the project's own modules be picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import re
import json
import glob
from urllib.parse import urljoin
import requests
from pathlib import Path
from tqdm import tqdm
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dotenv import load_dotenv
load_dotenv()

import cafeconmiel.data.parse_html as parse_html
import cafeconmiel.data.metadata as metadata
import cafeconmiel.data.text_process as text_process
import cafeconmiel.data.parse_docs as parse_docs
import cafeconmiel.utils.paths as paths_utils

In [91]:
# Resolve the project's directories once, and keep short aliases for the ones
# used throughout the notebook.
paths = paths_utils.ProjectPaths()
interim_data, raw_data = paths.interim_data, paths.raw_data
# Locations of two externally provided sets of documents.
ext_data_dir = paths.ext_data
letters_path = ext_data_dir / 'Cartas-txt'
legal_docs_path = ext_data_dir / 'documentos arreglados CorpMA'

In [4]:
# Names of the metadata fields, passed to `parse_html.extract_metadata`.
meta_fields = [
    'meta_id',
    'format',
    'corpus',
    'unknown_id',
    'date (place)',
    'doc_type',
    'abstract',
    'author',
]
# Spanish month names in calendar order, and their two-digit number
# ('enero' -> '01', ..., 'diciembre' -> '12').
es_months = (
    'enero febrero marzo abril mayo junio '
    'julio agosto septiembre octubre noviembre diciembre'
).split()
es_month_name_to_number = {
    month_name: f'{month_number:02d}'
    for month_number, month_name in enumerate(es_months, start=1)
}
# add user agent to trick website into thinking we're accessing from a browser
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0'}

In [5]:
# Per-corpus settings, keyed by corpus name (the cells below read the
# 'base_url', 'local_doc_list' and 'remote_doc_list' entries).
corpus_metadata = json.loads((paths.ext_data / 'corpora.json').read_text())
# Template of the URL serving a single document of a corpus.
doc_url_patt = "{corpus_base_url}/documento.php"

In [6]:
# Maps the field names found in the source documents / inventories to the
# normalised metadata keys used in the saved JSON files.
field_norm = {
    'IDENTIFICADOR': 'meta_id',
    'REGESTO': 'abstract',
    'PAÍS': 'country',
    'PROVINCIA': 'region',
    'POBLACIÓN': 'locality',
    'FECHA': 'year',
    'SIGLO': 'century',
    'TIPOLOGÍA': 'doc_type',
    'TIP. DOCUMENTAL': 'doc_type',
    'TIP.DOCUMENTAL': 'doc_type',
    'TIP. DIPLOMÁTICA': 'diplo_type', # TODO: ??
    'ARCHIVO (SIGN.)': 'archive',
    'PALABRAS': 'nr_words',
    'MUJER': 'woman',
    'LETRA': 'writing',
    'ÁMBITO': 'context',
    'CLAVE': 'keywords',
    'COPISTA (FÓRM.)': 'copyist',
    # NOTE(review): 'DOCUMENTO' used to be listed twice in this literal, first
    # mapped to 'abstract' and then to 'meta_id'. Python keeps the last value
    # of a repeated key, so 'meta_id' was always the effective mapping; the
    # dead 'abstract' entry was removed. If some source really uses
    # 'DOCUMENTO' for an abstract, it needs its own mapping -- confirm.
    'DOCUMENTO': 'meta_id',
    'ARCHIVO': 'archive',
    'AÑO': 'year',
    'LUGAR': 'locality',
    'TRANSCRIPCIÓN PALEOGRÁFICA': 'raw_text',
}

In [77]:
# Choose the corpus to work on, then derive from it: its settings, the
# directory where its processed documents are saved (created if missing),
# and the URL from which its documents are requested.
corpus_name = 'corpusmallorca'
# corpus_name = 'corpuscodea'
corpus_dict = corpus_metadata[corpus_name]
corpus_dir = interim_data / corpus_name
corpus_dir.mkdir(parents=True, exist_ok=True)
docs_url = doc_url_patt.format(
    corpus_base_url=corpus_dict['base_url'],
)

# Extract data from corpus

## Online 

In [13]:
# Get the HTML listing the corpus' documents: from a local copy when the
# corpus settings name one, from the corpus' website otherwise.
if 'local_doc_list' in corpus_dict:
    # `data_dir` was never defined in this notebook, so this branch raised a
    # NameError on a fresh kernel. `paths.ext_data` is used everywhere else for
    # external data and is presumably `<data dir>/external` -- confirm.
    doc_list_path = paths.ext_data / corpus_name / corpus_dict['local_doc_list']
    with open(doc_list_path, 'r') as f:
        doc_list_html = f.read()
else:
    doc_list_url = urljoin(corpus_dict['base_url'], corpus_dict['remote_doc_list'])
    doc_list_response = requests.get(doc_list_url, headers=headers)
    # Fail right away on an HTTP error instead of parsing an error page as if
    # it were the document list.
    doc_list_response.raise_for_status()
    doc_list_html = doc_list_response.content
doc_list_soup = BeautifulSoup(doc_list_html, 'html.parser')

In [14]:
# Names (without extension) of everything already present in the corpus'
# output directory; these documents are skipped when scraping.
docs_to_exclude = set(saved_doc.stem for saved_doc in corpus_dir.iterdir())

In [82]:
def _fetch_doc_soup(doc_id):
    """Request the page of document `doc_id` from `docs_url` and return it parsed.

    The request is made with the `paleografica` and `critica` parameters set
    to 'on', and the response is parsed with BeautifulSoup's 'html.parser'.
    """
    response = requests.get(
        docs_url,
        headers=headers,
        params={'documento': doc_id, 'paleografica': 'on', 'critica': 'on'},
    )
    return BeautifulSoup(response.content, 'html.parser')


def _save_doc(doc_id, data, soup):
    """Add the texts found in `soup` to `data`, and dump it to `<doc_id>.json`.

    `raw_text` is the concatenated HTML of all elements of class 'textopaleo',
    `text` is the output of `text_process.clean` on the text extracted from
    those same elements. The file is written in `corpus_dir`.
    """
    raw_text_soups = soup.find_all(class_='textopaleo')
    data['raw_text'] = ''.join([str(s) for s in raw_text_soups])
    data['text'] = text_process.clean(parse_html.extract_text(raw_text_soups))
    with open(corpus_dir / f'{doc_id}.json', 'w') as f:
        json.dump(data, f, indent=4)


if corpus_name == 'corpusmallorca':
    # The document IDs are only available in the links of the list, as the
    # argument of a javascript call. Raw string: '\(' is not a valid escape
    # sequence in a regular string literal.
    open_doc_patt = re.compile(r"javascript:abrirDocumento\('(.*)'\)")
    doc_ids = []
    for link in doc_list_soup.find_all('a'):
        # Skip anchors without href, or whose href doesn't open a document,
        # instead of crashing on them.
        id_match = open_doc_patt.match(link.get('href') or '')
        if id_match is None:
            continue
        doc_id = id_match.group(1)
        if doc_id not in docs_to_exclude:
            doc_ids.append(doc_id)

    for doc_id in tqdm(doc_ids):
        soup = _fetch_doc_soup(doc_id)
        # Here the metadata are read from the document's own page.
        data = parse_html.extract_metadata(soup, meta_fields, es_month_name_to_number)
        _save_doc(doc_id, data, soup)

else:
    # Here the metadata are read from the inventory table: the last row of its
    # header gives the field names, each body row describes one document.
    # field_heads = soup.find(id='Tabla_Inventario').thead.find_all('tr')[-1].find_all('th')
    field_heads = doc_list_soup.find(id='Tabla_Inventario_wrapper').find('thead').find_all('tr')[-1] # codea
    ordered_field_names = [field_norm[h.div.text] for h in field_heads]
    pbar = tqdm(doc_list_soup.find(id='Tabla_Inventario').tbody.find_all('tr'))
    for doc_meta in pbar:
        field_values = [f.text.strip() for f in doc_meta.find_all('td')]
        data = dict(zip(ordered_field_names, field_values))
        # roman.fromRoman(data['century'])
        doc_id = data['meta_id']
        pbar.set_description(doc_id)
        # NOTE(review): this used to test `doc_id in docs_to_retrieve`, a name
        # defined nowhere in the notebook. Skipping the documents already saved,
        # as done for corpusmallorca above, is assumed to be the intent -- confirm.
        if doc_id not in docs_to_exclude:
            soup = _fetch_doc_soup(doc_id)
            _save_doc(doc_id, data, soup)

100%|████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.01it/s]


In [13]:
# Fetch a single document (SC06_132) and run the extraction steps on it
# one by one, to inspect their results in the cells below.
response = requests.get(
    docs_url,
    headers=headers,
    params={'documento': 'SC06_132', 'paleografica': 'on', 'critica': 'on'},
)
soup = BeautifulSoup(response.content, 'html.parser')
raw_text_soups = soup.find_all(class_='textopaleo')
data = parse_html.extract_metadata(soup, meta_fields, es_month_name_to_number)
# The raw HTML is deliberately not stored in `data` here:
# data['raw_text'] = ''.join([str(s) for s in raw_text_soups])
text = parse_html.extract_text(raw_text_soups)

In [14]:
# Show the metadata dict built in the previous cell.
data

{'meta_id': 'SC06_132',
 'format': 'HIPERTEXT',
 'corpus': 'Corpus Mallorca ',
 'unknown_id': 'legajo 6, nº 132, ff. 1r-2r',
 'place': 'Rávena, Emilia-Romaña',
 'date': '1778-06-20',
 'doc_type': 'Cartas privadas',
 'abstract': 'Juan Josep Sales',
 'author': 'Juan Josep Sales (-)',
 'revisors': ['Andrés Enrique-Arias', 'Laura Tudurí Cladera']}

Note: `CODCAR-0184` appears twice in CHARTA's inventory, for some unknown reason.

## Offline

### Santacilia

In [43]:
# Parse every document of the Santacilia letters, clean its text and save
# the result as JSON in `corpus_dir`.
corpus_src_dir = raw_data / corpus_name / 'Epist' / 'Santacilia'
# NOTE(review): this compares a Path with a str, which is always False, so the
# texts are always cleaned with space_after_newline=False. Left untouched
# because the intended condition is unclear -- confirm.
space_after_newline = corpus_src_dir.parent == corpus_name
for path in tqdm(corpus_src_dir.iterdir()):
    data = parse_docs.parse_epist(path, field_norm)
    raw_text = data.get('raw_text', '')
    data['text'] = text_process.clean(raw_text, space_after_newline=space_after_newline)
    doc_id = data['meta_id']
    # saving with same file name because this has to be unique, since files are stored
    # in same directory. Not necessarily true of given meta_id, which are inconsistent.
    # (The output used to be named after `meta_id`, contradicting the above: a
    # wrong ID in a header could then overwrite another document's JSON.)
    with open(corpus_dir / f'{path.stem}.json', 'w') as f:
        json.dump(data, f, indent=4)
    # have to register this because there is a sizable number of mismatches
    if doc_id != data['file_id']:
        print(f"The ID provided in the header of {path.stem} doesn't match: {doc_id}")

20it [00:00, 90.49it/s]

The ID provided in the header of SC31_194 doesn't match: SC30_194
The ID provided in the header of SC13_094 doesn't match: SC13_94
The ID provided in the header of SC3_018 doesn't match: SC3_18


39it [00:00, 88.53it/s]

The ID provided in the header of SC17_050 doesn't match: SC17_50
The ID provided in the header of SC27_77 doesn't match: SC27_077


55it [00:00, 88.46it/s]


### Rest

In [72]:
# Parse the .docx documents of the corpus which have no JSON output yet,
# clean their text and save the result as JSON in `corpus_dir`.
corpus_name = 'corpusmallorca'
corpus_dir = interim_data / corpus_name
corpus_src_dir = raw_data / corpus_name
# NOTE(review): this compares a Path with a str, which is always False, so the
# texts are always cleaned with space_after_newline=False. Left untouched
# because the intended condition is unclear -- confirm.
space_after_newline = corpus_src_dir.parent == corpus_name
already_in = {p.stem for p in corpus_dir.glob('*.json')}
files_to_process = [
    path for path in corpus_src_dir.glob('*.docx')
    if path.stem not in already_in
    # and path.suffix == '.docx'
]

for path in tqdm(files_to_process):
    data = parse_docs.parse(path, field_norm)
    raw_text = data.get('raw_text', '')
    data['text'] = text_process.clean(
        raw_text, space_after_newline=space_after_newline
    )
    doc_id = data['meta_id']
    # saving with same file name because this has to be unique, since files are stored
    # in same directory. Not necessarily true of given meta_id, which are inconsistent.
    # (The output used to be named after `meta_id`, contradicting the above and
    # the `already_in` check, which is keyed on the source file names.)
    with open(corpus_dir / f'{path.stem}.json', 'w') as f:
        json.dump(data, f, indent=4)
    # have to register this because there is a sizable number of mismatches
    if doc_id != data['file_id']:
        print(f"The ID provided in the header of {path.stem} doesn't match: {doc_id}")

100%|████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.70it/s]


In [309]:
# Load back the JSON saved for each processed source file, and gather them in
# a data frame to compare their metadata.
records = []
for fname in files_to_process:
    # `files_to_process` holds Path objects, which cannot be sliced like the
    # former `fname[:-5]`; `Path(...).stem` drops the extension for both Path
    # objects and plain file names.
    path = corpus_dir / f'{Path(fname).stem}.json'
    with open(path) as f:
        records.append(json.load(f))
auto_df = pd.DataFrame.from_records(records)

In [None]:
# Records whose `file_id` and `meta_id` fields differ.
id_mismatch = auto_df['file_id'] != auto_df['meta_id']
auto_df.loc[id_mismatch]

The differences are mostly slight and rather insignificant; in some cases, the header has the wrong format or contains no metadata at all.

### PS

TODO: always take the original form? Or take the de-abbreviated version, at least for proper nouns? See e.g. PSCR7092.

In [8]:
# Sample PostScriptum document used in the exploratory cells below. Built from
# the project's raw data directory rather than hardcoded as an absolute path
# into one user's home, so that the notebook runs on other machines too.
path = raw_data / 'postscriptum' / 'PSCR7092.xml'

In [44]:
# Parse every file of the PostScriptum corpus and save the result of each as
# a JSON file named after the `meta_id` returned by the parser.
corpus_name = 'postscriptum'
corpus_src_dir = raw_data / corpus_name
corpus_dir = interim_data / corpus_name
corpus_dir.mkdir(parents=True, exist_ok=True)
pbar = tqdm(corpus_src_dir.iterdir())
for path in pbar:
    # show which file is being processed next to the progress bar
    pbar.set_description(path.stem)
    data = parse_docs.parse_ps(path, expand=True, capitalize=False)
    doc_id = data['meta_id']
    (corpus_dir / f'{doc_id}.json').write_text(json.dumps(data, indent=4))

PS4102_TEIP5: : 663it [00:17, 36.02it/s]  

dict_keys(['orig', 'reg'])


PSCR6702_TEIP5: : 2446it [01:02, 38.87it/s]


In [10]:
# Parse the XML file at `path` into an element tree.
# The `etree` submodule has to be imported explicitly: a bare `import lxml`
# does not load it, so `lxml.etree` was only reachable when some other module
# had happened to import it before.
import lxml.etree
tree = lxml.etree.parse(path)

# Prefix -> URI map of the TEI namespace, to pass to the queries on the tree.
ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

# with open(path) as f:
#     # tree = xml.etree.ElementTree.XML(f.read())

In [43]:
# Collect the parts of the document's body which hold text (opener, plain
# paragraphs, closer and paragraphs of the postscript), process each of them
# and join the results, separated by a blank line.
# NOTE(review): `process_paragraph` is not defined or imported anywhere in this
# notebook; it presumably lives in a project module -- confirm and import it.
possible_parts = ['opener', 'p', 'closer', 'postscript/tei:p']
parts_query = ' | '.join(f'./tei:{part}' for part in possible_parts)
body = tree.find('tei:text/tei:body', ns)
parts = body.xpath(parts_query, namespaces=ns)
paragraphs = [process_paragraph(p_elem, ns) for p_elem in parts]
text = '\n\n'.join(paragraphs)

# Edit pre-processed data

In [54]:
# Scratch check of a pattern joining a word hyphenated across a line break:
# at most one whitespace character is allowed on each side of the newline, so
# this sample, with two spaces after the hyphen, is returned unchanged.
hyphenated_word_patt = r'(\w+)\-\s{,1}\n\s{,1}(\w+)'
joined_word_repl = r'\n\g<1>\g<2>'
re.sub(hyphenated_word_patt, joined_word_repl, 'aa-  \ndasd')

'aa-  \ndasd'

COREECOM: clean with `space_after_newline=False`.

In [89]:
# Choose the corpus whose saved documents are edited below, then derive from
# it: its settings, its output directory (created if missing), and the URL
# from which its documents are requested.
corpus_name = 'corpuscharta'
# corpus_name = 'corpuscodea'
corpus_dict = corpus_metadata[corpus_name]
corpus_dir = interim_data / corpus_name
corpus_dir.mkdir(parents=True, exist_ok=True)
docs_url = doc_url_patt.format(
    corpus_base_url=corpus_dict['base_url'],
)

In [90]:
# Run the text cleaning again on every document saved for this corpus, and
# overwrite each JSON file in place. Documents without a 'text' entry are
# rewritten unchanged.
for path in tqdm(corpus_dir.glob('*.json')):
    data = json.loads(path.read_text())
    if 'text' in data:
        data['text'] = text_process.clean(data['text'], space_after_newline=False)
    # Earlier variants of this edit, kept for reference:
    # t = parse_html.extract_text(BeautifulSoup(data['raw_text'], 'html.parser'))
    # space_after_newline = not path.stem.startswith('COREECOM')
    # data['text'] = text_process.clean(t, space_after_newline=space_after_newline)
    # # data['year'] = data.pop('date', data.get('year'))
    # # data['doc_type'] = data.pop('doct_type', data.get('doc_type'))
    path.write_text(json.dumps(data, indent=4))

2075it [00:07, 295.80it/s]
