In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell is executed and edits to the project's .py
# files take effect without restarting the kernel.
# NOTE: re-running this cell only prints the "already loaded" notice recorded below.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [296]:
import os
import re
import json
from urllib.parse import urljoin
from pathlib import Path
import unicodedata
import itertools
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy

import cafeconmiel.data.metadata as metadata
import cafeconmiel.data.text_process as text_process
import cafeconmiel.data.token_counts as token_counts
import cafeconmiel.utils.paths as paths_utils

from dotenv import load_dotenv
# Load variables from a .env file into the process environment (python-dotenv).
# Presumably the project's path helpers read their settings from it -- TODO confirm.
load_dotenv()

True

In [3]:
# Resolve the project's data directories through the project path helper.
paths = paths_utils.ProjectPaths()
interim_data = paths.interim_data
raw_data = paths.raw_data
# Read the corpora description file. The encoding is passed explicitly because
# JSON is UTF-8 by specification, while open() would otherwise fall back to the
# platform's locale encoding (not UTF-8 on every system).
with open(paths.ext_data / 'corpora.json', encoding='utf-8') as f:
    corpus_metadata = json.load(f)

In [4]:
# Load the spaCy pipeline "es_dep_news_trf"; the model package must already be
# installed in the environment.
nlp = spacy.load("es_dep_news_trf")

In [14]:
# Run the pipeline on a short test phrase; its per-token analysis is printed in the next cell.
doc = nlp('yo prongo verlo')

In [15]:
# One line per token: surface form, lemma, part-of-speech tag, dependency label.
for tok in doc:
    print(f"{tok.text} {tok.lemma_} {tok.pos_} {tok.dep_}")

yo yo PRON nsubj
prongo pronguir VERB ROOT
verlo ver él VERB xcomp


ID
Documento
Localidad
Año
Forma
Lema
Fenómeno
Categoría morfológica
Cognado
Posición en la palabra
Posición en la sílaba
Contexto fonológico posterior
Presencia en la misma palabra de otra sibilante
Acento en la sílaba en la que se encuentra
Tipo de morfema en el que se localiza
Frecuencia


In [17]:
# Name of the corpus to analyse; it is also the name of its directory under
# the interim data folder.
corpus_name = 'corpusmallorca'
# corpus_name = 'corpuscodea'  # alternative corpus: uncomment to analyse it instead
corpus_dir = interim_data / corpus_name

In [326]:
# Load every JSON document of the selected corpus, one record per file.
# - sorted(): Path.glob yields files in an unspecified order; a fixed order keeps
#   the result reproducible, since groupby().first() below depends on row order
#   whenever a meta_id appears more than once.
# - encoding='utf-8': JSON is UTF-8 by specification, while open() would
#   otherwise use the platform's locale encoding.
records = []
for path in sorted(corpus_dir.glob('*.json')):
    with open(path, encoding='utf-8') as f:
        records.append(json.load(f))
doc_df = pd.DataFrame.from_records(records).set_index('meta_id')
# Normalize the metadata with the project helper, then collapse to one row per
# meta_id (first() keeps the first non-null value of each column).
doc_df = metadata.normalize(doc_df).groupby('meta_id').first()
doc_df.head()

Unnamed: 0_level_0,file_id,archive,year,locality,region,country,abstract,doc_type,raw_text,text,format,corpus,unknown_id,place,date,author,revisors
meta_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
AA001_01,AA001_01,Arxiu del Regne de Mallorca,1702.0,Palma,Islas Baleares,España,El procurador fiscal de la Real Audiencia de M...,informes y relaciones,{h 2r} [lat.: Sup<plicati>o ob<lata>. p<er> Gu...,\nExcelentisimo Señor\n En cumplimiento del o...,,,,,,,
AA001_02,AA001_02,Arxiu del Regne de Mallorca,1702.0,Palma,Islas Baleares,España,Traslado de un documento de 1676 de la reina r...,informes y relaciones,{h 3r} A los Ill<ust>re Marques de la Casta Pa...,A los Illustre Marques de la Casta Pariente m...,,,,,,,
AA001_03,AA001_03,Arxiu del Regne de Mallorca,1704.0,Palma,Islas Baleares,España,El oidor de la Real Audiencia de Mallorca Migu...,informes y relaciones,"{h 77r} [lat: Cusit Pactoris sive mitja, et he...","\nJHS\nCon las mesmas protestaciones, que en ...",,,,,,,
AA001_04,AA001_04,Arxiu del Regne de Mallorca,1704.0,Palma,Islas Baleares,España,El produrador fiscal de la Real Audiencia de M...,informes y relaciones,{h 88r} [lat. V Scedula Oblata per Discretum G...,\nIhs. Acceptando el Procurador Fiscal de la ...,,,,,,,
AA002_01,AA002_01,Arxiu del Regne de Mallorca,1720.0,Palma,Islas Baleares,España,Auto del notario Jaime Martí en que da parte d...,actas y declaraciones,{h 0r} Procedimientos echos por la Júrisdiçion...,Procedimientos echos por la Júrisdiçion Real ...,,,,,,,


In [327]:
# Keep only the documents dated 1700 <= year < 1800.
years_mask = (doc_df['year'] >= 1700) & (doc_df['year'] < 1800)
docs_in_period = doc_df.loc[years_mask]

# Per-document token counts, aggregated to corpus level and then flagged with a
# 'word_mask' column by the project helper (parameter semantics are defined in
# cafeconmiel.data.token_counts).
words_count_by_doc = token_counts.count_by_doc(docs_in_period)
global_counts = token_counts.word_mask(
    token_counts.doc_counts_to_global(words_count_by_doc),
    min_df=0, max_df=1.0, upper_th=0.4,
)

# Number of word forms retained by the mask.
selected_words = global_counts['word_mask']
print(selected_words.sum())

# Restrict the per-document counts with the same mask.
normed_words_count_by_doc = token_counts.filter_doc_counts(
    words_count_by_doc, selected_words
)
normed_words_count_by_doc.head()

15231


Unnamed: 0_level_0,Unnamed: 1_level_0,count,word_mask
meta_id,word_lower,Unnamed: 2_level_1,Unnamed: 3_level_1
AA001_01,a,1,True
AA001_01,acuerdo,1,True
AA001_01,alegara,1,True
AA001_01,alijs,1,True
AA001_01,audiat,1,True


In [85]:
# Build the reference word set from the external dictionary file.
# Every entry is stored twice: as written, and with its combining marks removed
# (NFD decomposition minus category 'Mn', i.e. without accents, diaeresis or the
# tilde of "ñ"), presumably so that unaccented historical spellings can still be
# matched -- TODO confirm against text_process.seseo_corr.
# `unicodedata` is already imported in the notebook's import cell.
dictionary = set()
# Explicit encoding: the file contains accented words and open() would otherwise
# use the platform's locale encoding. Assumes the file is UTF-8 -- TODO confirm.
with open(paths.ext_data / 'spanish.dic', encoding='utf-8') as f:
    for line in f:
        word = line.strip()
        if not word:
            # Blank lines would otherwise add the empty string as a "word".
            continue
        dictionary.add(word)
        dictionary.add(
            ''.join(
                char for char in unicodedata.normalize('NFD', word)
                if unicodedata.category(char) != 'Mn'
            )
        )

Note: the corpus also contains many "ss" spellings. Should they be taken into account as well?

In [384]:
# Apply the project's seseo correction to every word kept by the mask.
# Each call returns a pair (corrected form, seseo position info); the pairs are
# split into two parallel lists aligned with the masked word index.
masked_words = global_counts.loc[global_counts['word_mask']].index
seseo_results = [text_process.seseo_corr(word, dictionary) for word in masked_words]
list_corrected_forms = [corrected for corrected, _positions in seseo_results]
list_pos = [positions for _corrected, positions in seseo_results]

In [397]:
# Table of correction results, indexed by the masked word forms.
masked_index = global_counts.loc[global_counts['word_mask']].index
seseos = pd.DataFrame(
    {'correct': list_corrected_forms, 'seseo_pos': list_pos},
    index=masked_index,
)

# Keep only the words for which a corrected form was found and attach them to
# the per-document counts (inner join: rows without a correction are dropped).
corrected_only = seseos.loc[seseos['correct'].notnull()]
seseos_by_doc = normed_words_count_by_doc.join(corrected_only, how='inner')

# Move the first column to the end of the frame.
first_col, *remaining_cols = seseos_by_doc.columns
seseos_by_doc = seseos_by_doc[remaining_cols + [first_col]]

In [398]:
# Enrich the per-document table with word-level and document-level features.
# NOTE(review): seseos_by_doc is rebound in place, so this cell is meant to be
# run once after the cell that creates it.
words_index = seseos_by_doc.index.levels[1]
# Length of each word form, keyed by the word level of the index.
word_length = pd.Series(words_index.str.len(), index=words_index, name='word_length')
# Corpus-wide count of each word form.
corpus_count = global_counts['count'].rename('corpus_count')
# Document metadata to attach to every row.
doc_info = doc_df[['locality', 'year', 'doc_type']]

seseos_by_doc = (
    seseos_by_doc
    .join(word_length)
    .join(corpus_count)
    .rename(columns={'count': 'doc_count'})
    .join(doc_info)
    .drop(columns='word_mask')
    .astype({'year': int})
)

In [399]:
# Show the rows whose 'seseo_pos' value has a length above 2 (per .str.len());
# in the recorded output these are the words with two positions, e.g. "2, 5".
seseos_by_doc.loc[seseos_by_doc['seseo_pos'].str.len() > 2]

Unnamed: 0_level_0,Unnamed: 1_level_0,correct,seseo_pos,doc_count,word_length,corpus_count,locality,year,doc_type
meta_id,word_lower,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AA001_04,lisensia,licencia,"2, 5",1,8,1,Palma,1704,informes y relaciones
AA002_5,hiziesse,hiciese,"2, 5",1,8,6,Sineu,1720,actas y declaraciones
AA004,persibiesse,percibiese,"3, 8",1,11,1,Palma,1760,actas y declaraciones
AA004,prosessos,procesos,"3, 5",1,9,1,Palma,1760,actas y declaraciones
AA005_11,hisiesse,hiciese,"2, 5",1,8,1,Sa Pobla,1769,actas y declaraciones
AA006_10,compuciesse,compusiese,"5, 8",1,11,1,Felanitx,1769,actas y declaraciones
AA006_10,hiziesse,hiciese,"2, 5",1,8,6,Felanitx,1769,actas y declaraciones
AA006_10,quiciesse,quisiese,"3, 6",1,9,1,Felanitx,1769,actas y declaraciones
AA007_05,hiziesse,hiciese,"2, 5",1,8,6,Montuiri,1771,actas y declaraciones
AA012_01,sircunvesinos,circunvecinos,"0, 8",1,13,2,Esporlas,1778,informes y relaciones


In [400]:
# Export the per-document table to a CSV file in the current working directory.
# NOTE(review): the file name is hard-coded for the Mallorca corpus and does not
# follow `corpus_name`.
seseos_by_doc.to_csv('seseo_mallorca.csv')

In [383]:
# Features of interest: position in word, word length, corpus frequency.
# Scratch check of the sibilant pattern: one capture group per alternative
# ("ss", "s", "z", or a "c" that is followed by "i"/"e"); only the group of
# the alternative that matched is set.
sibilant_pattern = r'(ss)|(s)|(z)|(c)(?:i|e)'
m = re.search(sibilant_pattern, 'acie')
m.groups()

(None, None, None, 'c')