In [1]:
!git clone https://github.com/wjbmattingly/text-analysis-for-ancient-and-medieval-languages

Cloning into 'text-analysis-for-ancient-and-medieval-languages'...
remote: Enumerating objects: 208, done.[K
remote: Counting objects: 100% (208/208), done.[K
remote: Compressing objects: 100% (157/157), done.[K
remote: Total 208 (delta 88), reused 154 (delta 37), pack-reused 0[K
Receiving objects: 100% (208/208), 14.94 MiB | 18.59 MiB/s, done.
Resolving deltas: 100% (88/88), done.


In [2]:
!pip install spacy==3.0.6

Collecting spacy==3.0.6
  Downloading spacy-3.0.6-cp37-cp37m-manylinux2014_x86_64.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 57 kB/s 
[?25hCollecting typer<0.4.0,>=0.3.0
  Downloading typer-0.3.2-py3-none-any.whl (21 kB)
Collecting thinc<8.1.0,>=8.0.3
  Downloading thinc-8.0.13-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (628 kB)
[K     |████████████████████████████████| 628 kB 46.5 MB/s 
Collecting catalogue<2.1.0,>=2.0.3
  Downloading catalogue-2.0.6-py3-none-any.whl (17 kB)
Collecting spacy-legacy<3.1.0,>=3.0.4
  Downloading spacy_legacy-3.0.8-py2.py3-none-any.whl (14 kB)
Collecting pydantic<1.8.0,>=1.7.1
  Downloading pydantic-1.7.4-cp37-cp37m-manylinux2014_x86_64.whl (9.1 MB)
[K     |████████████████████████████████| 9.1 MB 15.7 MB/s 
[?25hCollecting srsly<3.0.0,>=2.4.1
  Downloading srsly-2.4.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (451 kB)
[K     |████████████████████████████████| 451 kB 62.5 MB/s 
Collecting pathy>=0.

In [3]:
!python -m spacy info

[1m

spaCy version    3.0.6                         
Location         /usr/local/lib/python3.7/dist-packages/spacy
Platform         Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic
Python version   3.7.12                        
Pipelines                                      



In [4]:
!spacy download en_core_web_sm

Collecting en-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)
[K     |████████████████████████████████| 13.7 MB 2.6 MB/s 
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 2.2.5
    Uninstalling en-core-web-sm-2.2.5:
      Successfully uninstalled en-core-web-sm-2.2.5
Successfully installed en-core-web-sm-3.0.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [5]:
import spacy
from spacy.pipeline import EntityRuler
import json
import glob

In [6]:
def load_data(file):
    """Read a UTF-8 encoded JSON file and return the decoded Python object.

    Args:
        file: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's contents.
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
        return parsed

def write_data(file, data):
    """Serialize ``data`` as JSON (4-space indent) into a UTF-8 text file.

    Args:
        file: Destination path; an existing file is overwritten.
        data: Any JSON-serializable object.
    """
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(data, fp=handle, indent=4)

In [7]:
def generate_ruler(patterns, name, models_dir="/content/text-analysis-for-ancient-and-medieval-languages/models"):
    """Build a blank English pipeline holding only an entity ruler and save it.

    Args:
        patterns: Pattern dicts handed to the ruler's ``add_patterns``.
        name: Prefix of the saved pipeline directory (``{name}_ent_ruler``).
        models_dir: Directory under which the pipeline directory is created.
            Defaults to the location this notebook originally hard-coded.
    """
    import os  # local import: the notebook's import cell does not bring in os

    model_path = os.path.join(models_dir, f"{name}_ent_ruler")
    patterns_path = os.path.join(model_path, "entity_ruler", "patterns.jsonl")

    nlp = spacy.blank("en")
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)

    # Create the nested folder up front so that saving patterns.jsonl does not
    # depend on the directory already being present for this `name`.
    os.makedirs(os.path.dirname(patterns_path), exist_ok=True)
    ruler.to_disk(patterns_path)
    nlp.to_disk(model_path)

In [8]:
def create_training_data(file, type):
    """Wrap every entry of a JSON list in an entity-ruler pattern dict.

    Args:
        file: Path of the JSON file, read through ``load_data``.
        type: Label assigned to every generated pattern. The name shadows the
            builtin but is part of the call signature.

    Returns:
        A list of ``{"label": type, "pattern": entry}`` dicts in file order.
    """
    return [{"label": type, "pattern": entry} for entry in load_data(file)]

In [9]:
def test_ent_ruler(ruler, corpus):
    nlp = spacy.load(ruler)
    with open (corpus, "r", encoding="utf-8") as f:
        corpus = f.read()
    with open ("/content/text-analysis-for-ancient-and-medieval-languages/temp/results.txt", "w", encoding="utf-8") as f:
        doc = nlp(corpus)
        for ent in doc.ents:
            f.write(f"{ent.text}, {ent.label_}\n")

In [10]:
# Build entity-ruler patterns from the three JSON lists under latin_data,
# giving each source file its own label (PERSON, GROUP, LOCATION).
person_patterns = create_training_data("/content/text-analysis-for-ancient-and-medieval-languages/latin_data/all_names_declined.json", "PERSON")
groups_patterns = create_training_data("/content/text-analysis-for-ancient-and-medieval-languages/latin_data/groups_declined.json", "GROUP")
places_patterns = create_training_data("/content/text-analysis-for-ancient-and-medieval-languages/latin_data/places_declined.json", "LOCATION")

In [11]:
# Concatenate the three pattern lists: persons first, then groups, then places.
all_patterns = person_patterns+groups_patterns+places_patterns

In [12]:
# Save a pipeline containing all patterns as models/latin_loc_per_group_ent_ruler.
generate_ruler(all_patterns, "latin_loc_per_group")

In [13]:
# Smoke-test the saved ruler on corpus.txt; matches are written to temp/results.txt.
test_ent_ruler("/content/text-analysis-for-ancient-and-medieval-languages/models/latin_loc_per_group_ent_ruler", "/content/text-analysis-for-ancient-and-medieval-languages/latin_data/corpus.txt")

In [14]:
def create_training_set(corpus, ent_ruler_model, output_file, prodigy=False):
    """Label a corpus line by line with a saved pipeline and dump the result as JSON.

    Each line of ``corpus`` is stripped and run through the pipeline; lines in
    which no entity is found are left out. The number of kept lines is printed.

    Args:
        corpus: Path of the UTF-8 text file, one segment per line.
        ent_ruler_model: Value passed to ``spacy.load``.
        output_file: Path of the JSON file to write.
        prodigy: When equal to ``True``, each record is
            ``{"text": ..., "spans": [{"start", "end", "label", "text"}, ...]}``;
            otherwise ``[text, {"entities": [(start, end, label), ...]}]``.
    """
    nlp = spacy.load(ent_ruler_model)
    # Evaluated once; keeps the `== True` comparison used for the flag.
    as_prodigy = prodigy == True

    with open(corpus, "r", encoding="utf-8") as f:
        lines = f.read().split("\n")

    TRAIN_DATA = []
    for raw_line in lines:
        text = raw_line.strip()
        doc = nlp(text)
        if as_prodigy:
            found = [
                {"start": ent.start_char, "end": ent.end_char, "label": ent.label_, "text": ent.text}
                for ent in doc.ents
            ]
        else:
            found = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

        # Segments without any entity contribute nothing to the training set.
        if not found:
            continue

        if as_prodigy:
            TRAIN_DATA.append({"text": text, "spans": found})
        else:
            TRAIN_DATA.append([text, {"entities": found}])

    print (len(TRAIN_DATA))
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(TRAIN_DATA, f, indent=4)

In [15]:
# Label corpus.txt with the saved ruler and store the examples in the
# non-Prodigy format; the function prints how many segments were kept.
create_training_set("/content/text-analysis-for-ancient-and-medieval-languages/latin_data/corpus.txt", "/content/text-analysis-for-ancient-and-medieval-languages/models/latin_loc_per_group_ent_ruler", "/content/text-analysis-for-ancient-and-medieval-languages/training_data/training_set_spacy.json", prodigy=False)

388


In [16]:
from spacy.tokens import DocBin

In [17]:
# Read the generated training examples back from disk.
all_docs = load_data("/content/text-analysis-for-ancient-and-medieval-languages/training_data/training_set_spacy.json")

In [18]:
# Inspect one example: a [text, {"entities": [...]}] pair.
print (all_docs[2])

['[3] His rebus adducti et auctoritate Orgetorigis permoti constituerunt ea quae ad proficiscendum pertinerent comparare, iumentorum et carrorum quam maximum numerum coemere, sementes quam maximas facere, ut in itinere copia frumenti suppeteret, cum proximis civitatibus pacem et amicitiam confirmare. Ad eas res conficiendas biennium sibi satis esse duxerunt; in tertium annum profectionem lege confirmant. Ad eas res conficiendas Orgetorix deligitur. Is sibi legationem ad civitates suscipit. In eo itinere persuadet Castico, Catamantaloedis filio, Sequano, cuius pater regnum in Sequanis multos annos obtinuerat et a senatu populi Romani amicus appellatus erat, ut regnum in civitate sua occuparet, quod pater ante habuerit; itemque Dumnorigi Haeduo, fratri Diviciaci, qui eo tempore principatum in civitate obtinebat ac maxime plebi acceptus erat, ut idem conaretur persuadet eique filiam suam in matrimonium dat. Perfacile factu esse illis probat conata perficere, propterea quod ipse suae civit

In [19]:
# Positional split with no shuffling: the first 200 examples are used for
# training and everything after them for validation.
train_docs = all_docs[:200]
valid_docs = all_docs[200:]

In [20]:
from tqdm import tqdm

# Convert the training examples into spaCy Doc objects collected in a DocBin.
nlp = spacy.blank("en")
train_db = DocBin()
for text, annot in tqdm(train_docs):
    doc = nlp.make_doc(text)
    candidate_spans = (
        doc.char_span(start, end, label=label, alignment_mode="contract")
        for start, end, label in annot["entities"]
    )
    # char_span yields None when no span can be built for the offsets;
    # those annotations are dropped.
    doc.ents = [span for span in candidate_spans if span is not None]
    train_db.add(doc)

100%|██████████| 200/200 [00:00<00:00, 342.22it/s]


In [21]:
# Convert the held-out examples into spaCy Doc objects collected in their own
# DocBin, kept separate from the training DocBin.
valid_db = DocBin()
from tqdm import tqdm
nlp = spacy.blank("en")
for text, annot in tqdm(valid_docs):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        # char_span yields None when no span can be built for the offsets;
        # those annotations are dropped.
        if span is not None:
            ents.append(span)
    doc.ents = ents
    # Fix: these docs used to be added to train_db, which left valid_db empty
    # and put the validation examples into the training data.
    valid_db.add(doc)

100%|██████████| 188/188 [00:00<00:00, 327.26it/s]


In [22]:
# Write both DocBins into training_data/ as .spacy files.
train_db.to_disk("/content/text-analysis-for-ancient-and-medieval-languages/training_data/train_hs.spacy")
valid_db.to_disk("/content/text-analysis-for-ancient-and-medieval-languages/training_data/valid_hs.spacy")

In [24]:
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [26]:
!python -m spacy train config.cfg --output ./output

[38;5;4mℹ Using CPU[0m
[1m
[2021-11-19 14:39:01,437] [INFO] Set up nlp object from config
[2021-11-19 14:39:01,449] [INFO] Pipeline: ['tok2vec', 'ner']
[2021-11-19 14:39:01,455] [INFO] Created vocabulary
[2021-11-19 14:39:01,455] [INFO] Finished initializing nlp object
[2021-11-19 14:39:03,195] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     81.36    0.00    0.00    0.00    0.00
  0     200         54.14   2052.07    0.00    0.00    0.00    0.00
  1     400         71.68    487.11    0.00    0.00    0.00    0.00
  1     600         86.30    210.84    0.00    0.00    0.00    0.00
  2     800         85.00    175.18    0.00    0.00    0.00    0.00
  2    1000         72.63     92.70    0

In [27]:
# Load the pipeline saved at output/model-best; getSpacyEnts reads this global.
nlp = spacy.load("output/model-best")

In [46]:
from bs4 import BeautifulSoup

# Parse the XML edition. The context manager closes the file handle once the
# tree is built, and the encoding is stated explicitly like every other open()
# call in this notebook.
with open("/content/phi0474.phi056.perseus-lat1.xml", encoding="utf-8") as xml_file:
    soup = BeautifulSoup(xml_file, 'lxml')

In [80]:
def getSpacyEnts(text):
  """Return the entities the global ``nlp`` pipeline finds in ``text``.

  Args:
    text: String to analyse.

  Returns:
    A list of ``(entity_text, entity_label)`` tuples in document order.
  """
  return [(found.text, found.label_) for found in nlp(text).ents]

In [None]:
# for div in soup.find_all('div'):
#   for label in div.find_all('label'):
#     for seg in label.find_all('seg'):
#       if seg.attrs['rend'] == 'dateline':
#         print(getSpacyEnts(seg.text))
#   #print(div.attrs)

In [102]:
# Map "Letter <n>" to the raw text of every <div subtype="letter" n="...">.
letter_dict = {}

for div in soup.find_all('div'):
  # Divs without a subtype or an n attribute are skipped explicitly instead of
  # relying on a bare except, so unrelated errors are no longer hidden.
  if div.attrs.get('subtype') != 'letter' or 'n' not in div.attrs:
    continue
  # The text is stored as-is; newlines inside the letter are not collapsed.
  letter_dict[f'Letter {div.attrs["n"]}'] = div.text

In [105]:
# Show the raw text stored for letter 1.
letter_dict['Letter 1']

'\n\nScr. in itinere Patris Alyziam iii Non. Nov. a. 704 (50).\nTVLLIVS TIRONI SVO S. P. D. ET CICERO MEVS ET FRATER ET FRATRIS F.\n\n\nPaulo facilius putavi posse me ferre desiderium tui, sed \n\nplane non fero et, quamquam magni ad honorem nostrum interest quam primum ad urbem me venire, tamen peccasse mihi videor qui a te discesserim ; sed quia tua voluntas ea videbatur esse, ut prorsus nisi confirmato corpore nolles navigare, approbavi tuum consilium neque nunc muto, si\ntu in eadem es sententia ; sin autem, postea quam cibum cepisti, videris tibi posse me consequi, tuum consilium est. Marionem ad te eo misi, ut aut tecum ad me quam primum veniret aut, si tu morarere, statim ad me rediret. \n\n\n\ntu\n\nautem hoc tibi persuade, si commodo valetudinis tuae fieri\npossit, nihil me malle quam te esse mecum ; si autem intelleges opus esse te Patris convalescendi causa paulum commorari, nihil me malle quam te valere.  si statim navigas, nos Leucade consequere ; sin te confirmare vis, et

In [104]:
# Run the trained pipeline on letter 1 and list the (text, label) pairs it finds.
getSpacyEnts(letter_dict['Letter 1']) ## Needs tuning, but we're getting there!

[('Alyziam', 'PERSON'),
 ('S.', 'PERSON'),
 ('P.', 'PERSON'),
 ('D.', 'PERSON'),
 ('ET', 'PERSON'),
 ('ET', 'GROUP'),
 ('ET', 'GROUP'),
 ('F.', 'PERSON'),
 ('Paulo', 'PERSON'),
 ('Marionem', 'GROUP')]