-
Notifications
You must be signed in to change notification settings - Fork 0
/
div_alone.py
93 lines (68 loc) · 2.31 KB
/
div_alone.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
""" Div1 / Div2 old Perseus data to Div/Div + RefsDecl converter
Authors : Aaron Plasek, Ariane Pinche, Mark Moll, Ana Migowski
Adaptation : Thibault Clérice
Python 3 Script
Description :
This software will transform old Perseus files into CTS compliant files if their structure is div1/div2 based
Example of file needing this :
https://raw.githubusercontent.com/PerseusDL/canonical-greekLit/598aea1eb719be1709f720839e4428a087e43ad6/data/tlg0612/tlg001/tlg0612.tlg001.perseus-grc1.xml
Example of output :
Syntax :
python3 div_alone.py [Url of the original file on raw.github]
Requires :
- requests
- lxml
- MyCapytain
pip install ...
"""
# Import command line informations
from sys import argv
# Import required library
from lxml import etree
import requests
# Import library for CTS
import MyCapytain.resources.texts.tei
import commontei as common
def transform(url):
    """ Make the numbered divs of a Perseus XML file CTS compliant and write the result

    The file is obtained through ``common.parse``. Every ``tei:div`` carrying an
    ``n`` attribute has its former ``type`` value moved to ``subtype`` and its
    ``type`` set to ``textpart``. A single-level citation scheme, named after
    that former type, is then passed to ``common.write_and_clean`` with the tree.

    :param url: A Perseus Github Raw address
    :type url: str
    :raises ValueError: If no selected div provides a ``type`` value to name the citation level
    """
    lang, urn, target, parsed = common.parse(url)

    # A URN mentioning neither "grc" nor "lat" is handled as a translation
    type_text = "edition" if ("grc" in urn or "lat" in urn) else "translation"

    # We find the divs that carry an @n attribute
    div1_group = parsed.xpath("//tei:div[@n]", namespaces=common.ns)

    # Former type of the most recent div that had one; it names the citation level
    div1_subtype = None
    for i, div1 in enumerate(div1_group, start=1):
        # To deal with different subtypes, the former value of @type is moved to @subtype.
        # lxml refuses None as an attribute value, so a div without @type is left without @subtype.
        former_type = div1.get("type")
        if former_type is not None:
            div1.set("subtype", former_type)
            div1_subtype = former_type
        div1.set("type", "textpart")
        # NOTE(review): the XPath above already requires @n, so this fallback
        # numbering is currently never taken; kept in case the selection is widened.
        if div1.get("n") is None:
            div1.set("n", str(i))

    if div1_subtype is None:
        # Previously this surfaced as a NameError (no div matched) or a TypeError (div without @type)
        raise ValueError(
            urn + " has no tei:div with both @n and @type to build a citation from"
        )

    # Add refsDecl information for CTS
    citations = [
        MyCapytain.resources.texts.tei.Citation(
            name=div1_subtype,
            refsDecl="/tei:TEI/tei:text/tei:body/tei:div[@type='"+type_text+"']/tei:div[@n='$1']"
        )
    ]

    common.write_and_clean(urn, lang, parsed, citations, target)

    # NOTE(review): looks like a leftover debug marker; kept because it is visible output
    print("hi")
# Script entry point: every command line argument after the script name
# is forwarded positionally to transform()
if __name__ == '__main__':
    cli_arguments = argv[1:]
    transform(*cli_arguments)