This repository has been archived by the owner on Oct 17, 2018. It is now read-only.
/
core.py
1439 lines (1262 loc) · 46.2 KB
/
core.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import cgi
import logging
import os
import os.path
import re
from PIL import Image
from lxml import etree
from lxml.etree import XMLSyntaxError
from collections import namedtuple, defaultdict
from zipfile import ZipFile, BadZipfile
from docx2html.exceptions import (
ConversionFailed,
FileNotDocx,
MalformedDocx,
UnintendedTag,
SyntaxNotSupported,
)
# When True, font sizes are collected while reading the document and
# is_header consults them when deciding whether a paragraph is a header.
DETECT_FONT_SIZE = False
# Divisor applied to the cx/cy extents of a drawing to turn EMUs into pixels.
EMUS_PER_PIXEL = 9525
# Cache filled in by get_namespace: namespace prefix -> '{uri}' string.
NSMAP = {}
# Relationship targets ending in one of these strings are left out of the
# relationship dictionary.
IMAGE_EXTENSIONS_TO_SKIP = ['emf', 'wmf', 'svg']
logger = logging.getLogger(__name__)
###
# Helper functions
###
def replace_ext(file_path, new_ext):
    """
    Return ``file_path`` with its extension replaced by ``new_ext``.

    ``new_ext`` may be passed with or without the leading separator. Only the
    last path component is inspected, so a dot inside a directory name is
    never taken for the start of an extension. A path that has no extension
    (including a name that only starts with a dot) gets ``new_ext`` appended.

    >>> replace_ext('one/two/three.four.doc', '.html')
    'one/two/three.four.html'
    >>> replace_ext('one/two/three.four.DOC', '.html')
    'one/two/three.four.html'
    >>> replace_ext('one/two/three.four.DOC', 'html')
    'one/two/three.four.html'
    >>> replace_ext('one/two/three', 'html')
    'one/two/three.html'
    >>> replace_ext('one.two/three', 'html')
    'one.two/three.html'
    """
    if not new_ext.startswith(os.extsep):
        new_ext = os.extsep + new_ext
    # splitext returns an empty extension when there is none, so nothing is
    # chopped off the name in that case.
    root, _ = os.path.splitext(file_path)
    return root + new_ext
def ensure_tag(tags):
    """
    Decorator factory that only runs the decorated function when its first
    positional argument is an element whose tag is one of ``tags``.

    ``tags`` holds local tag names (for example ``['p']``); they are compared
    against the element's tag after being prefixed with the ``w`` namespace.
    When the first argument is None, or its tag is not in ``tags``, the
    decorated function is not called and None is returned instead.
    """
    # Imported here so the decorator does not depend on a module level import.
    import functools

    def wrapped(f):
        # Keep the name and docstring of the decorated function.
        @functools.wraps(f)
        def wrap(*args, **kwargs):
            passed_in_tag = args[0]
            if passed_in_tag is None:
                return None
            w_namespace = get_namespace(passed_in_tag, 'w')
            valid_tags = [
                '%s%s' % (w_namespace, t) for t in tags
            ]
            if passed_in_tag.tag in valid_tags:
                return f(*args, **kwargs)
            return None
        return wrap
    return wrapped
def get_namespace(el, namespace):
    """
    Return the ``'{uri}'`` string for the prefix ``namespace``.

    The value is read from ``el.nsmap`` the first time a prefix is requested
    and served from the module level ``NSMAP`` cache afterwards.
    """
    cached = NSMAP.get(namespace)
    if cached is None:
        cached = NSMAP[namespace] = '{%s}' % el.nsmap[namespace]
    return cached
def convert_image(target, image_size):
    """
    Re-save the image at ``target`` and return the path it was saved to.

    The image is resized to ``image_size`` when that is given and differs
    from the current size. Files with a bmp, dib, tiff or tif extension are
    saved as gif next to the original; everything else is saved back to
    ``target`` in its own format. ``target`` is returned unchanged when the
    image cannot be opened or saved.
    """
    extension = os.path.splitext(os.path.basename(target))[1]
    try:
        image = Image.open(target)
    except IOError:
        return target

    # Remember the format before resizing replaces the image object.
    output_format = image.format
    output_path = target

    size_differs = image_size is not None and image.size != image_size
    if size_differs:
        try:
            image = image.resize(image_size, Image.ANTIALIAS)
        except IOError:
            # Keep the image at its original size.
            pass

    # These types all need to be converted to gif.
    if extension.lower() in ('.bmp', '.dib', '.tiff', '.tif'):
        output_format = 'GIF'
        output_path = replace_ext(target, '.gif')

    # Save after resizing so the file on disk has the final size and format.
    try:
        image.save(output_path, output_format)
    except IOError:
        return target
    return output_path
@ensure_tag(['p'])
def get_font_size(p, styles_dict):
    """
    Return the font size value for the ``p`` tag, or None if it is unknown.

    The size is taken from the ``sz`` element of the first run's ``rPr``.
    When the run has no ``sz``, the paragraph's ``pStyle`` is looked up in
    ``styles_dict`` and the chain of ``based_on`` styles is followed until a
    style with a ``font_size`` is found. None is returned when the paragraph
    has no run, the run has no ``rPr``, or no style in the chain has a size.
    """
    w_namespace = get_namespace(p, 'w')
    val_attr = '%sval' % w_namespace
    r = p.find('%sr' % w_namespace)
    if r is None:
        return None
    rpr = r.find('%srPr' % w_namespace)
    if rpr is None:
        return None
    size = rpr.find('%ssz' % w_namespace)
    if size is not None:
        return size.get(val_attr)

    # Need to get the font size off the styleId
    pPr = p.find('%spPr' % w_namespace)
    if pPr is None:
        return None
    pStyle = pPr.find('%spStyle' % w_namespace)
    if pStyle is None:
        return None
    style_id = pStyle.get(val_attr)

    # Remember every style already looked at so styles that are (directly or
    # indirectly) based on each other cannot keep the loop running forever.
    visited = set()
    while style_id in styles_dict and style_id not in visited:
        visited.add(style_id)
        style = styles_dict[style_id]
        if style is None:
            return None
        font_size = style.get('font_size')
        if font_size is not None:
            return font_size
        if 'based_on' not in style:
            return None
        # Try the style the current one is based on.
        style_id = style['based_on']
    return None
@ensure_tag(['p'])
def is_natural_header(el, styles_dict):
    """
    Return the ``header`` value stored in ``styles_dict`` for the style of
    ``el``.

    False is returned when the paragraph has no ``pPr`` or no ``pStyle``.
    None is returned when the style is not in ``styles_dict``, has no
    ``header`` entry, or the entry is falsy.
    """
    w_namespace = get_namespace(el, 'w')
    paragraph_properties = el.find('%spPr' % w_namespace)
    if paragraph_properties is None:
        return False
    paragraph_style = paragraph_properties.find('%spStyle' % w_namespace)
    if paragraph_style is None:
        return False
    style_id = paragraph_style.get('%sval' % w_namespace)
    if style_id not in styles_dict:
        return None
    style = styles_dict[style_id]
    if 'header' not in style:
        return None
    return style['header'] or None
@ensure_tag(['p'])
def is_header(el, meta_data):
    """
    Decide whether the ``p`` tag ``el`` should be rendered as a header.

    Returns the header tag name to use (for example ``'h2'``) or False. The
    checks run in this order: top level upper roman list item, style marked
    as a header, list item (never a header), font size (only when
    ``DETECT_FONT_SIZE`` is on), and finally a short paragraph that is styled
    as a whole line.
    """
    if _is_top_level_upper_roman(el, meta_data):
        return 'h2'
    natural_header = is_natural_header(el, meta_data.styles_dict)
    if natural_header:
        return natural_header
    if _is_li(el):
        return False
    w_namespace = get_namespace(el, 'w')
    if el.tag == '%stbl' % w_namespace:
        return False

    # get_font_size is also used before meta_data exists, which is why it is
    # handed the styles_dict rather than the whole meta_data.
    if DETECT_FONT_SIZE:
        font_size = get_font_size(el, meta_data.styles_dict)
        if font_size is not None:
            header_for_size = meta_data.font_sizes_dict[font_size]
            if header_for_size:
                return header_for_size

    # A paragraph of more than eight space separated pieces is most likely
    # not meant to be an h tag.
    text = etree.tostring(el, encoding=unicode, method='text')
    if len(text.split(' ')) > 8:
        return False

    # Treat the paragraph as a header when the full line is styled.
    whole_line_bold, whole_line_italics = whole_line_styled(el)
    return 'h2' if (whole_line_bold or whole_line_italics) else False
@ensure_tag(['p'])
def _is_top_level_upper_roman(el, meta_data):
    """
    Return True if ``el`` is a list item at indentation level 0 whose list
    style in ``meta_data.numbering_dict`` is ``upperRoman``.

    A list id that is not present in the numbering dictionary is treated as
    "not upper roman" rather than raising, and looking it up does not add an
    entry to the dictionary.
    """
    w_namespace = get_namespace(el, 'w')
    ilvl = get_ilvl(el, w_namespace)
    # If this list is not in the root document (indentation of 0), then it
    # cannot be a top level upper roman list.
    if ilvl != 0:
        return False
    numId = get_numId(el, w_namespace)
    # .get works for both a plain dict and a defaultdict and never inserts.
    levels = meta_data.numbering_dict.get(numId, {})
    return levels.get(ilvl, False) == 'upperRoman'
@ensure_tag(['p'])
def _is_li(el):
    """
    Return True if the ``p`` tag contains a ``numPr`` with an ``ilvl`` in it.
    """
    ilvls = el.xpath('.//w:numPr/w:ilvl', namespaces=el.nsmap)
    return bool(ilvls)
@ensure_tag(['p'])
def is_li(el, meta_data):
    """
    Return True if the ``p`` tag should be treated as a list item.

    A list item is a ``p`` tag carrying a numPr (which holds the list id and
    the ilvl, the indentation level). A paragraph that is_header considers a
    header is never a list item, even when it has a numPr.
    """
    return False if is_header(el, meta_data) else _is_li(el)
def has_text(p):
    """
    Return True if the tag contains any text other than whitespace.

    A ``p`` tag in document.xml can be empty. Such tags should not get in the
    way of things like lists, so callers use this to skip them.
    """
    text = etree.tostring(p, encoding=unicode, method='text')
    return text.strip() != ''
def is_last_li(li, meta_data, current_numId):
    """
    Return True if ``li`` is the last list item of the list with id
    ``current_numId``.

    Following siblings that are not list items are skipped. The first list
    item found decides the answer: a different list id means ``li`` was the
    last one. Running out of siblings also means it was the last one.
    """
    if not is_li(li, meta_data):
        return False
    w_namespace = get_namespace(li, 'w')
    sibling = li.getnext()
    while sibling is not None:
        if is_li(sibling, meta_data):
            # Another item of the same list means ``li`` is not the last.
            return current_numId != get_numId(sibling, w_namespace)
        sibling = sibling.getnext()
    # No further list item follows.
    return True
@ensure_tag(['p'])
def get_li_nodes(li, meta_data):
    """
    Find consecutive li tags that have content that have the same list id.

    This is a generator. It yields ``li`` itself first and then walks the
    following siblings, yielding every element that belongs to the same list
    (including elements in between that carry no list id) until the list
    ends.
    """
    yield li
    w_namespace = get_namespace(li, 'w')
    current_numId = get_numId(li, w_namespace)
    starting_ilvl = get_ilvl(li, w_namespace)
    el = li
    while True:
        el = el.getnext()
        # Ran out of siblings.
        if el is None:
            break
        # If the tag has no content ignore it.
        if not has_text(el):
            continue
        # Stop the lists if you come across a list item that should be a
        # heading.
        if _is_top_level_upper_roman(el, meta_data):
            break
        # A list item that is less indented than the first item ends this
        # list.
        if (
                is_li(el, meta_data) and
                (starting_ilvl > get_ilvl(el, w_namespace))):
            break
        # get_numId returns None for a tag that is not a p and -1 for a p
        # without a numId.
        new_numId = get_numId(el, w_namespace)
        if new_numId is None or new_numId == -1:
            # Not a p tag or a list item
            yield el
            continue
        # If the list id of the next tag is different that the previous that
        # means a new list being made (not nested)
        if current_numId != new_numId:
            # Not a subsequent list.
            break
        # The last item of the list is still yielded before stopping.
        if is_last_li(el, meta_data, current_numId):
            yield el
            break
        yield el
@ensure_tag(['p'])
def get_ilvl(li, w_namespace):
    """
    Return the indentation level (ilvl) of the li tag as an int, or -1 when
    the tag has no ilvl.

    The level is what decides whether a list item has to be nested or not.
    """
    matches = li.xpath('.//w:ilvl', namespaces=li.nsmap)
    if not matches:
        return -1
    first = matches[0]
    return int(first.get('%sval' % w_namespace))
@ensure_tag(['p'])
def get_numId(li, w_namespace):
    """
    Return the list id (numId) of the li tag, or -1 when the tag has none.

    Together with the ilvl, the numId is the key into the numbering
    dictionary that says what the list should look like (unordered, digits,
    lower alpha, etc).
    """
    matches = li.xpath('.//w:numId', namespaces=li.nsmap)
    if not matches:
        return -1
    first = matches[0]
    return first.get('%sval' % w_namespace)
def create_list(list_type):
    """
    Create the list element for ``list_type``.

    ``'bullet'`` gives a ``ul``. Everything else gives an ``ol`` whose
    ``data-list-type`` attribute holds the css list style; list types that
    are not supported fall back to ``decimal``.
    """
    if list_type == 'bullet':
        return etree.Element('ul')

    # Supported list style types and the css value each one becomes.
    css_list_styles = {
        'decimal': 'decimal',
        'decimalZero': 'decimal-leading-zero',
        'upperRoman': 'upper-roman',
        'lowerRoman': 'lower-roman',
        'upperLetter': 'upper-alpha',
        'lowerLetter': 'lower-alpha',
        'ordinal': 'decimal',
        'cardinalText': 'decimal',
        'ordinalText': 'decimal',
    }
    ordered_list = etree.Element('ol')
    ordered_list.set(
        'data-list-type',
        css_list_styles.get(list_type, 'decimal'),
    )
    return ordered_list
@ensure_tag(['tc'])
def get_v_merge(tc):
    """
    Return the vMerge element of the table cell, or None unless the cell
    contains exactly one.

    vMerge is how docx marks a cell as part of a rowspan. The cell that
    starts the rowspan has a vMerge of 'restart'; any other value means the
    cell continues a rowspan started above it.
    """
    if tc is None:
        return None
    found = tc.xpath('.//w:vMerge', namespaces=tc.nsmap)
    return found[0] if len(found) == 1 else None
@ensure_tag(['tc'])
def get_grid_span(tc):
    """
    Return the colspan of the table cell as an int.

    docx stores a colspan as gridSpan, which maps one-to-one to colspan. 1 is
    returned unless the cell contains exactly one gridSpan element.
    """
    found = tc.xpath('.//w:gridSpan', namespaces=tc.nsmap)
    if len(found) == 1:
        w_namespace = get_namespace(tc, 'w')
        return int(found[0].get('%sval' % w_namespace))
    return 1
@ensure_tag(['tr'])
def get_td_at_index(tr, index):
    """
    Return the cell of ``tr`` that starts at column ``index``, or None.

    Colspans are taken into account: every cell moves the column position
    forward by its grid span. This is needed when working out a rowspan,
    where the cells 'below' a cell with a v_merge have to be found.
    """
    position = 0
    for cell in tr.xpath('.//w:tc', namespaces=tr.nsmap):
        if position == index:
            return cell
        position += get_grid_span(cell)
    return None
@ensure_tag(['tbl'])
def get_rowspan_data(table):
    """
    Yield the rowspan of every cell in ``table`` that starts a vertical
    merge, in document order.

    A cell starts a merge when its vMerge value is 'restart'. Its rowspan is
    1 plus the number of rows directly below it whose cell at the same
    column has a vMerge that is not 'restart'.
    """
    w_namespace = get_namespace(table, 'w')
    val_attr = '%sval' % w_namespace
    rows = list(table.xpath('.//w:tr', namespaces=table.nsmap))
    for row_number, tr in enumerate(rows):
        # The column position starts over on every row.
        column = 0
        for td in tr.xpath('.//w:tc', namespaces=tr.nsmap):
            v_merge = get_v_merge(td)
            if v_merge is not None and v_merge.get(val_attr) == 'restart':
                # This is the ``root`` cell, the one that needs the rowspan.
                row_span = 1
                for lower_tr in rows[row_number + 1:]:
                    lower_td = get_td_at_index(lower_tr, column)
                    lower_v_merge = get_v_merge(lower_td)
                    # No v_merge below means the rowspan is done.
                    if lower_v_merge is None:
                        break
                    # 'restart' below belongs to the next rowspan.
                    if lower_v_merge.get(val_attr) == 'restart':
                        break
                    row_span += 1
                yield row_span
            # Move past this cell, taking its colspan into account.
            column += get_grid_span(td)
@ensure_tag(['b', 'i', 'u'])
def style_is_false(style):
    """
    Return True if the bold, italics or underline tag turns its style on.

    Despite the name, the result is True when the style is *not* switched
    off. Checking that the tag is present is not enough: a tag whose ``val``
    is an "off" value means the style should not be applied. Besides
    ``false``, the values ``0`` and ``off`` and the underline value ``none``
    are treated as off. A tag without a ``val`` turns the style on.

    None is returned when ``style`` is None or not a b/i/u tag, because the
    decorator does not call the function in that case.
    """
    w_namespace = get_namespace(style, 'w')
    value = style.get('%sval' % w_namespace)
    return value not in ('false', '0', 'off', 'none')
@ensure_tag(['r'])
def is_bold(r):
    """
    Tell whether the r tag passed in is considered bold.

    False is returned when the run has no ``rPr``. Otherwise the answer of
    style_is_false for the ``b`` element of the ``rPr`` is returned.
    """
    w_namespace = get_namespace(r, 'w')
    run_properties = r.find('%srPr' % w_namespace)
    if run_properties is not None:
        return style_is_false(run_properties.find('%sb' % w_namespace))
    return False
@ensure_tag(['r'])
def is_italics(r):
    """
    Tell whether the r tag passed in is considered italicized.

    False is returned when the run has no ``rPr``. Otherwise the answer of
    style_is_false for the ``i`` element of the ``rPr`` is returned.
    """
    w_namespace = get_namespace(r, 'w')
    run_properties = r.find('%srPr' % w_namespace)
    if run_properties is not None:
        return style_is_false(run_properties.find('%si' % w_namespace))
    return False
@ensure_tag(['r'])
def is_underlined(r):
    """
    Tell whether the r tag passed in is considered underlined.

    False is returned when the run has no ``rPr``. Otherwise the answer of
    style_is_false for the ``u`` element of the ``rPr`` is returned.
    """
    w_namespace = get_namespace(r, 'w')
    run_properties = r.find('%srPr' % w_namespace)
    if run_properties is not None:
        return style_is_false(run_properties.find('%su' % w_namespace))
    return False
@ensure_tag(['p'])
def is_title(p):
    """
    Return True if the p tag is denoted as a ``Title``, that is when the
    first ``pStyle`` found inside it has the value 'Title'.
    """
    w_namespace = get_namespace(p, 'w')
    paragraph_styles = p.xpath('.//w:pStyle', namespaces=p.nsmap)
    if not paragraph_styles:
        return False
    return paragraph_styles[0].get('%sval' % w_namespace) == 'Title'
@ensure_tag(['r'])
def get_raw_data(r):
    """
    Yield the t, drawing, pict and br children of the r tag in the order in
    which they are found.

    An r tag can hold text as well as images, and both are needed.
    """
    w_namespace = get_namespace(r, 'w')
    wanted_tags = tuple(
        '%s%s' % (w_namespace, name)
        for name in ('t', 'drawing', 'pict', 'br')
    )
    for child in r:
        if child.tag in wanted_tags:
            yield child
@ensure_tag(['drawing', 'pict'])
def get_image_id(drawing):
    """
    Return the relationship id of the image inside a drawing or pict tag.

    Every element below ``drawing`` (and ``drawing`` itself) is checked. The
    first ``r:embed`` attribute found is returned, which is how a drawing
    references its image. For a pict, the ``r:id`` attribute of a
    ``v:imagedata`` element is returned. None is returned when neither is
    found.
    """
    r_namespace = get_namespace(drawing, 'r')
    for el in drawing.iter():
        # For drawing
        image_id = el.get('%sembed' % r_namespace)
        if image_id is not None:
            return image_id
        # For pict
        if 'v' not in el.nsmap:
            continue
        # Resolve the prefix from the element that was just checked; the
        # outer tag does not necessarily declare it.
        v_namespace = get_namespace(el, 'v')
        if el.tag == '%simagedata' % v_namespace:
            image_id = el.get('%sid' % r_namespace)
            if image_id is not None:
                return image_id
    return None
@ensure_tag(['p'])
def whole_line_styled(p):
    """
    Check whether the whole p tag will end up being bold or italics.

    Returns a tuple (boolean, boolean). The first is True when every run in
    the paragraph is bold or underlined, the second is True when every run
    is italics. Both are True for a paragraph without any runs.
    """
    runs = p.xpath('.//w:r', namespaces=p.nsmap)
    every_run_bold = all(is_bold(r) or is_underlined(r) for r in runs)
    every_run_italics = all(is_italics(r) for r in runs)
    return every_run_bold, every_run_italics
# Everything that is worked out while reading the docx file and is needed
# later on, bundled so it can be passed around as one value.
MetaData = namedtuple(
    'MetaData',
    'numbering_dict relationship_dict styles_dict font_sizes_dict '
    'image_handler image_sizes',
)
###
# Pre-processing
###
def get_numbering_info(tree):
    """
    Build the lookup of list styles from numbering.xml.

    numbering.xml stores how lists should look (unordered, digits, lower
    case letters, etc.). The result maps list id -> indentation level ->
    list style. It is always a ``defaultdict(dict)``, also when ``tree`` is
    None, so an unknown list id can be indexed safely.

    Several lists may point at the same abstractNum; each of them gets the
    styles of that abstractNum. Levels without a numFmt are skipped.
    """
    result = defaultdict(dict)
    if tree is None:
        return result
    w_namespace = get_namespace(tree, 'w')
    val_attr = '%sval' % w_namespace

    # First find all the list types. Each one is assigned an abstractNum
    # that defines how the list should look, so collect every list id per
    # abstractNum id.
    num_ids = defaultdict(list)
    for list_type in tree.findall('%snum' % w_namespace):
        list_id = list_type.get('%snumId' % w_namespace)
        abstract_number = list_type.find('%sabstractNumId' % w_namespace)
        if abstract_number is None:
            continue
        num_ids[abstract_number.get(val_attr)].append(list_id)

    # Loop through all the abstractNumbers
    for abstract_number in tree.findall('%sabstractNum' % w_namespace):
        abstract_num_id = abstract_number.get('%sabstractNumId' % w_namespace)
        # If we find an abstractNumber that is not being used in the document
        # then ignore it.
        if abstract_num_id not in num_ids:
            continue
        # Get the level of the abstract number.
        for lvl in abstract_number.findall('%slvl' % w_namespace):
            lvl_format = lvl.find('%snumFmt' % w_namespace)
            if lvl_format is None:
                continue
            ilvl = int(lvl.get('%silvl' % w_namespace))
            list_style = lvl_format.get(val_attr)
            # Based on the list type and the ilvl (indentation level) store
            # the needed style for every list that uses this abstractNum.
            for list_id in num_ids[abstract_num_id]:
                result[list_id][ilvl] = list_style
    return result
def get_style_dict(tree):
    """
    Build the lookup of style information from styles.xml.

    Some things that are considered lists are actually supposed to be H tags
    (h1, h2, etc.); these can be denoted by their styleId. The result maps
    styleId to a dict with the keys ``header`` (the h tag to use, or False),
    ``font_size`` (the ``sz`` value of the style's ``rPr``, or None) and
    ``based_on`` (the id of the parent style, or None).

    Elements without a ``name`` child are skipped. A style without an
    ``rPr`` is still recorded, with a ``font_size`` of None. An empty dict is
    returned when ``tree`` is None.
    """
    # This is a partial document and actual h1 is the document title, which
    # will be displayed elsewhere.
    headers = {
        'heading 1': 'h2',
        'heading 2': 'h3',
        'heading 3': 'h4',
        'heading 4': 'h5',
        'heading 5': 'h6',
        'heading 6': 'h6',
        'heading 7': 'h6',
        'heading 8': 'h6',
        'heading 9': 'h6',
        'heading 10': 'h6',
    }
    if tree is None:
        return {}
    w_namespace = get_namespace(tree, 'w')
    val_attr = '%sval' % w_namespace
    result = {}
    for el in tree:
        name = el.find('%sname' % w_namespace)
        if name is None:
            continue
        el_result = {
            'header': False,
            'font_size': None,
            'based_on': None,
        }

        # Get the header info
        value = (name.get(val_attr) or '').lower()
        if value in headers:
            el_result['header'] = headers[value]

        # Get the size info. A missing rPr only means there is no size; the
        # header and based on info must still be kept.
        rpr = el.find('%srPr' % w_namespace)
        if rpr is not None:
            size = rpr.find('%ssz' % w_namespace)
            if size is not None:
                el_result['font_size'] = size.get(val_attr)

        # Get based on info.
        based_on = el.find('%sbasedOn' % w_namespace)
        if based_on is not None:
            el_result['based_on'] = based_on.get(val_attr)

        result[el.get('%sstyleId' % w_namespace)] = el_result
    return result
def get_image_sizes(tree):
    """
    Return a dict of image id -> (width, height) for the drawings in
    ``tree``.

    The size comes from the ``cx`` and ``cy`` attributes of the ``ext``
    element inside each ``xfrm`` of a drawing, divided by
    ``EMUS_PER_PIXEL``. Both values are whole numbers. An ``xfrm`` without
    an ``ext`` is skipped, and an empty dict is returned when ``tree`` is
    None.
    """
    result = {}
    if tree is None:
        return result
    w_namespace = get_namespace(tree, 'w')
    drawing_tag = '%sdrawing' % w_namespace
    drawings = [el for el in tree.iter() if el.tag == drawing_tag]
    for d in drawings:
        for el in d.iter():
            if 'a' not in el.nsmap:
                continue
            a_namespace = get_namespace(el, 'a')
            if el.tag != '%sxfrm' % a_namespace:
                continue
            ext = el.find('%sext' % a_namespace)
            if ext is None:
                continue
            # Floor division keeps the sizes whole numbers regardless of how
            # the interpreter treats ``/`` for integers.
            cx = int(ext.get('cx')) // EMUS_PER_PIXEL
            cy = int(ext.get('cy')) // EMUS_PER_PIXEL
            result[get_image_id(d)] = (cx, cy)
    return result
def get_relationship_info(tree, media, image_sizes):
    """
    Build the lookup of relationship id -> target.

    A separate file holds the targets of links as well as the targets of
    images. Every element of ``tree`` that has an ``Id`` and a ``Target`` is
    recorded, except targets ending in one of ``IMAGE_EXTENSIONS_TO_SKIP``.
    A target that is a key of ``media`` is an extracted image: it is passed
    through convert_image (with its size from ``image_sizes``, if any) and
    the path returned is used as the target. Targets are html escaped.

    An empty dict is returned when ``tree`` is None.
    """
    if tree is None:
        return {}
    # escape replaces &, < and > the same way cgi.escape does, and it is
    # available on interpreters that no longer ship cgi.escape.
    from xml.sax.saxutils import escape

    skipped_extensions = tuple(IMAGE_EXTENSIONS_TO_SKIP)
    result = {}
    # Loop through each relationship.
    for el in tree.iter():
        el_id = el.get('Id')
        if el_id is None:
            continue
        target = el.get('Target')
        # Nothing can be stored for a relationship without a target.
        if target is None:
            continue
        if target.lower().endswith(skipped_extensions):
            continue
        if target in media:
            image_size = image_sizes.get(el_id)
            target = convert_image(media[target], image_size)
        # Replace things like & < > with &amp; &lt; &gt;
        result[el_id] = escape(target)
    return result
def get_font_sizes_dict(tree, styles_dict):
    """
    Return a dict of font size -> ``'h2'`` or None.

    The font size used by the most paragraphs is considered the 'default'
    size. Any size larger than it maps to ``'h2'``; every other size maps to
    None. Paragraphs that are natural headers or list items, and paragraphs
    without a known font size, are not counted.
    """
    # Count how often each font size is used.
    usage = defaultdict(int)
    for p in tree.xpath('//w:p', namespaces=tree.nsmap):
        if is_natural_header(p, styles_dict) or _is_li(p):
            continue
        font_size = get_font_size(p, styles_dict)
        if font_size is not None:
            usage[font_size] += 1

    # Find the most used font size; the first one wins when counts are tied.
    most_used_font_size = -1
    if usage:
        most_used_font_size = max(
            usage.items(),
            key=lambda pair: pair[1],
        )[0]

    result = {}
    for size in usage:
        if size is None:
            continue
        # Larger than the default size means the paragraph is an h tag.
        is_larger = int(size) > int(most_used_font_size)
        result[size] = 'h2' if is_larger else None
    return result
def _get_document_data(f, image_handler=None):
    '''
    ``f`` is a ``ZipFile`` that is open

    Extract out the document data, numbering data, styles data and the
    relationship data, and extract every file under ``word/media/`` into the
    directory ``f`` lives in.

    ``image_handler`` is called with an image id and the relationship dict
    and returns what to use for that image; by default it returns the
    relationship target. Returns a tuple of the parsed document.xml and the
    ``MetaData``. ``f`` is always closed before returning, also when reading
    or parsing fails.
    '''
    if image_handler is None:
        def image_handler(image_id, relationship_dict):
            return relationship_dict.get(image_id)

    document_xml = None
    numbering_xml = None
    relationship_xml = None
    styles_xml = None
    media = {}
    try:
        # NOTE(review): the parser is created with default entity handling;
        # confirm that is acceptable for documents from untrusted sources.
        parser = etree.XMLParser(strip_cdata=False)
        path, _ = os.path.split(f.filename)
        # Loop through the files in the zip file.
        for item in f.infolist():
            # This file holds all the content of the document.
            if item.filename == 'word/document.xml':
                xml = f.read(item.filename)
                document_xml = etree.fromstring(xml, parser)
            # This file tells document.xml how lists should look.
            elif item.filename == 'word/numbering.xml':
                xml = f.read(item.filename)
                numbering_xml = etree.fromstring(xml, parser)
            elif item.filename == 'word/styles.xml':
                xml = f.read(item.filename)
                styles_xml = etree.fromstring(xml, parser)
            # This file holds the targets for hyperlinks and images.
            elif item.filename == 'word/_rels/document.xml.rels':
                xml = f.read(item.filename)
                try:
                    relationship_xml = etree.fromstring(xml, parser)
                except XMLSyntaxError:
                    # Carry on with an empty set of relationships.
                    relationship_xml = etree.fromstring('<xml></xml>', parser)
            if item.filename.startswith('word/media/'):
                # Strip off the leading word/
                media[item.filename[len('word/'):]] = f.extract(
                    item.filename,
                    path,
                )
    finally:
        # Close the file pointer, whether or not everything could be read.
        f.close()

    # Get dictionaries for the numbering and the relationships.
    numbering_dict = get_numbering_info(numbering_xml)
    image_sizes = get_image_sizes(document_xml)
    relationship_dict = get_relationship_info(
        relationship_xml,
        media,
        image_sizes
    )
    styles_dict = get_style_dict(styles_xml)
    font_sizes_dict = defaultdict(int)
    if DETECT_FONT_SIZE:
        font_sizes_dict = get_font_sizes_dict(document_xml, styles_dict)
    meta_data = MetaData(
        numbering_dict=numbering_dict,
        relationship_dict=relationship_dict,
        styles_dict=styles_dict,
        font_sizes_dict=font_sizes_dict,
        image_handler=image_handler,
        image_sizes=image_sizes,
    )
    return document_xml, meta_data
###
# HTML Building functions
###
def get_list_data(li_nodes, meta_data):
"""
Build the list structure and return the root list
"""
# Need to keep track of all incomplete nested lists.
ol_dict = {}
# Need to keep track of the current indentation level.
current_ilvl = -1
# Need to keep track of the current list id.
current_numId = -1
# Need to keep track of list that new li tags should be added too.
current_ol = None
# Store the first list created (the root list) for the return value.
root_ol = None
visited_nodes = []
list_contents = []
def _build_li(list_contents):
data = '<br />'.join(t for t in list_contents if t is not None)
return etree.XML('<li>%s</li>' % data)
def _build_non_li_content(el, meta_data):
w_namespace = get_namespace(el, 'w')
if el.tag == '%stbl' % w_namespace:
new_el, visited_nodes = get_table_data(el, meta_data)
return etree.tostring(new_el), visited_nodes
elif el.tag == '%sp' % w_namespace:
return get_p_data(el, meta_data), [el]
if has_text(el):
raise UnintendedTag('Did not expect %s' % el.tag)
def _merge_lists(ilvl, current_ilvl, ol_dict, current_ol):
for i in reversed(range(ilvl, current_ilvl)):
# Any list that is more indented that ilvl needs to
# be merged to the list before it.
if i not in ol_dict:
continue
if ol_dict[i] is not current_ol:
if ol_dict[i] is current_ol:
continue
ol_dict[i][-1].append(current_ol)
current_ol = ol_dict[i]
# Clean up finished nested lists.
for key in list(ol_dict):
if key > ilvl:
del ol_dict[key]
return current_ol
for li_node in li_nodes:
w_namespace = get_namespace(li_node, 'w')
if not is_li(li_node, meta_data):
# Get the content and visited nodes
new_el, el_visited_nodes = _build_non_li_content(
li_node,
meta_data,
)
list_contents.append(new_el)
visited_nodes.extend(el_visited_nodes)
continue
if list_contents:
li_el = _build_li(list_contents)
list_contents = []
current_ol.append(li_el)
# Get the data needed to build the current list item
list_contents.append(get_p_data(
li_node,
meta_data,
))
ilvl = get_ilvl(li_node, w_namespace)
numId = get_numId(li_node, w_namespace)
list_type = meta_data.numbering_dict[numId].get(ilvl, 'decimal')
# If the ilvl is greater than the current_ilvl or the list id is
# changing then we have the first li tag in a nested list. We need to
# create a new list object and update all of our variables for keeping
# track.
if (ilvl > current_ilvl) or (numId != current_numId):
# Only create a new list
ol_dict[ilvl] = create_list(list_type)
current_ol = ol_dict[ilvl]
current_ilvl = ilvl
current_numId = numId
# Both cases above are not True then we need to close all lists greater
# than ilvl and then remove them from the ol_dict
else:
# Merge any nested lists that need to be merged.
current_ol = _merge_lists(
ilvl=ilvl,
current_ilvl=current_ilvl,
ol_dict=ol_dict,
current_ol=current_ol,
)
# Set the root list after the first list is created.
if root_ol is None:
root_ol = current_ol
# Set the current list.
if ilvl in ol_dict:
current_ol = ol_dict[ilvl]
else:
# In some instances the ilvl is not in the ol_dict, if that is the
# case, create it here (not sure how this happens but it has
# before.)
ol_dict[ilvl] = create_list(list_type)
current_ilvl = ilvl
current_numId = numId
current_ol = ol_dict[ilvl]
# Create the li element.