Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This script is designed to take the output of InDesign's "Export XML" command and turn it into nicely formatted XML. It also cleans up some common formatting problems along the way. Features: -Fix incredibly stupid InDesign newline encoding (not \n or \cr\n). -Replace simple & codes (" for example) with ascii characters for easier reading. -Fix em and en dash usage! -Get rid of doublespaces
- Loading branch information
Showing
1 changed file
with
62 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: latin-1 -*- | ||
import re | ||
import sys | ||
from optparse import OptionParser | ||
|
||
# Setup options | ||
parser = OptionParser(usage="Usage: %prog [options] files") | ||
parser.add_option('-s', '--safe', action="store_true", dest="safe", | ||
help="Instead of overwriting files, store output to a copy of each file.") | ||
(options, args) = parser.parse_args() | ||
|
||
# Setup general replacements to be made in the text | ||
replacements = [] | ||
|
||
# InDesign encodes apostrophese as ' which isn't actually needed to re-import, so undo it | ||
# to preserve readability | ||
replacements.append( (re.compile(r'\&apos\;'), "'") ); | ||
|
||
# InDesign encodes turns smart quotes into " but we just want dumb quotes for the XML. | ||
replacements.append( (re.compile(r'\"\;'), "\"") ); | ||
|
||
# I'm a stickler for proper en-dash usage. Any numerical range should be an en-dash, not a | ||
# hyphen. | ||
replacements.append( (re.compile(r'(?<=\d)-(?=\d)'), "–") ); | ||
|
||
# Also a stickler for em-dash usage. This catches the common internet patterns of " - " | ||
# and " -- " and replaces them with a proper em-dash | ||
replacements.append( (re.compile(r' -{1,2} '), "—")) | ||
replacements.append( (re.compile(r' — '), "—")) | ||
replacements.append( (re.compile(r' – '), "—")) | ||
|
||
# Adam is the king of the doublespace. Get rid of it. | ||
replacements.append( (re.compile(r' '), " ") ) | ||
|
||
|
||
for filename in args: | ||
infile = open(filename, "r") | ||
output = [] | ||
for line in infile: | ||
fixedline = line | ||
for replacement in replacements: | ||
fixedline = replacement[0].sub(replacement[1], fixedline) | ||
index = 0 | ||
while index < len(fixedline): | ||
if(ord(fixedline[index]) == 226 and ord(fixedline[index+1]) == 128 and ord(fixedline[index+2]) == 169): | ||
output.append("\n") | ||
index += 3 | ||
else: | ||
output.append(fixedline[index]) | ||
index += 1 | ||
infile.close() | ||
|
||
outfile = None | ||
if options.safe: | ||
# This is a really ugly way to do this, and it isn't absolutely safe (file could | ||
# already exist), but it works for now. | ||
outfile = open(".out.".join(filename.split(".")), "w") | ||
else: | ||
outfile = open(filename, "w") | ||
|
||
outfile.write("".join(output)) |