Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Add formatting correction script

This script is designed to take the output of InDesign's "Export XML" command and turn it into nicely formatted XML. It also cleans up some common formatting problems along the way. Features:
-Fix incredibly stupid InDesign newline encoding (neither \n nor \r\n).
-Replace simple &-codes (&quot; for example) with ASCII characters for easier reading.
-Fix em and en dash usage!
-Get rid of double spaces
  • Loading branch information...
commit b921d1f22627d4993bdabf9df5bee8b1bfad899c 1 parent 48f7afd
@Sagelt authored
Showing with 62 additions and 0 deletions.
  1. +62 −0 resources/correctformatting.py
View
62 resources/correctformatting.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+# -*- coding: latin-1 -*-
+import re
+import sys
+from optparse import OptionParser
+
+# Set up command-line options: -s/--safe writes the result to a copy instead of overwriting each input file.
+parser = OptionParser(usage="Usage: %prog [options] files")
+parser.add_option('-s', '--safe', action="store_true", dest="safe",
+ help="Instead of overwriting files, store output to a copy of each file.")
+(options, args) = parser.parse_args()  # args holds the positional file names to process
+
+# Set up general replacements to be made in the text, as (compiled regex, replacement string) pairs applied in list order.
+replacements = []
+
+# InDesign encodes apostrophes as &apos; which isn't actually needed to re-import, so undo it
+# to preserve readability
+replacements.append( (re.compile(r'\&apos\;'), "'") );
+
+# InDesign turns smart quotes into &quot; but we just want dumb quotes for the XML.
+replacements.append( (re.compile(r'\&quot\;'), "\"") );
+
+# I'm a stickler for proper en-dash usage. Any numerical range should be an en-dash, not a
+# hyphen.
+replacements.append( (re.compile(r'(?<=\d)-(?=\d)'), "") );  # NOTE(review): replacement is empty as shown, so the hyphen is deleted; presumably an en-dash literal was lost - confirm
+
+# Also a stickler for em-dash usage. This catches the common internet patterns of " - "
+# and " -- " and replaces them with a proper em-dash
+replacements.append( (re.compile(r' -{1,2} '), ""))  # NOTE(review): replacement is empty as shown; presumably an em-dash literal was lost - confirm
+replacements.append( (re.compile(r''), ""))  # NOTE(review): empty pattern and replacement as shown (a no-op); original literals presumably lost - confirm
+replacements.append( (re.compile(r''), ""))  # NOTE(review): same as the line above - confirm against the real file
+
+# Adam is the king of the doublespace. Get rid of it.
+replacements.append( (re.compile(r' '), " ") )  # NOTE(review): pattern shows a single space here; presumably two spaces originally - confirm
+
+
+for filename in args:  # process each file named on the command line independently
+ infile = open(filename, "r")  # NOTE(review): the byte-wise ord() checks below presumably rely on Python 2 byte strings - confirm
+ output = []  # output fragments for this file; joined once when written
+ for line in infile:
+ fixedline = line
+ for replacement in replacements:
+ fixedline = replacement[0].sub(replacement[1], fixedline)  # apply each (pattern, replacement) pair in order
+ index = 0
+ while index < len(fixedline):
+ if(ord(fixedline[index]) == 226 and ord(fixedline[index+1]) == 128 and ord(fixedline[index+2]) == 169):  # bytes E2 80 A9 = UTF-8 for U+2029 PARAGRAPH SEPARATOR; NOTE(review): index+1/index+2 raise IndexError if a 226 byte sits in the last two positions
+ output.append("\n")  # emit a plain newline in place of the three-byte sequence
+ index += 3
+ else:
+ output.append(fixedline[index])  # copy every other character through unchanged
+ index += 1
+ infile.close()
+
+ outfile = None
+ if options.safe:
+ # This is a really ugly way to do this, and it isn't absolutely safe (file could
+ # already exist), but it works for now.
+ outfile = open(".out.".join(filename.split(".")), "w")  # "a.xml" -> "a.out.xml"; NOTE(review): a name with no "." is returned unchanged, so the input is overwritten
+ else:
+ outfile = open(filename, "w")
+
+ outfile.write("".join(output))  # NOTE(review): outfile is never explicitly closed
Please sign in to comment.
Something went wrong with that request. Please try again.