Normation · VinceMacBuche · Sep 29, 2015 · Sep 15, 2015
diff --git a/tests/unit/test_ncf.py b/tests/unit/test_ncf.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
 
 import unittest
 import ncf

diff --git a/tests/unit/test_ncf_rudder.py b/tests/unit/test_ncf_rudder.py
@@ -1,10 +1,12 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
 
 import unittest
 import ncf
 import ncf_rudder
 import re
 import os.path
+import sys
 import xml.etree.cElementTree as XML
 from pprint import pprint
 
@@ -146,5 +148,24 @@ def test_category_xml_content(self):
   # ncf_rudder.write_technique_for_rudder(path, technique_metadata)
   # ncf_rudder.write_all_techniques_for_rudder(path)
 
+
+  def test_canonify(self):
+    result = ncf_rudder.canonify("ascii @&_ string")
+    self.assertEqual(result, "ascii_____string")
+
+    # Non-ASCII input: canonify works on the UTF-8 bytes, so each 'é' (two bytes) yields two '_', hence 'h__h__'.
+    # Python 2: python/ncf reads UTF-8 files and produces u'' strings, so a unicode literal is passed here.
+    if sys.version_info[0] == 2:
+      # unicode in source file -> interpreted as unicode with u'' -> correct iso in python string (ncf builder use case)
+      result = ncf_rudder.canonify(u'héhé')
+      self.assertEqual(result, 'h__h__')
+
+    # Python 3: plain '' literals are already unicode strings.
+    if sys.version_info[0] == 3:
+      # unicode in source file -> correct unicode in python string (ncf builder use case)
+      result = ncf_rudder.canonify('héhé')
+      self.assertEqual(result, "h__h__")
+
+
 if __name__ == '__main__':
   unittest.main()
diff --git a/tools/ncf_rudder.py b/tools/ncf_rudder.py
@@ -71,10 +71,18 @@ def canonify_expected_reports(expected_reports, dest):
 
     # Replace the second field with a canonified version of itself (a la CFEngine)
     fields = line.strip().split(";;")
-    regex = re.compile("[^a-zA-Z0-9_]", flags=re.UNICODE )
-    fields[1] = regex.sub("_", fields[1])
+    fields[1] = canonify(fields[1])
     dest_file.write(";;".join(fields) + "\n")
 
+def canonify(string):
+  # Canonify `string`: every character outside [a-zA-Z0-9_] becomes '_'. It should be a unicode string (i.e. u''),
+  # which is the case when it is read from a file opened with encoding="utf-8".
+  # To match CFEngine behaviour, the UTF-8 bytes must be treated as if they were ASCII characters (see #7195).
+  # Decoding as pure ASCII would raise an error in Python, but any 8-bit ASCII-compatible encoding works, since every byte above 127 becomes '_'; "iso-8859-1" is an arbitrary choice.
+  string = string.encode("utf-8").decode("iso-8859-1")
+  regex = re.compile("[^a-zA-Z0-9_]")
+  return regex.sub("_", string)
+
 
 # OTHER FUNCTIONS
 #################
@@ -303,10 +311,13 @@ def generate_rudder_reporting(technique):
     generic_method = generic_methods[method_name]
 
     key_value = method_call["args"][generic_method["class_parameter_id"]-1]
-    regex = re.compile("[^\$\{\}\w](?![^{}]+})|\$(?!{)", flags=re.UNICODE)
+    # This regex canonifies everything except ${...} variable references (a '$' not followed by '{' is canonified too)
+    regex = re.compile("[^\$\{\}a-zA-Z0-9_](?![^{}]+})|\$(?!{)")
+    # To match CFEngine behaviour, UTF-8 must be treated as if it were ASCII (see #7195).
+    # The string should be a unicode string (i.e. u''), which is the case when it is read from a file opened with encoding="utf-8".
+    key_value = key_value.encode("utf-8").decode("iso-8859-1") 
     key_value_canonified = regex.sub("_", key_value)
 
-
     class_prefix = generic_method["class_prefix"]+"_"+key_value_canonified
 
     # Always add an empty line for readability