# Adding tokenization and normalization to collation

## Starting point

In [1]:
from collatex import *
# Build a collation from three plain-text witnesses and print the
# resulting alignment table.
collation = Collation()
for siglum, text in (
    ("A", "The quick brown fox jumped over the lazy dog."),
    ("B", "The brown fox jumped over the dog."),
    ("C", "The bad fox jumped over the lazy dog."),
):
    collation.add_plain_witness(siglum, text)

table = collate(collation)
print(table)

+---+-----+-------+-------+---------------------+------+------+
| A | The | quick | brown | fox jumped over the | lazy | dog. |
| B | The | -     | brown | fox jumped over the | -    | dog. |
| C | The | bad   | -     | fox jumped over the | lazy | dog. |
+---+-----+-------+-------+---------------------+------+------+


Separate the text of the witness from its inclusion in the collation.

In [27]:
# Hold each witness's text in its own variable, separate from the step
# that registers it with the collation.
A_content = "The quick brown fox jumped over the lazy dog."
B_content = "The brown fox jumped over the dog."
C_content = "The bad fox jumped over the lazy dog."

collation = Collation()
for siglum, content in (("A", A_content), ("B", B_content), ("C", C_content)):
    collation.add_plain_witness(siglum, content)

table = collate(collation)
print(table)

+---+-----+-------+-------+---------------------+------+------+
| A | The | quick | brown | fox jumped over the | lazy | dog. |
| B | The | -     | brown | fox jumped over the | -    | dog. |
| C | The | bad   | -     | fox jumped over the | lazy | dog. |
+---+-----+-------+-------+---------------------+------+------+


Use functions to tokenize the witness text. Start with just one witness, and verify the result by outputting JSON.

In [28]:
import re

def tokenize(input):
    """Split a witness string into a list of token dictionaries.

    A token is a run of non-whitespace characters together with any
    whitespace that immediately follows it. Whitespace before the first
    token is not part of any token. An empty string yields an empty list.
    """
    # Raw string: '\S' and '\s' are regex escapes, not Python string escapes.
    return [create_token(token) for token in re.findall(r'\S+\s*', input)]

def create_token(input):
    """Wrap one token string in a dictionary under the key "t"."""
    return {"t": input}

# NOTE: `collation` is created here but nothing below in this cell reads it;
# the witnesses are assembled as a plain dictionary instead.
collation = Collation()
A_content = "The quick brown fox jumped over the lazy dog."
B_content = "The brown fox jumped over the dog."
C_content = "The bad fox jumped over the lazy dog."

# Only witness A is tokenized at this stage.
witness_list = [{"id": "A", "tokens": tokenize(A_content)}]

json_input = {"witnesses": witness_list}
print(json_input)

{'witnesses': [{'id': 'A', 'tokens': [{'t': 'The '}, {'t': 'quick '}, {'t': 'brown '}, {'t': 'fox '}, {'t': 'jumped '}, {'t': 'over '}, {'t': 'the '}, {'t': 'lazy '}, {'t': 'dog.'}]}]}


Add simple normalization. This won’t affect the collation output, but we can verify that it’s working.

In [29]:
import re

def normalize(input):
    """Return the lower-cased form of a token string.

    Nothing else is changed: punctuation and any trailing whitespace the
    token carries are kept as they are.
    """
    return input.lower()

def tokenize(input):
    """Split a witness string into a list of token dictionaries.

    A token is a run of non-whitespace characters together with any
    whitespace that immediately follows it. Whitespace before the first
    token is not part of any token. An empty string yields an empty list.
    """
    # Raw string: '\S' and '\s' are regex escapes, not Python string escapes.
    return [create_token(token) for token in re.findall(r'\S+\s*', input)]

def create_token(input):
    """Build a token dictionary: "t" is the token as written, "n" its normalized form."""
    return {"t": input, "n": normalize(input)}

# NOTE: `collation` is created here but nothing below in this cell reads it;
# collate() is given the JSON-style dictionary instead.
collation = Collation()
A_content = "The quick brown fox jumped over the lazy dog."
B_content = "The brown fox jumped over the dog."
C_content = "The bad fox jumped over the lazy dog."

# One entry per witness: its siglum plus its tokenized text.
witness_list = [
    {"id": siglum, "tokens": tokenize(content)}
    for siglum, content in (("A", A_content), ("B", B_content), ("C", C_content))
]

json_input = {"witnesses": witness_list}
print(json_input)
table = collate(json_input)
print(table)

{'witnesses': [{'id': 'A', 'tokens': [{'t': 'The ', 'n': 'the '}, {'t': 'quick ', 'n': 'quick '}, {'t': 'brown ', 'n': 'brown '}, {'t': 'fox ', 'n': 'fox '}, {'t': 'jumped ', 'n': 'jumped '}, {'t': 'over ', 'n': 'over '}, {'t': 'the ', 'n': 'the '}, {'t': 'lazy ', 'n': 'lazy '}, {'t': 'dog.', 'n': 'dog.'}]}, {'id': 'B', 'tokens': [{'t': 'The ', 'n': 'the '}, {'t': 'brown ', 'n': 'brown '}, {'t': 'fox ', 'n': 'fox '}, {'t': 'jumped ', 'n': 'jumped '}, {'t': 'over ', 'n': 'over '}, {'t': 'the ', 'n': 'the '}, {'t': 'dog.', 'n': 'dog.'}]}, {'id': 'C', 'tokens': [{'t': 'The ', 'n': 'the '}, {'t': 'bad ', 'n': 'bad '}, {'t': 'fox ', 'n': 'fox '}, {'t': 'jumped ', 'n': 'jumped '}, {'t': 'over ', 'n': 'over '}, {'t': 'the ', 'n': 'the '}, {'t': 'lazy ', 'n': 'lazy '}, {'t': 'dog.', 'n': 'dog.'}]}]}
+---+-----+-------+-------+---------------------+------+------+
| A | The | quick | brown | fox jumped over the | lazy | dog. |
| B | The | -     | brown | fox jumped over the | -    | dog. |
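| C | The | bad   | -     | fox jumped over the | lazy | dog. |
+---+-----+-------+-------+---------------------+------+------+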

Change the text to create a more complex example.

In [30]:
import re

def normalize(input):
    """Return the lower-cased form of a token string.

    Nothing else is changed: punctuation and any trailing whitespace the
    token carries are kept as they are.
    """
    return input.lower()

def tokenize(input):
    """Split a witness string into a list of token dictionaries.

    A token is a run of non-whitespace characters together with any
    whitespace that immediately follows it. Whitespace before the first
    token is not part of any token. An empty string yields an empty list.
    """
    # Raw string: '\S' and '\s' are regex escapes, not Python string escapes.
    return [create_token(token) for token in re.findall(r'\S+\s*', input)]

def create_token(input):
    """Build a token dictionary: "t" is the token as written, "n" its normalized form."""
    return {"t": input, "n": normalize(input)}

# NOTE: `collation` is created here but nothing below in this cell reads it;
# collate() is given the JSON-style dictionary instead.
collation = Collation()
A_content = "Look, a gray koala!"
B_content = "Look, a big grey koala!"
C_content = "Look, a big wombat!"

# One entry per witness: its siglum plus its tokenized text.
witness_list = [
    {"id": siglum, "tokens": tokenize(content)}
    for siglum, content in (("A", A_content), ("B", B_content), ("C", C_content))
]

json_input = {"witnesses": witness_list}
# print(json_input)  # uncomment to inspect the tokenized input
table = collate(json_input)
print(table)

+---+---------+------+---------+--------+
| A | Look, a | gray | -       | koala! |
| B | Look, a | big  | grey    | koala! |
| C | Look, a | big  | wombat! | -      |
+---+---------+------+---------+--------+


Enhance normalization to recognize that all animals are alike. (This introduces possible complications, which can be addressed through further enhancements.)

In [31]:
import re
import string

def normalize(input):
    """Return the form of a token that should be compared during alignment.

    ASCII punctuation and surrounding whitespace are removed and the result
    is lower-cased. If what remains is one of the known animal names, the
    shared placeholder 'ANIMAL' is returned instead, so that different
    animals normalize to the same value.
    """
    # str.translate deletes every character in string.punctuation, including
    # the backslash. A hand-built '[' + string.punctuation + ']' class misses
    # it, because the '\]' inside is read as an escaped bracket.
    without_punct = input.translate(str.maketrans('', '', string.punctuation))
    # Tokens arrive with their original case and trailing whitespace, so
    # clean both up *before* the lookup; otherwise 'koala ' and 'Koala'
    # would not be recognized as animals.
    word = without_punct.strip().lower()
    animals = ('koala', 'wombat')
    return 'ANIMAL' if word in animals else word

def tokenize(input):
    """Split a witness string into a list of token dictionaries.

    A token is a run of non-whitespace characters together with any
    whitespace that immediately follows it. Whitespace before the first
    token is not part of any token. An empty string yields an empty list.
    """
    # Raw string: '\S' and '\s' are regex escapes, not Python string escapes.
    return [create_token(token) for token in re.findall(r'\S+\s*', input)]

def create_token(input):
    """Build a token dictionary: "t" is the token as written, "n" its normalized form."""
    return {"t": input, "n": normalize(input)}

# NOTE: `collation` is created here but nothing below in this cell reads it;
# collate() is given the JSON-style dictionary instead.
collation = Collation()
A_content = "Look, a gray koala!"
B_content = "Look, a big grey koala!"
C_content = "Look, a big wombat!"

# One entry per witness: its siglum plus its tokenized text.
witness_list = [
    {"id": siglum, "tokens": tokenize(content)}
    for siglum, content in (("A", A_content), ("B", B_content), ("C", C_content))
]

json_input = {"witnesses": witness_list}
# print(json_input)  # uncomment to inspect the tokenized input
table = collate(json_input)
print(table)

+---+---------+------+------+---------+
| A | Look, a | gray | -    | koala!  |
| B | Look, a | big  | grey | koala!  |
| C | Look, a | big  | -    | wombat! |
+---+---------+------+------+---------+


The animals are now aligned, but the colors aren’t. We can address that through near matching, which CollateX allows only when segmentation is turned off:

In [32]:
import re
import string

def normalize(input):
    """Return the form of a token that should be compared during alignment.

    ASCII punctuation and surrounding whitespace are removed and the result
    is lower-cased. If what remains is one of the known animal names, the
    shared placeholder 'ANIMAL' is returned instead, so that different
    animals normalize to the same value.
    """
    # str.translate deletes every character in string.punctuation, including
    # the backslash. A hand-built '[' + string.punctuation + ']' class misses
    # it, because the '\]' inside is read as an escaped bracket.
    without_punct = input.translate(str.maketrans('', '', string.punctuation))
    # Tokens arrive with their original case and trailing whitespace, so
    # clean both up *before* the lookup; otherwise 'koala ' and 'Koala'
    # would not be recognized as animals.
    word = without_punct.strip().lower()
    animals = ('koala', 'wombat')
    return 'ANIMAL' if word in animals else word

def tokenize(input):
    """Split a witness string into a list of token dictionaries.

    A token is a run of non-whitespace characters together with any
    whitespace that immediately follows it. Whitespace before the first
    token is not part of any token. An empty string yields an empty list.
    """
    # Raw string: '\S' and '\s' are regex escapes, not Python string escapes.
    return [create_token(token) for token in re.findall(r'\S+\s*', input)]

def create_token(input):
    """Build a token dictionary: "t" is the token as written, "n" its normalized form."""
    return {"t": input, "n": normalize(input)}

# NOTE: `collation` is created here but nothing below in this cell reads it;
# collate() is given the JSON-style dictionary instead.
collation = Collation()
A_content = "Look, a gray koala!"
B_content = "Look, a big grey koala!"
C_content = "Look, a big wombat!"

# One entry per witness: its siglum plus its tokenized text.
witness_list = [
    {"id": siglum, "tokens": tokenize(content)}
    for siglum, content in (("A", A_content), ("B", B_content), ("C", C_content))
]

json_input = {"witnesses": witness_list}
# print(json_input)  # uncomment to inspect the tokenized input
# Near matching is switched on and segmentation is switched off together.
table = collate(json_input, segmentation=False, near_match=True)
print(table)

+---+-------+---+-----+------+---------+
| A | Look, | a | -   | gray | koala!  |
| B | Look, | a | big | grey | koala!  |
| C | Look, | a | big | -    | wombat! |
+---+-------+---+-----+------+---------+
