# PyBo's Tokenizer

### 0. Running the tokenizer

In [None]:
from pybo import BoTokenizer

Instantiate the tokenizer with the 'POS' profile (see [profile documentation](this.file))

In [2]:
# Build the tokenizer with the 'POS' profile; running this cell prints
# "Loading Trie..." followed by the time the loading took.
tokenizer = BoTokenizer('POS')

Loading Trie...
Time: 2.4239799976348877


Given a random text in Tibetan language,

In [3]:
# Sample input used throughout: Tibetan text deliberately mixed with Latin
# letters, extra spaces, a doubled tsek and Tibetan digits.
input_str = '༆ ཤི་བཀྲ་ཤིས་  tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་༡༢༣ཀཀ། མཐའི་རྒྱ་མཚོར་གནས་པའི་ཉས་ཆུ་འཐུང་།། །།མཁའ།'

Let's see what information can be derived from it.

In [4]:
# Tokenize the sample text, then report the type of the returned container
# and the type of its first element.
tokens = tokenizer.tokenize(input_str)
container_type = type(tokens)
element_type = type(tokens[0])
print(f'The output is a {container_type}.\nThe constituting elements are {element_type}s.')

The output is a <class 'list'>.
The constituting elements are <class 'pybo.token.Token'>s.


Tokenizing without separating affixed particles is also possible:

In [5]:
# Second tokenization of the same input, this time keeping affixed particles
# attached to their host word instead of splitting them into separate tokens.
not_split = tokenizer.tokenize(input_str, split_affixes=False)

### 1. A first look

#### Non Tibetan tokens

First thing, I see there is non-Tibetan stuff in the middle of the input string. Let's see how I can detect it.

In [6]:
# Report every token typed 'non-bo': its content, its 1-based position in the
# token list, and where it sits in the input string (start offset and length).
for position, tok in enumerate(tokens, start=1):
    if tok.type != 'non-bo':
        continue
    print(f'"{tok.content}", token number {position}, is not Tibetan.')
    print(f'this starts at {tok.start}th character in the input and spans {tok.len} characters')

"tr ", token number 4, is not Tibetan.
this starts at 15th character in the input and spans 3 characters


#### Tokens that are not words

Is there any Tibetan punctuation?

In [7]:
# Report every token typed 'punct', with its 1-based position in the token list.
for position, tok in enumerate(tokens, start=1):
    if tok.type != 'punct':
        continue
    print(f'"{tok.content}", token number {position}, is a punctuation token.')

"༆ ", token number 1, is a punctuation token.
"། ", token number 6, is a punctuation token.
"། ", token number 11, is a punctuation token.
"།། །།", token number 22, is a punctuation token.
"།", token number 24, is a punctuation token.


How are the Tibetan digits treated?

In [8]:
# Lazily pick out the tokens typed 'num' (paired with their 1-based position)
# and report each one.
numerals = (
    (position, tok)
    for position, tok in enumerate(tokens, start=1)
    if tok.type == 'num'
)
for position, tok in numerals:
    print(f'"{tok.content}", token number {position}, is a numeral.')

"༡༢༣", token number 9, is a numeral.


#### Splitting affixed particles or not:

In [9]:
# Same stretch of input in both tokenizations: two tokens (host + particle)
# when affixes are split, a single token when they are kept together.
host, particle = tokens[11], tokens[12]
print(f'splitting them: {host.content}, {particle.content}')
merged = not_split[11]
print(f'keeping them together: {merged.content}')

splitting them: མཐ, འི་
keeping them together: མཐའི་


### 2. The attributes of tokens

Strictly speaking, a token is a word that has been correctly extracted from the input string, but our Token objects carry much more information, waiting to be exploited by NLP processing:

#### Token.content – the unmodified content straight from the input string

In [10]:
# Echo the raw input, then list each token's untouched content (1-based numbering).
header = f'"{input_str}"\n'
print(header)
for position, tok in enumerate(tokens, start=1):
    print(f'{position}.\t "{tok.content}"')

"༆ ཤི་བཀྲ་ཤིས་  tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་༡༢༣ཀཀ། མཐའི་རྒྱ་མཚོར་གནས་པའི་ཉས་ཆུ་འཐུང་།། །།མཁའ།"

1.	 "༆ "
2.	 "ཤི་"
3.	 "བཀྲ་ཤིས་  "
4.	 "tr "
5.	 "བདེ་་ལེ གས"
6.	 "། "
7.	 "བཀྲ་ཤིས་"
8.	 "བདེ་ལེགས་"
9.	 "༡༢༣"
10.	 "ཀཀ"
11.	 "། "
12.	 "མཐ"
13.	 "འི་"
14.	 "རྒྱ་མཚོ"
15.	 "ར་"
16.	 "གནས་པ"
17.	 "འི་"
18.	 "ཉ"
19.	 "ས་"
20.	 "ཆུ་"
21.	 "འཐུང་"
22.	 "།། །།"
23.	 "མཁའ"
24.	 "།"


#### Token.type – the basic types of tokens 

In [11]:
# Echo the raw input, then list each token's basic type next to its content.
header = f'"{input_str}"\n'
print(header)
for position, tok in enumerate(tokens, start=1):
    print(f'{position}.\t{tok.type}\t("{tok.content}")')

"༆ ཤི་བཀྲ་ཤིས་  tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་༡༢༣ཀཀ། མཐའི་རྒྱ་མཚོར་གནས་པའི་ཉས་ཆུ་འཐུང་།། །།མཁའ།"

1.	punct	("༆ ")
2.	syl	("ཤི་")
3.	syl	("བཀྲ་ཤིས་  ")
4.	non-bo	("tr ")
5.	syl	("བདེ་་ལེ གས")
6.	punct	("། ")
7.	syl	("བཀྲ་ཤིས་")
8.	syl	("བདེ་ལེགས་")
9.	num	("༡༢༣")
10.	syl	("ཀཀ")
11.	punct	("། ")
12.	syl	("མཐ")
13.	syl	("འི་")
14.	syl	("རྒྱ་མཚོ")
15.	syl	("ར་")
16.	syl	("གནས་པ")
17.	syl	("འི་")
18.	syl	("ཉ")
19.	syl	("ས་")
20.	syl	("ཆུ་")
21.	syl	("འཐུང་")
22.	punct	("།། །།")
23.	syl	("མཁའ")
24.	punct	("།")


 - syl: contains valid Tibetan syllables
 - num: Tibetan numerals
 - punct: Tibetan punctuation
 - non-bo: non-Tibetan content

#### Token.pos – Part of Speech

In [12]:
# Echo the raw input, then list each token's part-of-speech value next to its content.
header = f'"{input_str}"\n'
print(header)
for position, tok in enumerate(tokens, start=1):
    print(f'{position}.\t{tok.pos}\t\t("{tok.content}")')

"༆ ཤི་བཀྲ་ཤིས་  tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་༡༢༣ཀཀ། མཐའི་རྒྱ་མཚོར་གནས་པའི་ཉས་ཆུ་འཐུང་།། །།མཁའ།"

1.	punct		("༆ ")
2.	VERB		("ཤི་")
3.	NOUN		("བཀྲ་ཤིས་  ")
4.	non-bo		("tr ")
5.	NOUN		("བདེ་་ལེ གས")
6.	punct		("། ")
7.	NOUN		("བཀྲ་ཤིས་")
8.	NOUN		("བདེ་ལེགས་")
9.	num		("༡༢༣")
10.	non-word		("ཀཀ")
11.	punct		("། ")
12.	NOUN		("མཐ")
13.	PART		("འི་")
14.	NOUN		("རྒྱ་མཚོ")
15.	PART		("ར་")
16.	VERB		("གནས་པ")
17.	PART		("འི་")
18.	NOUN		("ཉ")
19.	PART		("ས་")
20.	NOUN		("ཆུ་")
21.	oov		("འཐུང་")
22.	punct		("།། །།")
23.	NOUN		("མཁའ")
24.	punct		("།")


 - NOUN:    Tibetan noun
 - VERB:    Tibetan verb
 - PART:    case particle (affixed or not)
 - oov:     Tibetan word for which no POS was found
 - non-word:A sequence of Tibetan letters that does not appear in our list of words
 
 
 - punct:   Tibetan punctuation
 - num:     Tibetan numerals
 - non-bo:  non-Tibetan characters (spaces have a special treatment)

#### Token.tag – Token.pos augmented with morphological information on affixed particles

In [13]:
# Echo the raw input, then list each token's tag next to its content.
header = f'"{input_str}"\n'
print(header)
for position, tok in enumerate(tokens, start=1):
    print(f'{position}.\t{tok.tag}\t\t("{tok.content}")')

"༆ ཤི་བཀྲ་ཤིས་  tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་༡༢༣ཀཀ། མཐའི་རྒྱ་མཚོར་གནས་པའི་ཉས་ཆུ་འཐུང་།། །།མཁའ།"

1.	punct		("༆ ")
2.	VERBᛃᛃᛃ		("ཤི་")
3.	NOUNᛃᛃᛃ		("བཀྲ་ཤིས་  ")
4.	non-bo		("tr ")
5.	NOUNᛃᛃᛃ		("བདེ་་ལེ གས")
6.	punct		("། ")
7.	NOUNᛃᛃᛃ		("བཀྲ་ཤིས་")
8.	NOUNᛃᛃᛃ		("བདེ་ལེགས་")
9.	num		("༡༢༣")
10.	non-word		("ཀཀ")
11.	punct		("། ")
12.	NOUNᛃᛃᛃ		("མཐ")
13.	PARTᛃgiᛃᛃ		("འི་")
14.	NOUNᛃᛃᛃ		("རྒྱ་མཚོ")
15.	PARTᛃlaᛃᛃ		("ར་")
16.	VERBᛃᛃᛃ		("གནས་པ")
17.	PARTᛃgiᛃᛃ		("འི་")
18.	NOUNᛃᛃᛃ		("ཉ")
19.	PARTᛃgisᛃᛃ		("ས་")
20.	NOUNᛃᛃᛃ		("ཆུ་")
21.	oov		("འཐུང་")
22.	punct		("།། །།")
23.	NOUNᛃᛃᛃ		("མཁའ")
24.	punct		("།")


 - la: the ladon(ལ་དོན་) particle was affixed to the previous token
 - gi: the dreldra(འབྲེལ་སྒྲ་) particle was affixed
 - gis: the jedra(བྱེད་སྒྲ་) particle was affixed

note: The runic character "ᛃ" is used as a separator because we assume it will never appear alongside Tibetan text.

#### Token.lemma – The current word in its canonical form

In [14]:
# Echo the raw input, then list each token's lemma (quoted) next to its content.
header = f'"{input_str}"\n'
print(header)
for position, tok in enumerate(tokens, start=1):
    print(f'{position}.\t"{tok.lemma}"\t\t("{tok.content}")')

"༆ ཤི་བཀྲ་ཤིས་  tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་༡༢༣ཀཀ། མཐའི་རྒྱ་མཚོར་གནས་པའི་ཉས་ཆུ་འཐུང་།། །།མཁའ།"

1.	""		("༆ ")
2.	"ཤི་"		("ཤི་")
3.	"བཀྲ་ཤིས་"		("བཀྲ་ཤིས་  ")
4.	""		("tr ")
5.	"བདེ་ལེགས་"		("བདེ་་ལེ གས")
6.	""		("། ")
7.	"བཀྲ་ཤིས་"		("བཀྲ་ཤིས་")
8.	"བདེ་ལེགས་"		("བདེ་ལེགས་")
9.	""		("༡༢༣")
10.	"ཀཀ་"		("ཀཀ")
11.	""		("། ")
12.	"མཐའ་"		("མཐ")
13.	"གི་"		("འི་")
14.	"རྒྱ་མཚོ་"		("རྒྱ་མཚོ")
15.	"ལ་"		("ར་")
16.	"གནས་པ་"		("གནས་པ")
17.	"གི་"		("འི་")
18.	"ཉ་"		("ཉ")
19.	"གིས་"		("ས་")
20.	"ཆུ་"		("ཆུ་")
21.	"འཐུང་"		("འཐུང་")
22.	""		("།། །།")
23.	"མཁའ་"		("མཁའ")
24.	""		("།")


Only tokens of type `syl` have some content in this attribute. The other ones have an empty string.


Token 15 is a ladon(ལ་དོན་) particle that is affixed, so its lemma is the canonical form of this case particle: "ལ་". The same goes for tokens 13 and 17 (lemma "གི་") and token 19 (lemma "གིས་").


The final འ is reconstructed where necessary (token 12).


When we have a lemma for a given word in our list, we provide it, such as for token 5, but otherwise, we chose to give the normalized version of the content, such as in token 10.

#### Token.cleaned_content – the normalized form of Token.content

In [15]:
# Echo the raw input, then list each token's cleaned_content (quoted) next to
# its original content.
header = f'"{input_str}"\n'
print(header)
for position, tok in enumerate(tokens, start=1):
    print(f'{position}.\t"{tok.cleaned_content}"\t\t("{tok.content}")')

"༆ ཤི་བཀྲ་ཤིས་  tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་༡༢༣ཀཀ། མཐའི་རྒྱ་མཚོར་གནས་པའི་ཉས་ཆུ་འཐུང་།། །།མཁའ།"

1.	""		("༆ ")
2.	"ཤི་"		("ཤི་")
3.	"བཀྲ་ཤིས་"		("བཀྲ་ཤིས་  ")
4.	""		("tr ")
5.	"བདེ་ལེགས་"		("བདེ་་ལེ གས")
6.	""		("། ")
7.	"བཀྲ་ཤིས་"		("བཀྲ་ཤིས་")
8.	"བདེ་ལེགས་"		("བདེ་ལེགས་")
9.	""		("༡༢༣")
10.	"ཀཀ་"		("ཀཀ")
11.	""		("། ")
12.	"མཐ"		("མཐ")
13.	"འི་"		("འི་")
14.	"རྒྱ་མཚོ"		("རྒྱ་མཚོ")
15.	"ར་"		("ར་")
16.	"གནས་པ"		("གནས་པ")
17.	"འི་"		("འི་")
18.	"ཉ"		("ཉ")
19.	"ས་"		("ས་")
20.	"ཆུ་"		("ཆུ་")
21.	"འཐུང་"		("འཐུང་")
22.	""		("།། །།")
23.	"མཁའ་"		("མཁའ")
24.	""		("།")


1. The different Unicode spaces and tabs are removed, 
2. Non-breaking tseks are replaced with regular tseks.
3. Tseks are added at the end of every syllable (not at the end of every token)


See for example in token 5 that the double tsek is reduced and that a tsek is added at the end of the second syllable.
On the other hand, tokens 12, 14, 16 and 18 don't end with a tsek, since their last syllable continues into the following token.

note: as of now, the normalization of punctuation is not implemented.

#### Token.unaffixed_word – Token.cleaned_content augmented with the འ reinsertion

In [16]:
# Echo the raw input, then list each token's unaffixed_word (quoted) next to
# its original content.
header = f'"{input_str}"\n'
print(header)
for position, tok in enumerate(tokens, start=1):
    print(f'{position}.\t"{tok.unaffixed_word}"\t\t("{tok.content}")')

"༆ ཤི་བཀྲ་ཤིས་  tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་༡༢༣ཀཀ། མཐའི་རྒྱ་མཚོར་གནས་པའི་ཉས་ཆུ་འཐུང་།། །།མཁའ།"

1.	""		("༆ ")
2.	"ཤི་"		("ཤི་")
3.	"བཀྲ་ཤིས་"		("བཀྲ་ཤིས་  ")
4.	""		("tr ")
5.	"བདེ་ལེགས་"		("བདེ་་ལེ གས")
6.	""		("། ")
7.	"བཀྲ་ཤིས་"		("བཀྲ་ཤིས་")
8.	"བདེ་ལེགས་"		("བདེ་ལེགས་")
9.	""		("༡༢༣")
10.	"ཀཀ་"		("ཀཀ")
11.	""		("། ")
12.	"མཐའ་"		("མཐ")
13.	"འི་"		("འི་")
14.	"རྒྱ་མཚོ་"		("རྒྱ་མཚོ")
15.	"ར་"		("ར་")
16.	"གནས་པ་"		("གནས་པ")
17.	"འི་"		("འི་")
18.	"ཉ་"		("ཉ")
19.	"ས་"		("ས་")
20.	"ཆུ་"		("ཆུ་")
21.	"འཐུང་"		("འཐུང་")
22.	""		("།། །།")
23.	"མཁའ་"		("མཁའ")
24.	""		("།")


When tokens contain an affixed particle, the unaffixed form is reconstructed.
འ is reinserted in token 12, but not in token 16, nor 14, nor 18.


This also functions when we choose not to separate affixed particles from their hosting word:

In [17]:
# Same listing as for the split tokens, but over the tokenization that keeps
# affixed particles attached to their host word.
for position, tok in enumerate(not_split, start=1):
    print(f'{position}.\t"{tok.unaffixed_word}"\t\t("{tok.content}")')

1.	""		("༆ ")
2.	"ཤི་"		("ཤི་")
3.	"བཀྲ་ཤིས་"		("བཀྲ་ཤིས་  ")
4.	""		("tr ")
5.	"བདེ་ལེགས་"		("བདེ་་ལེ གས")
6.	""		("། ")
7.	"བཀྲ་ཤིས་"		("བཀྲ་ཤིས་")
8.	"བདེ་ལེགས་"		("བདེ་ལེགས་")
9.	""		("༡༢༣")
10.	"ཀཀ་"		("ཀཀ")
11.	""		("། ")
12.	"མཐའ་"		("མཐའི་")
13.	"རྒྱ་མཚོ་"		("རྒྱ་མཚོར་")
14.	"གནས་པ་"		("གནས་པའི་")
15.	"ཉ་"		("ཉས་")
16.	"ཆུ་"		("ཆུ་")
17.	"འཐུང་"		("འཐུང་")
18.	""		("།། །།")
19.	"མཁའ་"		("མཁའ")
20.	""		("།")


#### Token.affix & Token.affixed – Host-word and its affixed particle

In [18]:
# Echo the raw input, then label each token as 'Affix' (token.affix is truthy),
# 'Host' (token.affixed is truthy) or leave the label column blank.
header = f'"{input_str}"\n'
print(header)
for position, tok in enumerate(tokens, start=1):
    if tok.affix:
        line = f'{position}.\tAffix\t\t("{tok.content}")'
    elif tok.affixed:
        line = f'{position}.\tHost\t\t("{tok.content}")'
    else:
        line = f'{position}.\t\t\t("{tok.content}")'
    print(line)

"༆ ཤི་བཀྲ་ཤིས་  tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་༡༢༣ཀཀ། མཐའི་རྒྱ་མཚོར་གནས་པའི་ཉས་ཆུ་འཐུང་།། །།མཁའ།"

1.			("༆ ")
2.			("ཤི་")
3.			("བཀྲ་ཤིས་  ")
4.			("tr ")
5.			("བདེ་་ལེ གས")
6.			("། ")
7.			("བཀྲ་ཤིས་")
8.			("བདེ་ལེགས་")
9.			("༡༢༣")
10.			("ཀཀ")
11.			("། ")
12.	Host		("མཐ")
13.	Affix		("འི་")
14.	Host		("རྒྱ་མཚོ")
15.	Affix		("ར་")
16.	Host		("གནས་པ")
17.	Affix		("འི་")
18.	Host		("ཉ")
19.	Affix		("ས་")
20.			("ཆུ་")
21.			("འཐུང་")
22.			("།། །།")
23.			("མཁའ")
24.			("།")


#### Token.aa_word – Signals words that end with འ 

In [19]:
# Echo the raw input, then mark with 'True' every token whose aa_word flag is
# truthy; other tokens get a blank column.
header = f'"{input_str}"\n'
print(header)
for position, tok in enumerate(tokens, start=1):
    line = (
        f'{position}.\tTrue\t\t("{tok.content}")'
        if tok.aa_word
        else f'{position}.\t\t\t("{tok.content}")'
    )
    print(line)

"༆ ཤི་བཀྲ་ཤིས་  tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་༡༢༣ཀཀ། མཐའི་རྒྱ་མཚོར་གནས་པའི་ཉས་ཆུ་འཐུང་།། །།མཁའ།"

1.			("༆ ")
2.			("ཤི་")
3.			("བཀྲ་ཤིས་  ")
4.			("tr ")
5.			("བདེ་་ལེ གས")
6.			("། ")
7.			("བཀྲ་ཤིས་")
8.			("བདེ་ལེགས་")
9.			("༡༢༣")
10.			("ཀཀ")
11.			("། ")
12.	True		("མཐ")
13.			("འི་")
14.			("རྒྱ་མཚོ")
15.			("ར་")
16.			("གནས་པ")
17.			("འི་")
18.			("ཉ")
19.			("ས་")
20.			("ཆུ་")
21.			("འཐུང་")
22.			("།། །།")
23.			("མཁའ")
24.			("།")


note: This is currently not detected in words not containing affixed particles, such as token 23.

#### Token.syls – Individual syllables of every token

In [20]:
# Echo the raw input, then list the syls attribute of every token from the
# non-split tokenization next to the token's content.
header = f'"{input_str}"\n'
print(header)
for position, tok in enumerate(not_split, start=1):
    print(f'{position}.\t{tok.syls}\t\t\t\t("{tok.content}")')

"༆ ཤི་བཀྲ་ཤིས་  tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་༡༢༣ཀཀ། མཐའི་རྒྱ་མཚོར་གནས་པའི་ཉས་ཆུ་འཐུང་།། །།མཁའ།"

1.	None				("༆ ")
2.	[[0, 1]]				("ཤི་")
3.	[[0, 1, 2], [4, 5, 6]]				("བཀྲ་ཤིས་  ")
4.	None				("tr ")
5.	[[0, 1, 2], [5, 6, 8, 9]]				("བདེ་་ལེ གས")
6.	None				("། ")
7.	[[0, 1, 2], [4, 5, 6]]				("བཀྲ་ཤིས་")
8.	[[0, 1, 2], [4, 5, 6, 7]]				("བདེ་ལེགས་")
9.	None				("༡༢༣")
10.	[[0, 1]]				("ཀཀ")
11.	None				("། ")
12.	[[0, 1, 2, 3]]				("མཐའི་")
13.	[[0, 1, 2], [4, 5, 6, 7]]				("རྒྱ་མཚོར་")
14.	[[0, 1, 2], [4, 5, 6]]				("གནས་པའི་")
15.	[[0, 1]]				("ཉས་")
16.	[[0, 1]]				("ཆུ་")
17.	[[0, 1, 2, 3]]				("འཐུང་")
18.	None				("།། །།")
19.	[[0, 1, 2]]				("མཁའ")
20.	None				("།")


Tokens containing no syllable have "None" as value for this attribute.


For the others, every syllable is represented as a list containing indices.

The indices are relative to the beginning of the current token (Token.start attribute)
Each index corresponds to a letter of the syllable (spaces and tseks are omitted).


Here is how we can make use of them to get a cleaned syllable using this attribute and the original string (input_str):

In [21]:
# Rebuild a cleaned form of each token from its syllable indices: look every
# index up in input_str (offset by the token's start), glue the characters of
# each syllable together, then join the syllables with a tsek and append a
# closing tsek. Tokens without syllables only get their number printed.
for position, tok in enumerate(not_split, start=1):
    if not tok.syls:
        print(f'{position}.')
        continue
    chars_per_syl = [[input_str[tok.start + idx] for idx in syl] for syl in tok.syls]
    syl_strings = [''.join(chars) for chars in chars_per_syl]
    rebuilt = '་'.join(syl_strings) + '་'
    print(f'{position}.\t{rebuilt}\t\t<- {syl_strings}\t\t<- {chars_per_syl}')

1.
2.	ཤི་		<- ['ཤི']		<- [['ཤ', 'ི']]
3.	བཀྲ་ཤིས་		<- ['བཀྲ', 'ཤིས']		<- [['བ', 'ཀ', 'ྲ'], ['ཤ', 'ི', 'ས']]
4.
5.	བདེ་ལེགས་		<- ['བདེ', 'ལེགས']		<- [['བ', 'ད', 'ེ'], ['ལ', 'ེ', 'ག', 'ས']]
6.
7.	བཀྲ་ཤིས་		<- ['བཀྲ', 'ཤིས']		<- [['བ', 'ཀ', 'ྲ'], ['ཤ', 'ི', 'ས']]
8.	བདེ་ལེགས་		<- ['བདེ', 'ལེགས']		<- [['བ', 'ད', 'ེ'], ['ལ', 'ེ', 'ག', 'ས']]
9.
10.	ཀཀ་		<- ['ཀཀ']		<- [['ཀ', 'ཀ']]
11.
12.	མཐའི་		<- ['མཐའི']		<- [['མ', 'ཐ', 'འ', 'ི']]
13.	རྒྱ་མཚོར་		<- ['རྒྱ', 'མཚོར']		<- [['ར', 'ྒ', 'ྱ'], ['མ', 'ཚ', 'ོ', 'ར']]
14.	གནས་པའི་		<- ['གནས', 'པའི']		<- [['ག', 'ན', 'ས'], ['པ', 'འ', 'ི']]
15.	ཉས་		<- ['ཉས']		<- [['ཉ', 'ས']]
16.	ཆུ་		<- ['ཆུ']		<- [['ཆ', 'ུ']]
17.	འཐུང་		<- ['འཐུང']		<- [['འ', 'ཐ', 'ུ', 'ང']]
18.
19.	མཁའ་		<- ['མཁའ']		<- [['མ', 'ཁ', 'འ']]
20.


#### Token.char_types – General categorization of characters

In [22]:
# For every token, print its 1-based number followed by each character of its
# content paired with the matching entry of char_types, all on one line.
for position, tok in enumerate(tokens, start=1):
    print(f'{position}.', end=' ')
    pairs = ''.join(
        f"'{tok.content[idx]}':{kind}" + ', '
        for idx, kind in enumerate(tok.char_types)
    )
    print(pairs)

1. '༆':punct, ' ':space, 
2. 'ཤ':cons, 'ི':vow, '་':tsek, 
3. 'བ':cons, 'ཀ':cons, 'ྲ':sub-cons, '་':tsek, 'ཤ':cons, 'ི':vow, 'ས':cons, '་':tsek, ' ':space, ' ':space, 
4. 't':other, 'r':other, ' ':space, 
5. 'བ':cons, 'ད':cons, 'ེ':vow, '་':tsek, '་':tsek, 'ལ':cons, 'ེ':vow, ' ':space, 'ག':cons, 'ས':cons, 
6. '།':punct, ' ':space, 
7. 'བ':cons, 'ཀ':cons, 'ྲ':sub-cons, '་':tsek, 'ཤ':cons, 'ི':vow, 'ས':cons, '་':tsek, 
8. 'བ':cons, 'ད':cons, 'ེ':vow, '་':tsek, 'ལ':cons, 'ེ':vow, 'ག':cons, 'ས':cons, '་':tsek, 
9. '༡':num, '༢':num, '༣':num, 
10. 'ཀ':cons, 'ཀ':cons, 
11. '།':punct, ' ':space, 
12. 'མ':cons, 'ཐ':cons, 
13. 'འ':cons, 'ི':vow, '་':tsek, 
14. 'ར':cons, 'ྒ':sub-cons, 'ྱ':sub-cons, '་':tsek, 'མ':cons, 'ཚ':cons, 'ོ':vow, 
15. 'ར':cons, '་':tsek, 
16. 'ག':cons, 'ན':cons, 'ས':cons, '་':tsek, 'པ':cons, 
17. 'འ':cons, 'ི':vow, '་':tsek, 
18. 'ཉ':cons, 
19. 'ས':cons, '་':tsek, 
20. 'ཆ':cons, 'ུ':vow, '་':tsek, 
21. 'འ':cons, 'ཐ':cons, 'ུ':vow, 'ང':cons, '་':tsek, 
22. '།':punct, '།':pu