# Regex and Tokenization

A great website for learning and experimenting with regex is https://regexr.com/

In [None]:
import re

In [None]:
# search for a string
re.search(r"ha", "aloha")

In [None]:
# search for a single character matching one of the choices
re.search(r"[Aa]", "aloha")

In [None]:
# search for a single character in a range
re.search(r"[g-i]", "aloha")

In [None]:
# search for a single character NOT in a range
re.search(r"[^a-g]", "aloha")

In [None]:
# . matches any character, so 5 dots is 5 characters
re.search(r".....", "aloha")

In [None]:
# + searches for 1 or more
re.search(r"h+", "aloha")

In [None]:
# * searches for 0 or more
re.search(r"z*", "aloha")

In [None]:
# findall returns a list of matches
for m in re.findall(r"[a-h]", "aloha"):
    print(m)

In [None]:
# basic email validator
# (can you come up with an email address where it fails?)
if re.match(r"\w+@[a-z]+\.[a-z]", "username@domain.edu"):
    print("valid email!")
else:
    print("invalid")

In [None]:
# capture groups
text = "this and that"
m = re.match(r"(.+) and (.+)", text)
print(m.group(0))  # the whole match
print(m.group(1))  # whatever is in the first set of ()
print(m.group(2))  # whatever is in the second set of ()

In [None]:
# ? matches 0 or 1
print(re.search(r"aloha?", "aloha"))
print(re.search(r"aloha?", "aloh"))

In [None]:
# {n} matches exactly n repetitions (use {n,} for n or more, {n,m} for a range)
re.search(r"[aeiou]{2}", "aloha, pehea ʻoe?")

In [None]:
# * and + are greedy: they match as much text as they possibly can,
# so .+ keeps going until it hits the last \b (end of the string)
print(re.search(r"I like .+\b", "I like fish and chips"))

# to make it non-greedy, use .+?\b (one or more of any character, stopping at the first \b)
print(re.search(r"I like .+?\b", "I like fish and chips"))

In [None]:
# extracting things from text
text = "I like fish, she likes meat, and he liked NLP"
for x in re.findall(r"\blike[ds]? (.+?)\b", text):
    print(x)

In [None]:
# Basic lemmatizing (strictly speaking, chopping off suffixes like this is stemming)
# Try with other suffixes: -ed, -ing, -tion, -er
# Why was lemmatization useful again? 💡
text = "she loves cats and playing playstations"
text = re.sub(r"s\b", "", text)
print(text)

In [None]:
# It's not too hard to write a basic markdown to HTML converter
# (a fully compliant one is more challenging though)
# TODO: write more regex subs to implement the function
def convert_markdown(text):
    """Convert a small subset of markdown to HTML, one line at a time.

    Currently handled:
      * ``# Heading`` at the start of a line -> ``<h1>Heading</h1>``
      * ``**bold**`` anywhere in a line      -> ``<b>bold</b>``

    All other markdown is passed through unchanged.

    Args:
        text: markdown source; lines are separated by "\\n".

    Returns:
        The converted text, with the original line breaks preserved.
    """
    converted = []
    for line in text.split("\n"):
        line = re.sub(r"^# (.+)", r"<h1>\1</h1>", line)
        # non-greedy so that two bold spans on one line stay separate
        line = re.sub(r"\*\*(.+?)\*\*", r"<b>\1</b>", line)
        converted.append(line)
    # re-join with newlines so neighbouring lines do not run together
    return "\n".join(converted)

md = """
# Title
## Subtitle
---
- bullet
- bullet2
*italic* **bold** `code` **bold2**
"""
convert_markdown(md)

## Tokenization

In [None]:
text = "Aloha, pehea ʻoe? Maikaʻi nō!"

# why is this not ideal?
text.split()

In [None]:
# What is wrong with this one?
for m in re.findall(r"\w+", text):
    print(m)

In [None]:
# Here is a better one, but still not perfect
for m in re.split(r"\b", text):
    print(m)

## Off-the-Shelf Tokenizers

Do a `conda install transformers tokenizers` first (and restart your notebook)

In [1]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")



In [3]:
# read the whole file inside a `with` block so the handle is closed right away
with open("sherlock.txt") as f:
    sherlock = f.read()
text = sherlock[1077:1145]  # try other sections of the text
text

'He never spoke of the softer passions, save with a gibe\nand a sneer.'

In [4]:
output = tokenizer(text)
tokens = tokenizer.tokenize(text)

# print each token string next to its id
# (named token_id so the builtin id() is not shadowed for the rest of the notebook)
for tok, token_id in zip(tokens, output.input_ids):
    print(tok, token_id, sep='\t')

He	1544
Ġnever	1239
Ġspoke	5158
Ġof	286
Ġthe	262
Ġsofter	32359
Ġpassions	30477
,	11
Ġsave	3613
Ġwith	351
Ġa	257
Ġg	308
ibe	32438
Ċ	198
and	392
Ġa	257
Ġsne	10505
er	263
.	13


## Train your own tokenizer

In [2]:
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

In [13]:
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = Whitespace()  # try commenting out this line

trainer = BpeTrainer()
# pass the trainer explicitly; otherwise the object built above is never used
# and any options later set on it would be silently ignored
tokenizer.train(files=["ttc.txt"], trainer=trainer)






In [14]:
output = tokenizer.encode(text)

In [15]:
output.ids

[217,
 395,
 1209,
 91,
 77,
 8285,
 7936,
 4,
 1649,
 121,
 42,
 971,
 101,
 85,
 42,
 5098,
 83,
 6]

In [16]:
output.tokens

['He',
 'never',
 'spoke',
 'of',
 'the',
 'softer',
 'passions',
 ',',
 'save',
 'with',
 'a',
 'gi',
 'be',
 'and',
 'a',
 'sne',
 'er',
 '.']

In [17]:
# print each token next to its id (token_id avoids shadowing the builtin id())
for tok, token_id in zip(output.tokens, output.ids):
    print(tok, token_id, sep='\t')

He	217
never	395
spoke	1209
of	91
the	77
softer	8285
passions	7936
,	4
save	1649
with	121
a	42
gi	971
be	101
and	85
a	42
sne	5098
er	83
.	6


Feel free to try writing your own BPE tokenizer from scratch! It's not too complicated.