In [27]:
import re  # used by the tokenisation cells and tokenizer classes further down

# Read the whole corpus into memory as a single string.
# The codec name is spelled "utf-8"; the previous "utf=8" only worked because
# Python normalises punctuation in codec names during lookup.
with open("therapy.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Quick sanity check: corpus size and the first few characters.
print("Total number of characters: ", len(raw_text))
print(raw_text[:99])

Total number of characters:  752142


Chapter 1
INTRODUCTION TO
COGNITIVE BEHAVIOR THERAPY
A revolution in the field of mental health w


In [29]:
# Split on punctuation, double dashes and whitespace. The capturing group keeps
# the delimiters as tokens; filtering on str.strip drops empty and
# whitespace-only pieces.
preprocessed = list(filter(str.strip, re.split(r'([,.?_!"()\']|--|\s)', raw_text)))
print(len(preprocessed))
print(preprocessed[:30])

144078
['Chapter', '1', 'INTRODUCTION', 'TO', 'COGNITIVE', 'BEHAVIOR', 'THERAPY', 'A', 'revolution', 'in', 'the', 'field', 'of', 'mental', 'health', 'was', 'initiated', 'in', 'the', 'early', '1960s', 'by', 'Aaron', 'T', '.', 'Beck', ',', 'MD', ',', 'then']


In [31]:
# Unique tokens in sorted order; their count is the vocabulary size.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)
vocab_size

8895

In [51]:
# Map every token to its position in the sorted token list.
vocab = dict(zip(all_words, range(len(all_words))))

# Preview the first 501 (token, id) pairs; zip stops once the range runs out.
for i, item in zip(range(501), vocab.items()):
    print(item)

('!', 0)
('#1', 1)
('#2', 2)
('#3', 3)
('#4', 4)
('#5', 5)
('#6', 6)
('#7', 7)
('&', 8)
("'", 9)
('(', 10)
(')', 11)
('+', 12)
(',', 13)
('.', 14)
('0', 15)
('0%', 16)
('0–10', 17)
('0–100', 18)
('0–100%', 19)
('0’s', 20)
('1', 21)
('1%', 22)
('10', 23)
('10%', 24)
('10-minute', 25)
('10-point', 26)
('100', 27)
('100%', 28)
('100–101', 29)
('101', 30)
('102', 31)
('102f', 32)
('102–105', 33)
('103', 34)
('104', 35)
('104f', 36)
('104–105', 37)
('105', 38)
('106', 39)
('106–107', 40)
('107', 41)
('107–108', 42)
('107–146', 43)
('108', 44)
('108–110', 45)
('109', 46)
('109–110', 47)
('10:00', 48)
('10:30', 49)
('10th', 50)
('10–11', 51)
('10–15', 52)
('11', 53)
('110', 54)
('110–112', 55)
('111', 56)
('112', 57)
('112–117', 58)
('113', 59)
('113–141', 60)
('114', 61)
('115', 62)
('116', 63)
('116–117', 64)
('116–131', 65)
('117', 66)
('117–118', 67)
('118', 68)
('118–120', 69)
('119', 70)
('119–120', 71)
('11:00', 72)
('11:30', 73)
('11–12', 74)
('12', 75)
('120', 76)
('120–121', 77)
('1

In [65]:
class SimpleTokenizerV1:
    """Punctuation/whitespace tokenizer backed by a fixed vocabulary.

    Attributes:
        str_to_int: mapping from token string to token ID (the vocab as given).
        int_to_str: the reverse mapping, from token ID to token string.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> ID) and build the ID -> token lookup."""
        self.str_to_int = vocab
        # Invert the vocabulary so decode() can map an ID back to its token.
        self.int_to_str = dict((token_id, token) for token, token_id in vocab.items())

    def encode(self, text):
        """Return the list of token IDs for ``text``.

        Raises:
            KeyError: if a token of ``text`` is not in the vocabulary.
        """
        ids = []
        for piece in re.split(r'([,.?_!"()\']|--|\s)', text):
            # Skip the empty and whitespace-only pieces produced by the split.
            if not piece.strip():
                continue
            ids.append(self.str_to_int[piece])
        return ids

    def decode(self, ids):
        """Return the text for ``ids``: tokens joined by single spaces, with
        the whitespace in front of punctuation marks removed."""
        spaced = " ".join(self.int_to_str[token_id] for token_id in ids)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', spaced)
        
        

In [67]:
tokenizer = SimpleTokenizerV1(vocab=vocab)

# Sample passage, line breaks included, to run through the tokenizer.
text = """I ask Sally in our first session to enumerate her problems and
set specific goals so both she and I have a shared understanding of what
she is working toward"""
ids = tokenizer.encode(text=text)
print(ids)

[1674, 3268, 2368, 5450, 6395, 4921, 7433, 8115, 4646, 5260, 6786, 3161, 7441, 7630, 5104, 7573, 3532, 7461, 3161, 1674, 5219, 2925, 7458, 8256, 6308, 8516, 7461, 5685, 8582, 8142]


In [69]:
# Round-trip check: turn the token IDs back into text. decode() joins tokens
# with single spaces, so the line breaks of the input come back as spaces.
tokenizer.decode(ids)

'I ask Sally in our first session to enumerate her problems and set specific goals so both she and I have a shared understanding of what she is working toward'

In [75]:
# Special context tokens
# <|unk|> stands in for words that are not in the vocabulary; <|endoftext|> marks the boundary between joined texts.

In [77]:
# Sorted unique tokens, followed by the two special tokens so they get the
# highest IDs.
all_tokens = list(set(preprocessed))
all_tokens.sort()
all_tokens += ["<|endoftext|>", "<|unk|>"]
vocab = dict(zip(all_tokens, range(len(all_tokens))))
len(vocab)

8897

In [79]:
# Show the last five (token, id) pairs to confirm the special tokens were appended.
for i, item in enumerate([*vocab.items()][-5:]):
    print(item)

('•', 8892)
('\uf090', 8893)
('\uf0d2', 8894)
('<|endoftext|>', 8895)
('<|unk|>', 8896)


In [81]:
class SimpleTokenizerV2:
    """Punctuation/whitespace tokenizer that maps out-of-vocabulary tokens
    to the special ``"<|unk|>"`` token.

    Attributes:
        str_to_int: mapping from token string to token ID (the vocab as given).
        int_to_str: the reverse mapping, from token ID to token string.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> ID) and build the ID -> token lookup."""
        self.str_to_int = vocab
        # Invert the vocabulary so decode() can map an ID back to its token.
        self.int_to_str = dict((token_id, token) for token, token_id in vocab.items())

    def encode(self, text):
        """Return the list of token IDs for ``text``.

        Tokens missing from the vocabulary are encoded as ``"<|unk|>"``.

        Raises:
            KeyError: if an unknown token is met and ``"<|unk|>"`` itself is
                not in the vocabulary.
        """
        ids = []
        for piece in re.split(r'([,.?_!"()\']|--|\s)', text):
            # Skip the empty and whitespace-only pieces produced by the split.
            if not piece.strip():
                continue
            if piece not in self.str_to_int:
                piece = "<|unk|>"
            ids.append(self.str_to_int[piece])
        return ids

    def decode(self, ids):
        """Return the text for ``ids``: tokens joined by single spaces, with
        the whitespace in front of punctuation marks removed."""
        spaced = " ".join(self.int_to_str[token_id] for token_id in ids)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', spaced)

In [93]:
tokenizer = SimpleTokenizerV2(vocab=vocab)

text1 = "Hello, how are you feeling"
text2 = "I dislike therapy."

# "<|endoftext|>" marks the boundary between the two unrelated snippets.
text = " <|endoftext|> ".join([text1, text2])

print(text)

Hello, how are you feeling <|endoftext|> I dislike therapy.


In [97]:
# Encode then decode the joined text: words missing from the vocabulary come
# back as "<|unk|>", while "<|endoftext|>" survives as its own token.
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, how are you feeling <|endoftext|> I dislike therapy.'