In [1]:
import os

In [2]:
# Read every UTF-8 decodable file in the data folder into raw_texts.
data_folder = "data"
raw_texts = []
# os.listdir() returns entries in arbitrary order; sorting makes the order in
# which files are read (and therefore the concatenated corpus) reproducible.
for filename in sorted(os.listdir(data_folder)):
    file_path = os.path.join(data_folder, filename)
    if not os.path.isfile(file_path):
        continue  # skip anything that is not a regular file (e.g. sub-directories)
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            raw_texts.append(f.read())
    except UnicodeDecodeError:
        # Files that are not valid UTF-8 are reported and skipped, not fatal.
        print(f"Skipped non-text file: {filename}")

print("Number of files read:", len(raw_texts))

Skipped non-text file: .DS_Store
Number of files read: 1


In [3]:
# Concatenate all loaded documents into one string, one newline between files.
separator = "\n"
raw_text = separator.join(raw_texts)
n_chars = len(raw_text)
print("Total characters in raw text:", n_chars)
print("Sample of raw text:", raw_text[:100])

Total characters in raw text: 275153
Sample of raw text: {\rtf1\ansi\ansicpg1252\cocoartf2822
\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 


In [4]:
# There are several ways to process the raw text; the first is to split it into words.
import re

text = "Hi, my name is John. I love programming in Python!"

# Splitting on single whitespace characters leaves punctuation glued to the words.
whitespace_only = re.compile(r'\s')
result = whitespace_only.split(text)
print("Words in the text:", result)

# Splitting on commas, periods and whitespace: because the pattern is a
# capturing group, re.split keeps the matched delimiters as separate items
# (which is also why empty strings and single spaces show up in the result).
with_delimiters = re.compile(r'([,.]|\s)')
result = with_delimiters.split(text)
print("Words in the text (punctuation removed):", result)

Words in the text: ['Hi,', 'my', 'name', 'is', 'John.', 'I', 'love', 'programming', 'in', 'Python!']
Words in the text (punctuation removed): ['Hi', ',', '', ' ', 'my', ' ', 'name', ' ', 'is', ' ', 'John', '.', '', ' ', 'I', ' ', 'love', ' ', 'programming', ' ', 'in', ' ', 'Python!']


In [5]:
# The split leaves empty strings and whitespace-only items behind; they carry
# no information here, so drop them (this also saves memory).
result = [piece for piece in result if piece.strip() != ""]
print("Words in the text (whitespaces removed):", result)

Words in the text (whitespaces removed): ['Hi', ',', 'my', 'name', 'is', 'John', '.', 'I', 'love', 'programming', 'in', 'Python!']


In [6]:
# Tokenize the whole corpus: split on whitespace, on "--" and on the listed
# punctuation characters (kept as their own tokens by the capturing group),
# strip each piece, and keep only pieces that are
#   - non-empty,
#   - free of '*' (any token containing an asterisk is dropped), and
#   - not made up solely of digits.
# NOTE(review): the sample output shows the input is RTF, so control words such
# as "\rtf1" and words with a trailing backslash end up as tokens; consider
# converting the source to plain text first.
preprocessed = [
    tok
    for tok in map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', raw_text))
    if tok and '*' not in tok and not tok.isdigit()
]
print(preprocessed[:30])

['{\\rtf1\\ansi\\ansicpg1252\\cocoartf2822', '\\cocoatextscaling0\\cocoaplatform0{\\fonttbl\\f0\\fswiss\\fcharset0', 'Helvetica', ';', '}', '{\\colortbl', ';', '\\red255\\green255\\blue255', ';', '}', ';', ';', '}', '\\margl1440\\margr1440\\vieww11520\\viewh8400\\viewkind0', '\\pard\\tx720\\tx1440\\tx2160\\tx2880\\tx3600\\tx4320\\tx5040\\tx5760\\tx6480\\tx7200\\tx7920\\tx8640\\pardirnatural\\partightenfactor0', '\\f0\\fs24', '\\cf0', 'Chapter', '1\\', '\\', '\\', '\\', 'In', 'my', 'younger', 'and', 'more', 'vulnerable', 'years', 'my']


In [7]:
# Report how many tokens survived preprocessing.
n_tokens = len(preprocessed)
print(n_tokens, "tokens in the preprocessed text.")

64491 tokens in the preprocessed text.


In [8]:
# The vocabulary is the set of distinct tokens, in sorted order.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)
print("Vocabulary size:", vocab_size)

Vocabulary size: 7146


In [9]:
# Map every token to its position in the sorted word list (token -> integer id).
vocab = dict(zip(all_words, range(len(all_words))))

In [10]:
# Preview the first 101 (token, id) pairs of the vocabulary.
for position, entry in enumerate(vocab.items()):
    if position > 100:
        break
    print(entry)

('!', 0)
('"', 1)
('$3', 2)
('$5', 3)
("'", 4)
('(', 5)
(')', 6)
(',', 7)
('--', 8)
('.', 9)
('00-6', 10)
('00-9', 11)
('12th', 12)
('15-6', 13)
('15-8', 14)
('158th', 15)
('1\\', 16)
('2\\', 17)
('30-4', 18)
('30-5', 19)
('3\\', 20)
('4\\', 21)
('5\\', 22)
('5th', 23)
('6\\', 24)
('7\\', 25)
('8\\', 26)
('9\\', 27)
(':', 28)
(';', 29)
('?', 30)
('A', 31)
('AIN', 32)
('AND', 33)
('ANYbody', 34)
('A\\', 35)
('About', 36)
('Abrams', 37)
('Abruptly', 38)
('Absolutely', 39)
('Across', 40)
('Adam', 41)
('Adriatic\\', 42)
('After', 43)
('After\\', 44)
('Afterward', 45)
('Again', 46)
('Ah', 47)
('Ah-h-h', 48)
('Ahead', 49)
('Albany', 50)
('Albrucksburger', 51)
('All', 52)
('All\\', 53)
('Allied', 54)
('Almost', 55)
('Already', 56)
('Also', 57)
('Amen', 58)
('America', 59)
('American', 60)
('American\\', 61)
('Americans', 62)
('Amid', 63)
('Among', 64)
('An', 65)
('And', 66)
('And\\', 67)
('Angry', 68)
('Another', 69)
('Antoinette', 70)
('Any', 71)
('Anyhow', 72)
('Anything', 73)
('Anywhere', 

In [11]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Text is split on whitespace, on ``--`` and on the punctuation characters
    ``, . : ; ? _ ! " ( ) '``; punctuation becomes its own token. Tokens that
    contain ``*`` or consist only of digits are dropped before lookup.
    """

    # Capturing group so that re.split keeps the delimiters as tokens.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace directly before punctuation that decode() re-attaches to the
    # preceding word. Includes ':' and ';' because encode() splits them off too.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse (id -> token) mapping.

        Args:
            vocab: dict mapping token strings to integer ids.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert text into a list of token ids.

        Args:
            text: the string to tokenize.

        Returns:
            List of integer ids, one per kept token.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        pieces = (piece.strip() for piece in self._SPLIT_PATTERN.split(text))
        tokens = [
            tok for tok in pieces
            if tok and '*' not in tok and not tok.isdigit()
        ]
        return [self.str_to_int[tok] for tok in tokens]

    def decode(self, ids):
        """Convert a list of token ids back into a string.

        Tokens are joined with single spaces, then whitespace in front of
        ``, . : ; ? ! " ( ) '`` is removed. Whitespace *after* punctuation is
        kept, so e.g. the tokens ``I ' ve`` decode to ``I' ve``.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [12]:
# Encode a sample sentence with the first tokenizer and show the resulting ids.
text = "In my younger and more vulnerable years my father gave me some advice that I've been turning over in my mind ever since."
tokenizer = SimpleTokenizerV1(vocab)

ids = tokenizer.encode(text)
print(ids)

[483, 4485, 7136, 1239, 4424, 6835, 7120, 4485, 2972, 3263, 4305, 5916, 1145, 6367, 477, 4, 6781, 1495, 6609, 4733, 3746, 4485, 4368, 2824, 5791, 9]


In [13]:
# Round-trip check: decode the ids back into text. decode() joins tokens with
# spaces and only removes the space *before* punctuation, so the apostrophe
# in "I've" comes back as "I' ve".
tokenizer.decode(ids)

"In my younger and more vulnerable years my father gave me some advice that I' ve been turning over in my mind ever since."

In [14]:
# SimpleTokenizerV1.encode raises a KeyError for any token that is not in the vocabulary. To handle this, we add special tokens to the vocabulary: one for unknown words and one that marks the end of a text.

In [15]:
# Rebuild the vocabulary with two special tokens appended after the sorted
# corpus tokens, so they receive the two highest ids.
all_tokens = sorted(set(preprocessed)) + ["<|unk|>", "<|endoftext|>"]
vocab = dict(zip(all_tokens, range(len(all_tokens))))

In [16]:
# Number of entries in the extended vocabulary.
len(vocab)

7148

In [17]:
# Show the last five (token, id) pairs; the two special tokens come last.
tail_entries = list(vocab.items())[-5:]
for item in tail_entries:
    print(item)

('{\\colortbl', 7143)
('{\\rtf1\\ansi\\ansicpg1252\\cocoartf2822', 7144)
('}', 7145)
('<|unk|>', 7146)
('<|endoftext|>', 7147)


In [18]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Text is split on whitespace, on ``--`` and on the punctuation characters
    ``, . : ; ? _ ! " ( ) '``; punctuation becomes its own token. Tokens that
    contain ``*`` or consist only of digits are dropped before lookup.
    The vocabulary is expected to contain the ``<|unk|>`` token.
    """

    _UNKNOWN = "<|unk|>"
    # Capturing group so that re.split keeps the delimiters as tokens.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace directly before punctuation that decode() re-attaches to the
    # preceding word. Includes ':' and ';' because encode() splits them off too.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse (id -> token) mapping.

        Args:
            vocab: dict mapping token strings to integer ids.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert text into a list of token ids.

        Tokens that are not in the vocabulary are encoded as the id of
        ``<|unk|>``.

        Args:
            text: the string to tokenize.

        Returns:
            List of integer ids, one per kept token.

        Raises:
            KeyError: if the text yields at least one token and the
                vocabulary has no ``<|unk|>`` entry.
        """
        pieces = (piece.strip() for piece in self._SPLIT_PATTERN.split(text))
        tokens = [
            tok for tok in pieces
            if tok and '*' not in tok and not tok.isdigit()
        ]
        return [
            self.str_to_int.get(tok, self.str_to_int[self._UNKNOWN])
            for tok in tokens
        ]

    def decode(self, ids):
        """Convert a list of token ids back into a string.

        Ids that are not in the vocabulary decode to ``<|unk|>``. Tokens are
        joined with single spaces, then whitespace in front of
        ``, . : ; ? ! " ( ) '`` is removed; whitespace after punctuation is kept.
        """
        text = " ".join(self.int_to_str.get(i, self._UNKNOWN) for i in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [19]:
# Build a test input from two sentences joined by the end-of-text marker, and
# switch to the tokenizer that supports unknown tokens.
text1 = "In my younger and more vulnerable years my father gave me some advice that I've been turning over in my mind ever since."
text2 = "This is a test sentence with an unknownword that is not in the vocabulary."
text = "<|endoftext|> ".join((text1, text2))

tokenizer = SimpleTokenizerV2(vocab)

print(text)

In my younger and more vulnerable years my father gave me some advice that I've been turning over in my mind ever since.<|endoftext|> This is a test sentence with an unknownword that is not in the vocabulary.


In [20]:
# Encode the combined text; tokens missing from the vocabulary are encoded as
# the id of "<|unk|>".
ids = tokenizer.encode(text)
print(ids)

[483, 4485, 7136, 1239, 4424, 6835, 7120, 4485, 2972, 3263, 4305, 5916, 1145, 6367, 477, 4, 6781, 1495, 6609, 4733, 3746, 4485, 4368, 2824, 5791, 9, 7147, 914, 3911, 1049, 7146, 5629, 7057, 1232, 7146, 6367, 3911, 4582, 3746, 6370, 7146, 9]


In [21]:
# Decode the ids back into text. Note: this rebinds `text`, so the original
# input string is replaced by the decoded version from here on.
text = tokenizer.decode(ids)    
print(text)

In my younger and more vulnerable years my father gave me some advice that I' ve been turning over in my mind ever since. <|endoftext|> This is a <|unk|> sentence with an <|unk|> that is not in the <|unk|>.
