In [13]:
# Read the whole book into memory as a single string.
# The codec is spelled "utf-8" (the original "utf=8" was a typo that only
# worked because Python normalizes codec names before looking them up).
with open("therapy.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Sanity check: total size and the first 99 characters of the text.
print("Total number of characters: ", len(raw_text))
print(raw_text[:99])

Total number of characters:  752141
Chapter 1
INTRODUCTION TO
COGNITIVE BEHAVIOR THERAPY
A revolution in the fi eld of mental health wa


In [15]:
import re
# Tokenize the raw text: split on punctuation, double dashes and whitespace.
# The capture group makes re.split keep the delimiters as their own pieces;
# empty and whitespace-only pieces are then dropped.
preprocessed = [
    piece
    for piece in re.split(r'([,.?_!"()\']|--|\s)', raw_text)
    if piece.strip()
]
print(len(preprocessed))
print(preprocessed[:30])

144079
['Chapter', '1', 'INTRODUCTION', 'TO', 'COGNITIVE', 'BEHAVIOR', 'THERAPY', 'A', 'revolution', 'in', 'the', 'fi', 'eld', 'of', 'mental', 'health', 'was', 'initiated', 'in', 'the', 'early', '1960s', 'by', 'Aaron', 'T', '.', 'Beck', ',', 'MD', ',']


In [19]:
# Distinct tokens in sorted order; their count is the vocabulary size.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)
vocab_size

8896

In [20]:
# Give every token its position in the sorted token list as its integer ID.
vocab = dict(zip(all_words, range(len(all_words))))

# Preview the first 501 (token, ID) pairs.
for i, item in enumerate(vocab.items()):
    print(item)
    if not i < 500:
        break

('!', 0)
('#1', 1)
('#2', 2)
('#3', 3)
('#4', 4)
('#5', 5)
('#6', 6)
('#7', 7)
('&', 8)
("'", 9)
('(', 10)
(')', 11)
('+', 12)
(',', 13)
('.', 14)
('0', 15)
('0%', 16)
('0–10', 17)
('0–100', 18)
('0–100%', 19)
('0’s', 20)
('1', 21)
('1%', 22)
('10', 23)
('10%', 24)
('10-minute', 25)
('10-point', 26)
('100', 27)
('100%', 28)
('100–101', 29)
('101', 30)
('102', 31)
('102f', 32)
('102–105', 33)
('103', 34)
('104', 35)
('104f', 36)
('104–105', 37)
('105', 38)
('106', 39)
('106–107', 40)
('107', 41)
('107–108', 42)
('107–146', 43)
('108', 44)
('108–110', 45)
('109', 46)
('109–110', 47)
('10:00', 48)
('10:30', 49)
('10th', 50)
('10–11', 51)
('10–15', 52)
('11', 53)
('110', 54)
('110–112', 55)
('111', 56)
('112', 57)
('112–117', 58)
('113', 59)
('113–141', 60)
('114', 61)
('115', 62)
('116', 63)
('116–117', 64)
('116–131', 65)
('117', 66)
('117–118', 67)
('118', 68)
('118–120', 69)
('119', 70)
('119–120', 71)
('11:00', 72)
('11:30', 73)
('11–12', 74)
('12', 75)
('120', 76)
('120–121', 77)
('1

In [21]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> ID vocabulary.

    Text is split on punctuation, double dashes and whitespace. Every
    resulting token must be present in the vocabulary; an unknown token
    raises KeyError from the dictionary lookup.
    """

    def __init__(self, vocab):
        """Store the token -> ID mapping and build the reverse ID -> token mapping."""
        self.str_to_int = vocab
        # Invert the vocabulary so IDs can be mapped back to tokens in decode().
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        """Split `text` into tokens and return the list of their IDs."""
        pieces = re.split(r'([,.?_!"()\']|--|\s)', text)
        # Drop empty and whitespace-only pieces produced by the split.
        tokens = [piece for piece in pieces if piece.strip()]
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Turn a sequence of token IDs back into a single string."""
        joined = " ".join(self.int_to_str[token_id] for token_id in ids)
        # Remove the space that the join put in front of punctuation marks.
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)
        
        

In [22]:
# Sample passage to encode with the V1 tokenizer.
text = """I ask Sally in our first session to enumerate her problems and
set specific goals so both she and I have a shared understanding of what
she is working toward"""

tokenizer = SimpleTokenizerV1(vocab)
ids = tokenizer.encode(text)
print(ids)

[1674, 3268, 2368, 5451, 6396, 4922, 7434, 8116, 4647, 5261, 6787, 3161, 7442, 7631, 5105, 7574, 3532, 7462, 3161, 1674, 5220, 2925, 7459, 8257, 6309, 8517, 7462, 5686, 8583, 8143]


In [23]:
# Map the token IDs from the previous cell back to text to check the round trip.
tokenizer.decode(ids)

'I ask Sally in our first session to enumerate her problems and set specific goals so both she and I have a shared understanding of what she is working toward'

In [24]:
#Special Content Tokens
#<|unk|> and <|endoftext|> - used for unknown words and for separating texts, so the data is processed in a better way

In [25]:
# Rebuild the vocabulary with the two special tokens appended after the
# sorted regular tokens, so they receive the two highest IDs.
all_tokens = sorted(set(preprocessed))
all_tokens += ["<|endoftext|>", "<|unk|>"]
vocab = dict(zip(all_tokens, range(len(all_tokens))))
len(vocab)

8898

In [26]:
# Show the last five vocabulary entries, which include the special tokens.
for i, item in enumerate(tuple(vocab.items())[-5:]):
    print(item)

('•', 8893)
('\uf090', 8894)
('\uf0d2', 8895)
('<|endoftext|>', 8896)
('<|unk|>', 8897)


In [27]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to "<|unk|>".

    Splitting and decoding work as in the V1 tokenizer; the difference is
    that a token missing from the vocabulary is replaced by "<|unk|>"
    before the ID lookup, so the vocabulary is expected to contain that entry.
    """

    def __init__(self, vocab):
        """Store the token -> ID mapping and build the reverse ID -> token mapping."""
        self.str_to_int = vocab
        # Invert the vocabulary so IDs can be mapped back to tokens in decode().
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        """Split `text` into tokens and return their IDs, using "<|unk|>" for unknown tokens."""
        ids = []
        for piece in re.split(r'([,.?_!"()\']|--|\s)', text):
            # Skip empty and whitespace-only pieces produced by the split.
            if not piece.strip():
                continue
            known = piece if piece in self.str_to_int else "<|unk|>"
            ids.append(self.str_to_int[known])
        return ids

    def decode(self, ids):
        """Turn a sequence of token IDs back into a single string."""
        joined = " ".join(self.int_to_str[token_id] for token_id in ids)
        # Remove the space that the join put in front of punctuation marks.
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)

In [28]:
# Two independent texts, joined with the end-of-text marker between them.
text1 = "Hello, how are you feeling"
text2 = "I dislike therapy."
text = text1 + " <|endoftext|> " + text2

tokenizer = SimpleTokenizerV2(vocab)

print(text)

Hello, how are you feeling <|endoftext|> I dislike therapy.


In [29]:
# Encode then decode the joined text; tokens missing from the vocabulary come back as <|unk|>.
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, how are you feeling <|endoftext|> I dislike therapy.'

In [30]:
#Byte Pair Encoding

In [31]:
!pip3 install tiktoken

Collecting tiktoken
  Obtaining dependency information for tiktoken from https://files.pythonhosted.org/packages/4d/ae/4613a59a2a48e761c5161237fc850eb470b4bb93696db89da51b79a871f1/tiktoken-0.9.0-cp311-cp311-macosx_10_12_x86_64.whl.metadata
  Downloading tiktoken-0.9.0-cp311-cp311-macosx_10_12_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-macosx_10_12_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [32]:
import importlib
import tiktoken

In [33]:
# Switch to tiktoken's "gpt2" encoding. This rebinds `tokenizer`, which the
# earlier cells had bound to a SimpleTokenizerV2 instance.
tokenizer = tiktoken.get_encoding("gpt2")

In [34]:
# Encode a sample containing the end-of-text marker; the marker has to be
# passed in allowed_special for this call.
text = "Hello, how are you feeling? <|endoftext|> I love flowers"
integers = tokenizer.encode(
    text,
    allowed_special={"<|endoftext|>"},
)

print(integers)

[15496, 11, 703, 389, 345, 4203, 30, 220, 50256, 314, 1842, 12734]


In [35]:
#Creating input target pairs

In [36]:
# Re-read the book and encode the whole text with the current tokenizer.
with open("therapy.txt", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
# Number of token IDs in the encoded text.
print(len(enc_text))

186730


In [37]:
# Number of tokens in one input window.
context_size = 4

# x is the first window; y is the same window shifted one token to the right.
x = enc_text[0:context_size]
y = enc_text[1:1 + context_size]

print(x, y)

[14126, 352, 198, 1268] [352, 198, 1268, 5446]


In [38]:
# For each prefix length i, show the prefix and the token that follows it.
for i in range(1, 1 + context_size):
    context, desired = enc_text[:i], enc_text[i]

    print(context,"--->", desired)

[14126] ---> 352
[14126, 352] ---> 198
[14126, 352, 198] ---> 1268
[14126, 352, 198, 1268] ---> 5446


In [39]:
#Implement a Data Loader

In [40]:
!pip3 install torch
from torch.utils.data import Dataset, DataLoader

Collecting torch
  Obtaining dependency information for torch from https://files.pythonhosted.org/packages/3f/14/e105b8ef6d324e789c1589e95cb0ab63f3e07c2216d68b1178b7c21b7d2a/torch-2.2.2-cp311-none-macosx_10_9_x86_64.whl.metadata
  Downloading torch-2.2.2-cp311-none-macosx_10_9_x86_64.whl.metadata (25 kB)
Collecting typing-extensions>=4.8.0 (from torch)
  Obtaining dependency information for typing-extensions>=4.8.0 from https://files.pythonhosted.org/packages/69/e0/552843e0d356fbb5256d21449fa957fa4eff3bbc135a74a691ee70c7c5da/typing_extensions-4.14.0-py3-none-any.whl.metadata
  Downloading typing_extensions-4.14.0-py3-none-any.whl.metadata (3.0 kB)
Downloading torch-2.2.2-cp311-none-macosx_10_9_x86_64.whl (150.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.8/150.8 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hUsing cached typing_extensions-4.14.0-py3-none-any.whl (43 kB)
Installing collected packages: typing-extensions, torch
  Attemptin

In [41]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is encoded once with `tokenizer`. A window of `max_length`
    token IDs is taken every `stride` positions; each target is the
    matching input window shifted one token to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Encode `txt` and precompute all input/target windows.

        Args:
            txt: Text to encode.
            tokenizer: Object whose `encode(txt, allowed_special=...)` returns
                a sequence of token IDs.
            max_length: Number of token IDs per window.
            stride: Distance between the starts of consecutive windows.
        """
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop early enough that every target window (shifted by one) still
        # lies fully inside token_ids.
        starts = range(0, len(token_ids) - max_length, stride)
        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length]) for start in starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1]) for start in starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the (input, target) tensor pair at position `idx`."""
        return self.input_ids[idx], self.target_ids[idx]

In [42]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader over sliding-window samples of `txt`.

    The text is encoded with tiktoken's "gpt2" encoding and wrapped in a
    GPTDatasetV1 using `max_length` and `stride`. The remaining arguments
    (`batch_size`, `shuffle`, `drop_last`, `num_workers`) are forwarded to
    the DataLoader unchanged.

    Returns:
        The DataLoader wrapping the dataset.
    """
    dataset = GPTDatasetV1(txt, tiktoken.get_encoding("gpt2"), max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(dataset, **loader_options)

In [43]:
#Testing Dataloader

In [45]:
# Load the book text again so the dataloader test below starts from the file.
with open("therapy.txt", encoding="utf-8") as f:
    raw_text = f.read()

In [46]:
import torch
# Smallest setup: one window of four tokens per batch, in file order.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)

data_iter = iter(dataloader)
first_batch = next(data_iter)

# The batch holds two tensors: the inputs and the targets shifted by one token.
print(first_batch)

[tensor([[14126,   352,   198,  1268]]), tensor([[ 352,  198, 1268, 5446]])]


In [48]:
# Eight windows per batch; stride equals max_length, so consecutive input
# windows do not overlap.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False,
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("Inputs:\n", inputs)
print("Targets:\n", targets)

Inputs:
 tensor([[14126,   352,   198,  1268],
        [ 5446, 28644,  2849,  5390],
        [  198,    34,  7730,    45],
        [ 2043,  9306,  9348,  7801],
        [12861,  1581,  2320,  1137],
        [ 2969,    56,   198,    32],
        [ 5854,   287,   262, 25912],
        [18441,   286,  5110,  1535]])
Targets:
 tensor([[  352,   198,  1268,  5446],
        [28644,  2849,  5390,   198],
        [   34,  7730,    45,  2043],
        [ 9306,  9348,  7801, 12861],
        [ 1581,  2320,  1137,  2969],
        [   56,   198,    32,  5854],
        [  287,   262, 25912, 18441],
        [  286,  5110,  1535,   373]])


In [49]:
#Token Embeddings - assigning values to words (aka vector embeddings)

In [50]:
import gensim.downloader as api

In [51]:
# Load the "word2vec-google-news-300" model through gensim's downloader API.
# NOTE(review): presumably a large download on first use (the recorded run
# flooded the output with progress messages) - confirm and consider caching.
model = api.load("word2vec-google-news-300")



IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





In [52]:
# Alias the loaded model under a more descriptive name.
word_vectors = model

# Print the vector stored for the word 'computer'.
print(word_vectors['computer'])

[ 1.07421875e-01 -2.01171875e-01  1.23046875e-01  2.11914062e-01
 -9.13085938e-02  2.16796875e-01 -1.31835938e-01  8.30078125e-02
  2.02148438e-01  4.78515625e-02  3.66210938e-02 -2.45361328e-02
  2.39257812e-02 -1.60156250e-01 -2.61230469e-02  9.71679688e-02
 -6.34765625e-02  1.84570312e-01  1.70898438e-01 -1.63085938e-01
 -1.09375000e-01  1.49414062e-01 -4.65393066e-04  9.61914062e-02
  1.68945312e-01  2.60925293e-03  8.93554688e-02  6.49414062e-02
  3.56445312e-02 -6.93359375e-02 -1.46484375e-01 -1.21093750e-01
 -2.27539062e-01  2.45361328e-02 -1.24511719e-01 -3.18359375e-01
 -2.20703125e-01  1.30859375e-01  3.66210938e-02 -3.63769531e-02
 -1.13281250e-01  1.95312500e-01  9.76562500e-02  1.26953125e-01
  6.59179688e-02  6.93359375e-02  1.02539062e-02  1.75781250e-01
 -1.68945312e-01  1.21307373e-03 -2.98828125e-01 -1.15234375e-01
  5.66406250e-02 -1.77734375e-01 -2.08984375e-01  1.76757812e-01
  2.38037109e-02 -2.57812500e-01 -4.46777344e-02  1.88476562e-01
  5.51757812e-02  5.02929

In [54]:
# Analogy query (king - man + woman): ask for the 10 entries most similar to
# 'king' and 'woman' while being dissimilar to 'man'.
print(word_vectors.most_similar(positive=['king', 'woman'], negative=['man'], topn=10))

[('queen', 0.7118193507194519), ('monarch', 0.6189674139022827), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321839332581), ('kings', 0.5236844420433044), ('Queen_Consort', 0.5235945582389832), ('queens', 0.5181134343147278), ('sultan', 0.5098593831062317), ('monarchy', 0.5087411999702454)]
