In [2]:
!pip install tiktoken
import tiktoken
import torch
import numpy as np
import torch.nn as nn

from torch.nn import functional as F




In [3]:
torch.manual_seed(256)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

block_size        = 40      ## N tokens in sequence
batch_size        = 64
max_iters         = 6000
eval_interval     = 500
learning_rate     = 0.0003
eval_iters        = 300
vocab_size        = 65

## every id for a given token is embedded to vector of this size
n_embd            = 512
n_head            = 8         ## 8 attention heads
n_layer           = 6         ## 6 eoncoder layers
dropout           = 0.2


In [4]:
from google.colab import files
uploaded = files.upload()

with open('cats_and_dogs_data.txt', 'r', encoding='utf-8') as f:
    text = f.read().split('.')



Saving cats_and_dogs_data.txt to cats_and_dogs_data.txt


In [5]:
# Initialize tiktoken encoder/decoder
tokenizer = tiktoken.get_encoding("cl100k_base")  # using the GPT-3 base encoding
encode = tokenizer.encode
decode = tokenizer.decode



In [6]:
text = ''

input_file2 = 'cats_and_dogs_data.txt'

with open(input_file2, 'r', encoding='utf-8') as f:
    text = f.read()
print(f"Loaded text length: {len(text)}")
print(f"Sample content: {text[:500]}")
# Tokenize the text
data = torch.tensor(encode(text), dtype=torch.long)


Loaded text length: 381381
Sample content: URL: https://veterinarypartner.vin.com/default.aspx?pId=19239&catId=102887

Found target section:    Cats
Diseases and Conditions
Allergies & Immune System
Adverse Reactions to Spot-on Flea and Tick Products
With spot-on preventives, there is the possibility that certain individuals will have adverse local reactions to one or more ingredients in the product.
Allergic Conjunctivitis in Dogs and Cats
Allergic conjunctivitis is most commonly treated with eyedrops or ointments containing corticoster


In [7]:
print("length of data in letter or characters")
len(text)


length of data in letter or characters


381381

In [8]:
list(set(text))


['/',
 'I',
 '–',
 'e',
 'z',
 'F',
 'T',
 'G',
 'y',
 'R',
 '’',
 'Q',
 'i',
 '\n',
 '"',
 'P',
 'g',
 '0',
 'A',
 'B',
 '7',
 '8',
 '!',
 'S',
 '&',
 'r',
 '®',
 'E',
 '1',
 'o',
 '9',
 ',',
 'L',
 ' ',
 'H',
 ':',
 '”',
 'Y',
 '-',
 'd',
 'x',
 '?',
 '5',
 ';',
 '6',
 'c',
 'p',
 'O',
 '2',
 '“',
 'U',
 'l',
 'k',
 '4',
 '.',
 'j',
 'f',
 't',
 'b',
 'C',
 'N',
 '3',
 'a',
 'D',
 "'",
 'u',
 'V',
 '=',
 '%',
 '—',
 'w',
 'n',
 'M',
 ')',
 'q',
 'h',
 'm',
 '\xa0',
 '™',
 'Z',
 'v',
 's',
 'J',
 'W',
 'K',
 '(']

In [9]:
the_chars  = sorted(     list(set(text))     )

vocab_size = len( the_chars )      ## 65

print(  len(the_chars)  )

print(  ''.join(the_chars)  )


86

 !"%&'(),-./0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWYZabcdefghijklmnopqrstuvwxyz ®–—’“”™


In [10]:
stoi = { ch:i for i, ch in enumerate(the_chars) }
itos = { i:ch for i, ch in enumerate(the_chars) }



In [11]:
print( stoi )
print( itos )



{'\n': 0, ' ': 1, '!': 2, '"': 3, '%': 4, '&': 5, "'": 6, '(': 7, ')': 8, ',': 9, '-': 10, '.': 11, '/': 12, '0': 13, '1': 14, '2': 15, '3': 16, '4': 17, '5': 18, '6': 19, '7': 20, '8': 21, '9': 22, ':': 23, ';': 24, '=': 25, '?': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33, 'H': 34, 'I': 35, 'J': 36, 'K': 37, 'L': 38, 'M': 39, 'N': 40, 'O': 41, 'P': 42, 'Q': 43, 'R': 44, 'S': 45, 'T': 46, 'U': 47, 'V': 48, 'W': 49, 'Y': 50, 'Z': 51, 'a': 52, 'b': 53, 'c': 54, 'd': 55, 'e': 56, 'f': 57, 'g': 58, 'h': 59, 'i': 60, 'j': 61, 'k': 62, 'l': 63, 'm': 64, 'n': 65, 'o': 66, 'p': 67, 'q': 68, 'r': 69, 's': 70, 't': 71, 'u': 72, 'v': 73, 'w': 74, 'x': 75, 'y': 76, 'z': 77, '\xa0': 78, '®': 79, '–': 80, '—': 81, '’': 82, '“': 83, '”': 84, '™': 85}
{0: '\n', 1: ' ', 2: '!', 3: '"', 4: '%', 5: '&', 6: "'", 7: '(', 8: ')', 9: ',', 10: '-', 11: '.', 12: '/', 13: '0', 14: '1', 15: '2', 16: '3', 17: '4', 18: '5', 19: '6', 20: '7', 21: '8', 22: '9', 23: ':', 24: ';', 25: '=', 26: '

In [12]:
encode = lambda s: [ stoi[c]          for c in s   ]

encode("bahh")


[53, 52, 59, 59]

In [13]:
decode = lambda l: ''.join(   itos[i] for i in l   )

decode([40, 39, 46, 46])



'NMTT'

In [14]:
data = torch.tensor(   encode(text), dtype=torch.long   )

print( data )



tensor([47, 44, 38,  ..., 25,  0,  0])


In [15]:
n          = int(   0.9*len(data)   )

train_data = data[:n]
val_data   = data[n:]


In [16]:
def get_batch(split):
    if split == "train":
        data = train_data
    else:
        data = val_data

    ix = torch.randint(   len(data) - block_size, (batch_size,)   )

    x  = torch.stack(    [  data[   i : i+block_size ]     for i in ix ]    )
    y  = torch.stack(    [  data[ i+1 : i+1+block_size ]   for i in ix ]    )

    x, y = x.to(device), y.to(device)

    return x, y


In [17]:
temp_batch_size = 4
temp_block_size = 16

## select random starting points for the 4 sentences
ix = torch.randint(
            len(data) - block_size,
            (temp_batch_size,)
)

print( ix )



for index_temp in ix:
    print(  data[index_temp]  )


tensor([249289, 246457, 137314, 323667])
tensor(59)
tensor(70)
tensor(67)
tensor(71)


In [18]:
x  = torch.stack(
    [ data[   i : i+  temp_block_size ]   for i in ix ]

)

y  = torch.stack(
    [ data[ i+1 : i+1+ temp_block_size ]  for i in ix ]
)

print(x)
print(y)


tensor([[59, 66, 66, 70, 60, 65, 58,  1, 52, 65,  1, 52, 63, 71, 56, 69],
        [70, 72, 52, 63, 63, 76,  1, 67, 52, 60, 65, 57, 72, 63,  9,  1],
        [67, 56, 69, 63, 60, 67, 60, 55, 56, 64, 60, 52,  1, 60, 65,  1],
        [71, 56, 60, 65,  1, 12,  1, 63, 66, 74,  1, 54, 52, 69, 53, 66]])
tensor([[66, 66, 70, 60, 65, 58,  1, 52, 65,  1, 52, 63, 71, 56, 69, 65],
        [72, 52, 63, 63, 76,  1, 67, 52, 60, 65, 57, 72, 63,  9,  1, 53],
        [56, 69, 63, 60, 67, 60, 55, 56, 64, 60, 52,  1, 60, 65,  1, 30],
        [56, 60, 65,  1, 12,  1, 63, 66, 74,  1, 54, 52, 69, 53, 66, 59]])


In [19]:
@torch.no_grad()    ## for efficient processing
def estimate_loss():
    out = {}
    model.eval()   ## set to no training
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()  ## back to training
    return out


In [20]:
## NN Architectures



class Head(nn.Module):

    def __init__(self, head_size):
        super().__init__()

        self.key   = nn.Linear(n_embd, head_size, bias=False)  ## [512, 64]
        self.query = nn.Linear(n_embd, head_size, bias=False)  ## [512, 64]
        self.value = nn.Linear(n_embd, head_size, bias=False)  ## [512, 64]

        tril_def = torch.tril( torch.ones(block_size, block_size) )  ## [40, 40]

        self.register_buffer(
                  'tril',
                  tril_def
               )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):

        B, T, E = x.shape   ## [batch_size, 40, 512]

        k = self.key(   x )            ## k = (B, T, 64)
        q = self.query( x )            ## q = (B, T, 64)

        E2 = 64     ## I think this is 64 and not 512
        ## (B, T, E) @ (B, E, T)  -> (B, T, T)
        wei = q @ k.transpose(-2, -1) * E2 ** -0.5

        wei = wei.masked_fill(
                      self.tril[:T, :T] == 0,
                      float('-inf')
        )

        ## (B, T, T)
        wei = F.softmax( wei, dim= -1 )         ## (B, T, T)
        wei = self.dropout(   wei   )

        ## perform weighted aggregation of values

        v   = self.value(  x  )   ## x = (B, 40, E)
        out = wei @ v             ## (B, T, T) @ (B, T, 64) -> (B, T, 64)

        return out


In [21]:
class FeedForward(nn.Module):

    def __init__(self, n_embd):         ## 512

        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),      ## [512, 4*512]
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),      ## [4*512, 512]
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)


In [22]:
class MultiHeadAttention(nn.Module):

    def __init__(self, num_heads, head_size):    ## (8, 64)
        super().__init__()
        self.heads = nn.ModuleList(  [ Head(head_size) for _ in range(num_heads) ] )
        self.proj  = nn.Linear(n_embd, n_embd)   ## 512, 512
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat(   [ h(x) for h in self.heads ], dim = -1   )
        out = self.proj(  out   )
        out = self.dropout(   out   )
        return out


In [23]:

class Block(nn.Module):

    def __init__(self, n_embd, n_head):     ## (512, 8)
        super().__init__()
        head_size = n_embd // n_head        ## 64
        self.sa   = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward( n_embd)    ## 512
        self.ln1  = nn.LayerNorm(n_embd)
        self.ln2  = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(     self.ln1(x)      )
        x = x + self.ffwd(   self.ln2(x)      )
        return x


In [24]:
class GPTModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)   ## [65, 512]
        self.pos_emb_table = nn.Embedding(block_size, n_embd)     ## [block, 512]

        self.blocks = nn.Sequential(
                *[   Block(n_embd, n_head=n_head) for _ in range(n_layer)    ]
        )

        self.ln_f    = nn.LayerNorm(  n_embd    )
        self.lm_ffw_head = nn.Linear(n_embd, vocab_size)  ## [512, 65] # FFW Layer

    def forward(self, idx, targets=None):
        B, T = idx.shape     ## (Batch, 40)
        ## ids and targets are both (B, T) tensors of integers

        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.pos_emb_table(torch.arange(T, device=device))

        x = tok_emb + pos_emb    ## [B, T, E] or [64, 40, 512]

        ## This is the architecture
        x = self.blocks(  x  )   ## (B, T, E)
        x = self.ln_f(    x  )   ## (B, T, E)   ## norm
        logits = self.lm_ffw_head(x)         ## [B, 40, 65]

        if targets is None:
            loss = None
        else:
            B, T, E  = logits.shape
            logits  = logits.view( B*T, E)
            targets = targets.view(B*T)
            loss    = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):    ## idx is (B, T)
        for _ in range(max_new_tokens):
            ## crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)    ## ## get preds
            logits = logits[:, -1, :]    ## focus on last one (B, E)
            probs = F.softmax(logits, dim= -1)    ## (B, E) get probs
            idx_next = torch.multinomial(probs, num_samples=1)     ## (B, 1) selected
            idx = torch.cat(  (idx, idx_next), dim=1  )   ## (B, T+1) append sample to running sequence
        return idx


In [25]:
model   = GPTModel()

m       = model.to(device)

optimizer = torch.optim.Adam(  m.parameters(), lr=learning_rate   )



In [26]:
for iter in range(max_iters):

    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')

    ## eval the loss
    logits, loss = m(xb, yb)

    optimizer.zero_grad(set_to_none=True)   ## zero out
    loss.backward()
    optimizer.step()



step 0: train loss 4.5992, val loss 4.5921
step 500: train loss 1.4280, val loss 1.6760
step 1000: train loss 1.0395, val loss 1.4104
step 1500: train loss 0.7913, val loss 1.2344
step 2000: train loss 0.6152, val loss 1.0933
step 2500: train loss 0.4916, val loss 0.9459
step 3000: train loss 0.4162, val loss 0.8578
step 3500: train loss 0.3701, val loss 0.7806
step 4000: train loss 0.3349, val loss 0.7028
step 4500: train loss 0.3172, val loss 0.6872
step 5000: train loss 0.3007, val loss 0.6524
step 5500: train loss 0.2936, val loss 0.6496


In [27]:
import re

# Function to load the cleaned cat data from a file
def load_cleaned_data(filename):
    with open(filename, 'r') as file:
        data = file.readlines()
    return data

# Function to clean the input question by removing special characters
def clean_question(question):
    # Clean the question by removing special characters and converting to lowercase
    cleaned_question = re.sub(r'[^a-zA-Z0-9\s]', '', question).lower()
    return cleaned_question

# Function to extract main keywords from the question
def extract_keywords_from_question(question):
    # Remove common words like "what", "is", "tell me about", etc.
    keywords = re.sub(r'(what|is|tell me about|about|please|how|can|where|when|who)\s?', '', question, flags=re.IGNORECASE).strip()
    return keywords.lower()

# Function to search for the exact keyword in the text and return only relevant information
def search_answer(question, cleaned_data):
    keywords = extract_keywords_from_question(question)
    relevant_lines = []

    # Keep track of whether we found a relevant line
    found_match = False

    for line in cleaned_data:
        # If a line contains the keyword, add it to relevant_lines
        if keywords in line.lower():
            # Avoid duplication of the same lines
            if line.strip() not in relevant_lines:
                relevant_lines.append(line.strip())
            found_match = True

    # If relevant lines are found, return them as a single paragraph
    if relevant_lines:
        paragraph = ' '.join(relevant_lines)
        # Optional: If you want to remove any excess whitespace
        paragraph = ' '.join(paragraph.split())
        return paragraph
    else:
        return "Sorry, I couldn't find an answer to your question."


In [30]:
# Function to ask the user a question and return the corresponding answer
def ask_user_question():
    # Load the cleaned cat data from the file
    cleaned_data = load_cleaned_data('cats_and_dogs_data.txt')

    while True:
        # Ask the user for a question
        user_question = input("Ask me a question or type exit to quit : ")

        # Exit condition if the user types 'exit'
        if user_question.lower() == 'exit':
            print("Goodbye!")
            break

        # Process the question and search for the answer
        answer = search_answer(user_question, cleaned_data)

        # Display the \answer in a paragraph format
        print(f"Answer: {answer}")

# Start the Q&A session
if __name__ == "__main__":
    ask_user_question()



Ask me a question or type exit to quit : what is dust mites
Answer: Dust Mites: Minimizing Exposure in Dogs and Cats Dust mites feed off of skin scales and dander shed by humans and animals. Mites love bedding, carpeting, and anywhere they can find a hiding place with the likelihood of skin dander being present. Dust mites also require a relatively high humidity in the home to truly thrive.
Ask me a question or type exit to quit : what is lymphoma
Answer: Common Lymphoma Chemotherapy Medications for Cats and Dogs These drugs are typically used to treat lymphoma. Cutaneous Lymphoma in Cats Lymphoma Lymphoma in Cats Lymphoma accounts for one third of all cancers developed by cats. When the mediastinal lymph nodes become infiltrated with lymphoma, a mass is apparent in the mediastinum and a cancerous fluid fills the chest, restricting breathing. The fluid can be tapped but it will be back without treatment. Lymphoma in the Skin of Dogs There are three types of skin lymphoma: mycosis fungo