# Word: encode text as a list of character code points

In [1]:
def encode(text):
    """Convert a string into the list of its characters' Unicode code points.

    Args:
        text: The string to encode.

    Returns:
        A list holding one integer per character of ``text``, in order.
    """
    return list(map(ord, text))

def decode(nums):
    """Rebuild a string from an iterable of Unicode code points.

    Args:
        nums: Iterable of integers, each a valid argument to ``chr``.

    Returns:
        The string formed by concatenating the corresponding characters.
    """
    characters = map(chr, nums)
    return ''.join(characters)

# Round-trip demo: encode a sample string, then decode the result back.
text = "Abdul Rauf"
encoded = encode(text)  # with this cell's encode: a list of ints, one per character
decoded = decode(encoded)  # expected to reproduce `text`

# The labels end in a space and print() inserts its own separator, so the
# output shows two spaces after the colon.
print('encoded: ', encoded)
print('decoded: ', decoded)


encoded:  [65, 98, 100, 117, 108, 32, 82, 97, 117, 102]
decoded:  Abdul Rauf


# Sentence: encode text as a space-separated string of code points

In [2]:
def encode(text):
    """Encode a string as its code points written in decimal, space-separated.

    Args:
        text: The string to encode.

    Returns:
        A single string such as ``"72 105"`` for ``"Hi"``; empty for empty input.
    """
    code_points = [str(ord(symbol)) for symbol in text]
    return ' '.join(code_points)

def decode(nums):
    """Decode a whitespace-separated string of decimal code points into text.

    Args:
        nums: String such as ``"72 105"``; fields are split on any whitespace.

    Returns:
        The string formed from the characters the code points denote.

    Raises:
        ValueError: If a field is not an integer literal or is outside the
            range accepted by ``chr``.
    """
    fields = nums.split()
    return ''.join(map(chr, map(int, fields)))


# Round-trip demo for the string-based codec: punctuation and spaces in the
# sample are encoded as code points like any other character.
text = "Hi , Im rauf!"
encoded = encode(text)  # with this cell's encode: one space-separated string
decoded = decode(encoded)  # expected to reproduce `text`

print("encoded :", encoded)
print("decoded :", decoded)


encoded : 72 105 32 44 32 73 109 32 114 97 117 102 33
decoded : Hi , Im rauf!


In [5]:
def decode(nums):
    """Turn a whitespace-separated string of decimal code points into text.

    Args:
        nums: String of integer fields separated by whitespace.

    Returns:
        The decoded string; empty when ``nums`` has no fields.

    Raises:
        ValueError: If a field cannot be parsed as an integer or is not a
            valid code point for ``chr``.
    """
    characters = []
    for field in nums.split():
        characters.append(chr(int(field)))
    return ''.join(characters)

# A pre-encoded message: decimal code points separated by single spaces.
nums = "82 97 117 102 32 105 115 32 97 110 32 65 73 32 97 110 100 32 77 76 32 101 110 103 105 110 101 101 114 32 119 104 111 32 115 116 117 100 105 101 115 32 105 110 32 81 85 69 83 84 32 85 110 105 118 101 114 115 105 116 121 46"
text = decode(nums)  # rebinds the notebook-global `text`

print(text)

Rauf is an AI and ML engineer who studies in QUEST University.


# A more transformer-like pipeline: tokenizer, embedding encoder, nearest-embedding decoder

In [10]:
import numpy as np

# tokenizer
class tokenizer:
    """Whitespace tokenizer mapping each distinct token to an integer ID.

    IDs are assigned in order of first appearance in the fitted text, so the
    same text always yields the same vocabulary.
    """

    def __init__(self):
        # token string -> integer ID
        self.token_to_id = {}
        # integer ID -> token string (inverse of token_to_id)
        self.id_to_token = {}

    def fit(self, text):
        """Build the vocabulary from ``text``, replacing any previous one.

        Args:
            text: String whose whitespace-separated tokens form the vocabulary.
        """
        # dict.fromkeys de-duplicates while keeping first-occurrence order.
        # Iterating a set of strings instead would make the IDs depend on
        # hash randomization and change from one kernel run to the next.
        vocabulary = dict.fromkeys(text.split())

        # Clear in place so a refit never leaves stale entries that would make
        # the two mappings disagree (and so existing references stay valid).
        self.token_to_id.clear()
        self.id_to_token.clear()
        for idx, token in enumerate(vocabulary):
            self.token_to_id[token] = idx
            self.id_to_token[idx] = token

    def tokenize(self, text):
        """Convert ``text`` into a list of token IDs.

        Args:
            text: String to split on whitespace and look up token by token.

        Returns:
            A list of integer IDs, one per token.

        Raises:
            KeyError: If a token was not present in the fitted text.
        """
        return [self.token_to_id[token] for token in text.split()]

    def detokenize(self, token_ids):
        """Convert token IDs back into a space-joined string.

        Args:
            token_ids: Iterable of IDs previously assigned by ``fit``.

        Returns:
            The corresponding tokens joined by single spaces.

        Raises:
            KeyError: If an ID is not in the vocabulary.
        """
        return ' '.join(self.id_to_token[token_id] for token_id in token_ids)

# encoder
class encoder:
    """Lookup table of randomly initialised embedding vectors, one per token ID."""

    def __init__(self, vocab_size, embed_dim=8, seed=None):
        """Create the embedding matrix.

        Args:
            vocab_size: Number of rows (one per token ID).
            embed_dim: Length of each embedding vector.
            seed: Optional seed for a reproducible matrix. When ``None`` the
                matrix is drawn from NumPy's global random state, exactly as
                before, so a prior ``np.random.seed`` call is still honoured.
        """
        if seed is None:
            self.embeddings = np.random.randn(vocab_size, embed_dim)
        else:
            # Dedicated generator: reproducible without touching global state.
            rng = np.random.default_rng(seed)
            self.embeddings = rng.standard_normal((vocab_size, embed_dim))

    def encode(self, token_ids):
        """Return the embedding rows for ``token_ids``.

        Args:
            token_ids: Iterable of integer row indices into the matrix.

        Returns:
            A new array of shape ``(len(token_ids), embed_dim)``; an empty
            input yields shape ``(0, embed_dim)``.

        Raises:
            IndexError: If an ID is out of range or not an integer.
        """
        ids = np.asarray(list(token_ids))
        if ids.size == 0:
            # An empty list would become a float array, which cannot index;
            # return an empty result that still has the embedding width.
            return np.empty((0, self.embeddings.shape[1]), dtype=self.embeddings.dtype)
        # Integer-array indexing gathers all rows at once and returns a copy.
        return self.embeddings[ids]

# decoder
class decoder:
    """Turns embedding vectors back into text via nearest-embedding lookup."""

    def __init__(self, tokenizer, encoder):
        """Keep the collaborators used for decoding.

        Args:
            tokenizer: Object providing ``detokenize(token_ids)``.
            encoder: Object exposing an ``embeddings`` matrix with one row per
                token ID.
        """
        self.encoder = encoder
        self.tokenizer = tokenizer

    def _nearest_id(self, vector):
        """Return the row index whose embedding is closest (Euclidean) to ``vector``."""
        distances = np.linalg.norm(self.encoder.embeddings - vector, axis=1)
        return np.argmin(distances)

    def decode(self, embeddings):
        """Map each embedding to its nearest token and join the tokens.

        Args:
            embeddings: Iterable of vectors, each broadcastable against the
                encoder's embedding matrix.

        Returns:
            Whatever the tokenizer's ``detokenize`` returns for the matched IDs.
        """
        token_ids = []
        for vector in embeddings:
            token_ids.append(self._nearest_id(vector))
        return self.tokenizer.detokenize(token_ids)

# End-to-end demo: fit a vocabulary, tokenize a phrase, embed it, and decode
# the embeddings back to text.
text = "Im rauf and i like to listen music"
# NOTE(review): this rebinds the name `tokenizer` from the class to an
# instance, so the class can no longer be instantiated by that name until the
# class definition is executed again. The same applies to `encoder` and
# `decoder` below.
tokenizer = tokenizer()
tokenizer.fit(text)

# Every token in the phrase must occur in `text`; tokenize() looks tokens up
# directly in the fitted mapping.
token_ids = tokenizer.tokenize("i like music")

# One embedding row per vocabulary entry.
encoder = encoder(vocab_size=len(tokenizer.token_to_id))
embeddings = encoder.encode(token_ids)

# The embeddings are rows taken from the encoder's own table, so the nearest
# row to each one is itself (distance 0) and the round trip is exact.
decoder = decoder(tokenizer, encoder)
decoded_text = decoder.decode(embeddings)

# The embedding values are random draws, so that printed output changes from
# run to run unless the random state is seeded beforehand.
print("Original text:", "i like music")
print("Token IDs:", token_ids)
print("Embeddings:", embeddings)
print("Decoded text:", decoded_text)

Original text: i like music
Token IDs: [3, 4, 7]
Embeddings: [[ 0.46548567  0.0170458   0.14591807 -1.1541701  -1.33036509 -2.13509428
  -1.82943882  0.17930458]
 [-1.58710845  0.56924933 -0.63067732 -1.65797882 -1.28127458  2.35046445
   0.59150691  0.97028405]
 [ 0.00248547 -0.40309693  1.29835778  0.39032915  0.48558573  0.19364147
   0.74993737  0.23244141]]
Decoded text: i like music
