In [6]:
!pip install -U pip transformers




In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline


In [9]:
checkpoint = 'facebook/nllb-200-distilled-600M'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [10]:
print(f"{len(tokenizer.vocab)}\n")

tokenizer.vocab

256204



{'·åà·â•·à®': 123130,
 'barira': 182628,
 '—ò—Å–∫–∞': 131404,
 'inale': 61636,
 '‚ñÅsebeletsa': 121166,
 '‡ΩÇ‡Ωì‡Ω¶‡ºã‡Ωö‡Ω¥‡Ω£‡ºã': 129161,
 '‚ñÅ‡¶Ü‡¶™': 4626,
 '‚ñÅ‡∞∏‡∞ø‡∞¶‡±ç‡∞ß‡∞Ç‡∞ó‡∞æ': 230025,
 '‚ñÅginawa': 48332,
 '‚ñÅ‡∞Æ‡∞ø‡∞ó‡∞ø‡∞≤‡∞ø': 177675,
 '‚ñÅ—Å–ø–æ—Ä—Ç': 38121,
 '‚ñÅKei': 23963,
 '·â≥·äï': 93179,
 '‡Ωì‡Ω¥‡ΩÇ‡ºã': 38600,
 '‚ñÅphuqhas': 241615,
 '‚ñÅenwegh·ªã': 59077,
 '‚ñÅgla': 13034,
 '‚ñÅ‡≤∏‡≤∞‡≤ï‡≤æ‡≤∞': 246214,
 'ekanga': 142897,
 '„Éà„É™„ÉÉ„ÇØ': 106551,
 'yendo': 78203,
 '≈°anu': 10719,
 '‚ñÅpodrobnosti': 220281,
 'cija': 5491,
 'ydd_Hebr': 256197,
 '‚ñÅ280': 127413,
 '‚ñÅ–ø—Ä–æ–ø–æ': 56665,
 'gov': 17582,
 '‚ñÅ◊î◊ô◊ô◊™': 93440,
 '—é–∫': 222672,
 '’°’Ø’°’∂': 4797,
 'n ã': 96871,
 '‚ñÅT√§m√§n': 89463,
 'bukas': 229760,
 '‚ñÅ’ß’Ω': 209295,
 '‚ñÅYa': 3147,
 'Ëë£': 255251,
 '‚ñÅ—Ç–∞—Ä—Ç—É—É': 156533,
 '‚ñÅtilfelle': 189577,
 '„Å´„Å™„Å£„Å¶„ÅÑ„Çã': 49314,
 'llll': 47036,
 '‚ñÅ·Éì·Éò·Éì': 131464,
 '‡ªâ‡∫ç': 214625,
 '‚ñÅyika': 77289,
 '‚ñÅverlaat': 157817,
 '‚ñÅ‡§Ø‡§É':

In [12]:
thai_char_min = 0x0E00
thai_char_max = 0x0E7F

thai_tokens = [
    token for token in tokenizer.vocab.keys()
    if any(thai_char_min <= ord(char) <= thai_char_max for char in token)
]

thai_token_count = len(thai_tokens)
sample_size = 20
thai_tokens_sample = thai_tokens[:sample_size]


print(f"{thai_token_count}\n")
for token in thai_tokens_sample:
  print(token)


1712

‚ñÅ‡∏ï‡∏≠‡∏ô‡∏ô‡∏µ‡πâ
‡∏Ñ‡∏ß‡∏≤‡∏°
‚ñÅ‡∏à‡∏≤‡∏Å
‚ñÅ‡∏£‡∏∞
‡∏≠‡∏á‡∏Ñ‡πå
‡∏ó‡∏ò
‡∏Ñ‡∏£‡∏≠‡∏á
‡∏ó‡∏µ‡πà
‡πÄ‡∏®
‡∏ï‡πà‡∏≤‡∏á
‡∏†‡∏≤‡∏û
‚ñÅ‡πÄ‡∏Ç‡πâ‡∏≤
‡∏ï‡πâ‡∏ô
‚ñÅ‡∏°‡∏±‡∏ô‡πÑ‡∏°‡πà
‡∏≠‡∏≠‡∏Å‡∏à‡∏≤‡∏Å
‡∏π‡πâ
‡∏û‡∏≤
‡∏ö‡∏£‡∏£‡∏î‡∏≤
‡πÇ‡∏•
‚ñÅ‡∏ü‡∏±‡∏á‡∏ô‡∏∞


In [13]:
import tensorflow as tf
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import numpy as np
import math


In [14]:
sentence = 'Work hard, play harder'

In [15]:
cleaned_sentence = sentence.replace(',', '')
cleaned_sentence

'Work hard play harder'

In [16]:
words = cleaned_sentence.split()
words

['Work', 'hard', 'play', 'harder']

In [17]:
sorted_words = sorted(words)
sorted_words

['Work', 'hard', 'harder', 'play']

In [18]:
dc = {word: index for index, word in enumerate(sorted_words)}
dc

{'Work': 0, 'hard': 1, 'harder': 2, 'play': 3}

In [19]:
sentence_int = tf.constant(
    [dc[s] for s in sentence.replace(',', '').split()],
    dtype=tf.int32
)

In [20]:
print(sentence)
print(sentence_int)

Work hard, play harder
tf.Tensor([0 1 3 2], shape=(4,), dtype=int32)


In [21]:
# ‡∏™‡∏£‡πâ‡∏≤‡∏á embedding layer
tf.random.set_seed(123)
vocab_size = 50_000
embedding_dim = 2

embed = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)

In [22]:
embedded_sentence = embed(sentence_int)

In [23]:
embedded_sentence

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[ 0.02556748, -0.04888377],
       [-0.01246854, -0.00029393],
       [-0.03982452, -0.02412304],
       [ 0.01625296, -0.00427993]], dtype=float32)>

In [24]:
tf.random.set_seed(123)
vocab_size = 50_000
embedding_dim = 2

dummy_input = tf.constant([0, 1, 2], dtype=tf.int32)

# Case 1 Default initializer (RandomUniform(-0.05, 0.05))
embed_default = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
_ = embed_default(dummy_input) # ‡πÄ‡∏£‡∏µ‡∏¢‡∏Å‡πÉ‡∏ä‡πâ‡∏á‡∏≤‡∏ô layer ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏™‡∏£‡πâ‡∏≤‡∏á weights
weights_default = embed_default.get_weights()[0].flatten()
weights_default.shape

(100000,)

In [25]:
# Case 2 GlorotUniform initializer
tf.random.set_seed(123)
embed_glorot = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.GlorotUniform()
)
_ = embed_glorot(dummy_input) # ‡πÄ‡∏£‡∏µ‡∏¢‡∏Å‡πÉ‡∏ä‡πâ‡∏á‡∏≤‡∏ô layer ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏™‡∏£‡πâ‡∏≤‡∏á weights
weights_glorot = embed_glorot.get_weights()[0].flatten()
weights_glorot.shape

(100000,)

In [26]:
fig = make_subplots(rows=1, cols=1)

fig.add_trace(go.Histogram(x=weights_default, nbinsx=50, name="Default Uniform [-0.05, 0.05]", opacity=0.6))
fig.add_trace(go.Histogram(x=weights_glorot, nbinsx=50, name="Glorot Uniform", opacity=0.6))

fig.update_layout(
    title_text='Embedding Layer Initialization Comparison',
    xaxis_title_text='Weight values',
    yaxis_title_text='Frequency',
    barmode='overlay',
    legend_orientation="h",
    legend_yanchor="bottom",
    legend_y=1.02,
    legend_xanchor="right",
    legend_x=1
)

fig.show()

print("Default initializer range ", weights_default.min(), weights_default.max())
print("Glorot initializer range ", weights_glorot.min(), weights_glorot.max())

Default initializer range  -0.049999952 0.049998928
Glorot initializer range  -0.010954106 0.010953146


In [27]:
def glorot_uniform_limits(fan_in, fan_out):
    limit = math.sqrt(6.0 / (fan_in + fan_out))
    a, b = -limit, limit
    return a, b

# ‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á Embedding layer (vocab_size=50000, embedding_dim=2)
fan_in = 50000
fan_out = 2

a, b = glorot_uniform_limits(fan_in, fan_out)
print("Glorot Uniform a =", a)
print("Glorot Uniform b =", b)

Glorot Uniform a = -0.010954232067652772
Glorot Uniform b = 0.010954232067652772


In [28]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [29]:
token_embedding_layer = model.model.encoder.embed_tokens
token_embedding_layer.weight.shape

torch.Size([256206, 1024])

In [30]:
long_sentence = "In the vast realm of natural language processing, understanding the nuances of how models handle sequential data is crucial. Positional encoding plays a vital role in providing this essential information to the model, allowing it to differentiate between words at different positions in a sentence, which is fundamental for tasks like translation, summarization, and text generation."

In [31]:
tokens = tokenizer(long_sentence, return_tensors="pt")

print(tokens['input_ids'][0])

tensor([256047,    717,    349,  14430,  12284, 248070,    452,  25307,  65445,
        157278, 248079, 133930,    349,    713,  75831,    452,  11657, 141057,
         47274, 116914, 124785,   6067,    248, 182071, 248075,  12013,  58409,
         12025, 246156,   3054,    705,      9, 104781,  76065,    108, 174693,
          3423, 140515,  18781,    202,    349,  14916, 248079,  82935,     87,
           796,    202,  53054,    502,  25914,  51744,    230,  30158, 199073,
           108,      9, 109267, 248079,   9089,    248,  75529,    351, 226047,
          6399, 200356, 248079,   2493, 109207, 181953, 248079,    540,  35883,
        120531, 248075,      2])


In [32]:
len(tokens['input_ids'][0])

75

In [33]:
token_embedding_layer(tokens['input_ids'][0][0]).shape

torch.Size([1024])

In [34]:
token_embeddings = token_embedding_layer(tokens['input_ids'][0])

print("Token Embedding Matrix shape", token_embeddings.shape)
token_embeddings

Token Embedding Matrix shape torch.Size([75, 1024])


tensor([[-5.0000e+00, -1.2725e+00, -9.3604e-01,  ..., -1.8297e+01,
         -9.1328e+00, -1.0672e+01],
        [ 2.6416e-01,  2.6831e-01,  2.0117e-01,  ...,  3.2715e+00,
         -3.2402e+00,  3.1738e+00],
        [ 4.3579e-01, -2.3352e-01,  2.6825e-02,  ...,  5.4648e+00,
          2.7129e+00,  5.5430e+00],
        ...,
        [ 8.5859e+00, -4.5391e+00, -4.7314e-01,  ..., -7.9529e-02,
          7.4844e+00, -7.5156e+00],
        [-2.4863e+00, -2.7515e-01,  5.6114e-03,  ...,  1.0180e+01,
         -7.2422e+00, -4.8047e+00],
        [-7.8320e-01, -9.0527e-01, -9.4482e-01,  ...,  3.1078e+01,
         -8.1494e-01, -8.7354e-01]], grad_fn=<MulBackward0>)

In [35]:
import plotly.express as px

token_embeddings_np = token_embeddings.detach().numpy()

fig = px.imshow(
    token_embeddings_np,
    color_continuous_scale="RdBu",
    labels=dict(x="Embedding Dimension", y="Token Index", color="Value"),
    title="Token Embedding Heatmap"
)

fig.update_xaxes(side="top")
fig.update_layout(height=500, width=900)
fig.show()

In [36]:
d = embedded_sentence.shape[-1]
d

2

In [37]:
d_q, d_k, d_v = 2, 2, 4

d_q, d_k, d_v

(2, 2, 4)

In [38]:
tf.random.set_seed(123)
W_query = tf.Variable(tf.random.uniform((d, d_q)), trainable=True)
W_key   = tf.Variable(tf.random.uniform((d, d_k)), trainable=True)
W_value = tf.Variable(tf.random.uniform((d, d_v)), trainable=True)

In [39]:
print(W_query.shape, W_key.shape, W_value.shape)

(2, 2) (2, 2) (2, 4)


In [41]:
W_query

<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[0.12615311, 0.5727513 ],
       [0.2993133 , 0.5461836 ]], dtype=float32)>

In [42]:
W_key

<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[0.88968754, 0.12354946],
       [0.7718717 , 0.6850728 ]], dtype=float32)>

In [43]:
W_value

<tf.Variable 'Variable:0' shape=(2, 4) dtype=float32, numpy=
array([[0.48962688, 0.5857923 , 0.36451697, 0.6550509 ],
       [0.9075084 , 0.37557673, 0.6882372 , 0.25384045]], dtype=float32)>

In [44]:
embedded_sentence

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[ 0.02556748, -0.04888377],
       [-0.01246854, -0.00029393],
       [-0.03982452, -0.02412304],
       [ 0.01625296, -0.00427993]], dtype=float32)>

In [45]:
queries = tf.matmul(embedded_sentence, W_query)
keys    = tf.matmul(embedded_sentence, W_key)
values  = tf.matmul(embedded_sentence, W_value)

In [46]:
print("Queries shape", queries.shape)
queries

Queries shape (4, 2)


<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[-0.01140615, -0.01205571],
       [-0.00166092, -0.00730191],
       [-0.01224433, -0.03598515],
       [ 0.00076932,  0.00697127]], dtype=float32)>

In [47]:
print("Keys shape", keys.shape)
keys

Keys shape (4, 2)


<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[-0.01498493, -0.0303301 ],
       [-0.01131998, -0.00174185],
       [-0.05405127, -0.02144633],
       [ 0.0111565 , -0.00092402]], dtype=float32)>

In [48]:
print("Values shape", values.shape)
values

Values shape (4, 4)


<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[-0.03184391, -0.00338237, -0.02432385,  0.00433932],
       [-0.00637168, -0.00741437, -0.00474729, -0.00824214],
       [-0.04139102, -0.03238895, -0.03111909, -0.03221049],
       [ 0.00407381,  0.00791342,  0.00297887,  0.0095601 ]],
      dtype=float32)>

In [50]:
omega = tf.matmul(queries, keys, transpose_b=True)

In [51]:


print("Omega shape", omega.shape)
print("Omega (Unnormalized attention weights)")
print(omega)

Omega shape (4, 4)
Omega (Unnormalized attention weights)
tf.Tensor(
[[ 5.36571024e-04  1.50116583e-04  8.75067431e-04 -1.16112911e-04]
 [ 2.46356532e-04  3.15204343e-05  2.46374286e-04 -1.17829586e-05]
 [ 1.27491355e-03  2.01286253e-04  1.43357145e-03 -1.03352824e-04]
 [-2.22967632e-04 -2.08515848e-05 -1.91091036e-04  2.14131433e-06]], shape=(4, 4), dtype=float32)


In [52]:
d_k = tf.cast(d_k, tf.float32)

scaled_omega = omega / tf.sqrt(d_k)

attention_weights = tf.nn.softmax(scaled_omega, axis=-1)

print("Attention Weights")
print(attention_weights)

Attention Weights
tf.Tensor(
[[0.25003096 0.24996264 0.2500908  0.2499156 ]
 [0.2500209  0.24998291 0.2500209  0.24997526]
 [0.25010136 0.24991155 0.2501294  0.24985772]
 [0.2499797  0.25001544 0.24998535 0.25001952]], shape=(4, 4), dtype=float32)


In [53]:
row_sums = tf.reduce_sum(attention_weights, axis=-1)

print("Sum of each row in attention_weights")
row_sums

Sum of each row in attention_weights


<tf.Tensor: shape=(4,), dtype=float32, numpy=array([1., 1., 1., 1.], dtype=float32)>

In [54]:
context_vector = tf.matmul(attention_weights, values)

print("Context Vector shape", context_vector.shape)
print(context_vector)

Context Vector shape (4, 4)
tf.Tensor(
[[-0.01888805 -0.00882151 -0.01430649 -0.00664159]
 [-0.01888472 -0.00881889 -0.01430399 -0.00663898]
 [-0.0188918  -0.00882307 -0.01430933 -0.00664266]
 [-0.01888197 -0.00881749 -0.0143019  -0.00663786]], shape=(4, 4), dtype=float32)


In [55]:
class SelfAttention(tf.keras.layers.Layer):
    def __init__(self, d_in, d_out_kq, d_out_v):
        super().__init__()
        self.d_out_kq = d_out_kq

        self.W_query = tf.Variable(
            tf.random.uniform((d_in, d_out_kq)), trainable=True
        )
        self.W_key = tf.Variable(
            tf.random.uniform((d_in, d_out_kq)), trainable=True
        )
        self.W_value = tf.Variable(
            tf.random.uniform((d_in, d_out_v)), trainable=True
        )

    def call(self, x):
        keys = tf.matmul(x, self.W_key)      # [T, d_out_kq]
        queries = tf.matmul(x, self.W_query) # [T, d_out_kq]
        values = tf.matmul(x, self.W_value)  # [T, d_out_v]

        # Attention scores: QK·µÄ
        attn_scores = tf.matmul(queries, keys, transpose_b=True)  # [T, T]

        # Softmax (scaled by sqrt(d_k))
        attn_weights = tf.nn.softmax(
            attn_scores / tf.math.sqrt(tf.cast(self.d_out_kq, tf.float32)), axis=-1
        )  # [T, T]

        # Weighted sum
        context_vec = tf.matmul(attn_weights, values)  # [T, d_out_v]
        return context_vec

In [56]:
tf.random.set_seed(123)

d_in, d_out_kq, d_out_v = 2, 2, 4

sa = SelfAttention(d_in, d_out_kq, d_out_v)

out = sa(embedded_sentence)

print(out.shape)  # (T, d_out_v)
print(out.numpy())

(4, 4)
[[-0.01888805 -0.00882151 -0.01430649 -0.00664159]
 [-0.01888472 -0.00881889 -0.01430399 -0.00663898]
 [-0.0188918  -0.00882307 -0.01430933 -0.00664266]
 [-0.01888197 -0.00881749 -0.0143019  -0.00663786]]


In [57]:
class MultiHeadAttentionWrapper(tf.keras.layers.Layer):
    def __init__(self, d_in, d_out_kq, d_out_v, num_heads):
        super().__init__()
        self.heads = [
            SelfAttention(d_in, d_out_kq, d_out_v)
            for _ in range(num_heads)
        ]

    def call(self, x):
        # ‡∏£‡∏±‡∏ô‡∏ó‡∏∏‡∏Å head ‡πÅ‡∏•‡πâ‡∏ß concat ‡∏ï‡∏≤‡∏°‡πÅ‡∏Å‡∏ô‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢
        head_outputs = [head(x) for head in self.heads]   # list of [T, d_out_v]
        return tf.concat(head_outputs, axis=-1)           # [T, num_heads * d_out_v]

In [58]:
tf.random.set_seed(123)

d_in, d_out_kq, d_out_v = 2, 2, 1

sa = SelfAttention(d_in, d_out_kq, d_out_v)

# ‡∏ñ‡πâ‡∏≤ embedded_sentence.shape = [T, d_in] ‡πÄ‡∏ä‡πà‡∏ô [6, 3]
out = sa(embedded_sentence)

print(out.shape)   # (T, d_out_v) -> (6, 1)
print(out.numpy())

(4, 1)
[[-0.01264724]
 [-0.01264453]
 [-0.01264966]
 [-0.01264263]]


In [59]:
tf.random.set_seed(123)

block_size = embedded_sentence.shape[0]   # [T, d_in] ‚Üí T = sequence length

mha = MultiHeadAttentionWrapper(
    d_in, d_out_kq, d_out_v, num_heads=3
)

# run MHA
context_vecs = mha(embedded_sentence)   # [T, num_heads * d_out_v]

print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

tf.Tensor(
[[-0.01264724 -0.016843   -0.01975319]
 [-0.01264453 -0.01684262 -0.01976124]
 [-0.01264966 -0.0168496  -0.01977023]
 [-0.01264263 -0.01683888 -0.01975338]], shape=(4, 3), dtype=float32)
context_vecs.shape: (4, 3)
