In [34]:
print("Hello")

Hello


In [35]:
import tensorflow as tf
from tensorflow.keras import layers 
import numpy as np
import os
import pickle

In [36]:
with open("data.txt","r",encoding='utf-8') as f:
    text_data=f.read()

In [37]:
text_data



We'll Follow the Architecture Shown in Attention is ALL you need paper

In [38]:
tokenizer=tf.keras.preprocessing.text.Tokenizer(num_words=10000,oov_token='<OOV>')
tokenizer.fit_on_texts([text_data])

In [39]:
sequence=tokenizer.texts_to_sequences([text_data])[0]

In [40]:
len(sequence)

22266

In [41]:
with open("tokenizer.pkl","wb") as f:
    pickle.dump(tokenizer,f)

In [42]:
def create_dataset(seq, window_size=64):
    input_seq, label_seq = [], []
    for i in range(len(seq) - window_size):
        input_seq.append(seq[i : i + window_size])
        label_seq.append(seq[i + 1 : i + window_size + 1])  # ✅ shifted by 1, same length
    return np.array(input_seq), np.array(label_seq)


In [43]:
X_data,y_data=create_dataset(sequence)

In [44]:
# max_seq_length=len(X_data)

In [45]:
# Embeddings have been made, next we have to positional encoding 
#dificult to understand but try and understand

class PositionalEncoding(layers.Layer):
    def __init__(self, max_len, d_model):
        super().__init__()
        pos = np.arange(max_len)[:, np.newaxis]
        i = np.arange(d_model)[np.newaxis, :]
        angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
        angle_rads = pos * angle_rates
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        self.pos_encoding = tf.cast(angle_rads[np.newaxis, ...], dtype=tf.float32)

    def call(self, x):
        return x + self.pos_encoding[:, :tf.shape(x)[1], :]


In [46]:
# Now we'll try to write the decoder block cause we're building gpt replica which a decoder only model 
# we haven't used masked multi head attention
def transformer_block(embed_dim, num_heads, ff_dim, dropout=0.1):
    inputs = layers.Input(shape=(None, embed_dim))
    attn_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(inputs, inputs)
    attn_output = layers.Dropout(dropout)(attn_output)
    out1 = layers.LayerNormalization(epsilon=1e-6)(inputs + attn_output)

    ffn = tf.keras.Sequential([
        layers.Dense(ff_dim, activation='relu'),
        layers.Dense(embed_dim),
    ])
    ffn_output = ffn(out1)
    ffn_output = layers.Dropout(dropout)(ffn_output)
    out2 = layers.LayerNormalization(epsilon=1e-6)(out1 + ffn_output)
    return tf.keras.Model(inputs=inputs, outputs=out2)
    

In [47]:

max_seq_len = 64       # Reduced sequence length
batch_size  = 16       # Much smaller batch size
embed_dim   = 128      # Lower embedding dimension
num_heads   = 4        # Still a valid multi-head config
ff_dim      = 512      # Reasonable size for feed-forward
num_layers  = 4        # Fewer decoder layers
vocab_size  = 10000    # Keep this as is unless your vocab is small
def build_gpt_model():
    inputs = layers.Input(shape=(max_seq_len,))
    x = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)
    x = PositionalEncoding(max_seq_len, embed_dim)(x)

    for _ in range(num_layers):
        x = transformer_block(embed_dim, num_heads, ff_dim)(x)

    outputs = layers.Dense(vocab_size, activation='softmax')(x)
    return tf.keras.Model(inputs, outputs)

In [48]:
model=build_gpt_model()
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [49]:
model.fit(X_data,y_data,batch_size=batch_size,epochs=10,validation_split=0.1)

Epoch 1/10
[1m1249/1249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m458s[0m 359ms/step - accuracy: 0.0388 - loss: 6.5663 - val_accuracy: 0.0331 - val_loss: 6.9485
Epoch 2/10
[1m1249/1249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m472s[0m 378ms/step - accuracy: 0.0394 - loss: 6.3834 - val_accuracy: 0.0331 - val_loss: 7.0487
Epoch 3/10
[1m1249/1249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m556s[0m 445ms/step - accuracy: 0.0393 - loss: 6.3800 - val_accuracy: 0.0331 - val_loss: 7.1295
Epoch 4/10
[1m1249/1249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m527s[0m 422ms/step - accuracy: 0.0391 - loss: 6.3772 - val_accuracy: 0.0331 - val_loss: 7.1837
Epoch 5/10
[1m1249/1249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m715s[0m 573ms/step - accuracy: 0.0391 - loss: 6.3775 - val_accuracy: 0.0331 - val_loss: 7.2389
Epoch 6/10
[1m1249/1249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m455s[0m 364ms/step - accuracy: 0.0392 - loss: 6.3759 - val_accuracy: 0.0331 - val_loss:

<keras.src.callbacks.history.History at 0x282b8e84320>

In [50]:
model.summary()