In [1]:
with open('/Users/sanjayashastry/Downloads/interstellar_script.txt', 'r', encoding='utf-8') as f:
    text = f.read()


In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers

In [3]:
#cleaning the text

text = text.lower()
lines = text.splitlines()
cleaned_lines = [line.strip() for line in lines if line.strip() != '']
cleaned_text = ' '.join(cleaned_lines)

import string

allowed_chars = set(string.ascii_letters + string.digits + " ")
cleaned_text= ''.join(char for char in cleaned_text if char in allowed_chars)

In [4]:
cleaned_text[:200]

'interstellar written by jonathan nolan and christopher nolan transferred to pdf from interstellar  the complete screenplay with selected storyboards published november 2014 by faber  faber ltd uk for '

In [5]:
#tokenization

tokens= cleaned_text.split()

In [6]:
vocab= sorted(set(tokens))

In [7]:
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

In [8]:
vocab_size= len(vocab)

In [9]:
token_ids = np.array([word_to_idx[word] for word in tokens])

In [10]:
token_ids

array([1462, 3253,  367, ..., 1947,  607,  857])

In [11]:
len(token_ids),vocab_size

(24489, 3284)

In [12]:
context_size = 6

X = []
y = []

for i in range(context_size, len(token_ids)):
    context = token_ids[i - context_size:i]
    target = token_ids[i]
    X.append(context)
    y.append(target)

X = np.array(X)
y = np.array(y)


In [13]:
X[0],X[1]

(array([1462, 3253,  367, 1494, 1864,   89]),
 array([3253,  367, 1494, 1864,   89,  449]))

In [14]:
y[0]

np.int64(449)

In [15]:
#embedding

embedding_dim= 64
embedding_layer = layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    input_length=context_size,
    name="word_embedding")



In [16]:
X_tensor = tf.convert_to_tensor(X, dtype=tf.int32) 
embedded_output = embedding_layer(X_tensor)

In [17]:
X_tensor

<tf.Tensor: shape=(24483, 6), dtype=int32, numpy=
array([[1462, 3253,  367, 1494, 1864,   89],
       [3253,  367, 1494, 1864,   89,  449],
       [ 367, 1494, 1864,   89,  449, 1864],
       ...,
       [2999, 2940,  381,   89, 3132,  946],
       [2940,  381,   89, 3132,  946, 1947],
       [ 381,   89, 3132,  946, 1947,  607]], dtype=int32)>

In [18]:
embedded_output

<tf.Tensor: shape=(24483, 6, 64), dtype=float32, numpy=
array([[[-0.04231769, -0.00695965, -0.02362796, ...,  0.00782615,
          0.04626682,  0.016004  ],
        [-0.04226515, -0.02959578,  0.03503232, ..., -0.02892702,
         -0.04453003,  0.03919861],
        [-0.04262077,  0.02134294,  0.01993401, ..., -0.02101747,
         -0.03222983,  0.00358448],
        [ 0.02524828,  0.0066609 , -0.00214107, ..., -0.02450227,
         -0.01353761,  0.01561255],
        [-0.01723098,  0.02548292,  0.04533874, ..., -0.03356725,
         -0.04238709, -0.00065075],
        [-0.0350266 ,  0.01233538,  0.03191798, ..., -0.03715302,
         -0.01159427,  0.02017493]],

       [[-0.04226515, -0.02959578,  0.03503232, ..., -0.02892702,
         -0.04453003,  0.03919861],
        [-0.04262077,  0.02134294,  0.01993401, ..., -0.02101747,
         -0.03222983,  0.00358448],
        [ 0.02524828,  0.0066609 , -0.00214107, ..., -0.02450227,
         -0.01353761,  0.01561255],
        [-0.01723098,  0

In [19]:
embedding_layer.weights[0]

<Variable path=word_embedding/embeddings, shape=(3284, 64), dtype=float32, value=[[-0.04035167  0.00994016  0.04380003 ... -0.0307601  -0.04100013
  -0.0189991 ]
 [-0.00613187 -0.02368096 -0.0248044  ...  0.01691686  0.02221227
  -0.02468401]
 [ 0.04954371 -0.01578617 -0.03552318 ... -0.00460106 -0.01872165
   0.04332887]
 ...
 [ 0.01559821 -0.01572269  0.00327324 ...  0.02265764  0.02303386
   0.00509912]
 [ 0.00610654  0.01632574 -0.01482235 ... -0.01334463 -0.00787904
   0.0137468 ]
 [ 0.02193556 -0.02646337 -0.00402904 ... -0.01972905 -0.01474252
   0.04712917]]>

In [20]:

def get_positional_encoding(context_size, embedding_dim):
    pos_enc = np.zeros((context_size, embedding_dim))
    for pos in range(context_size):
        for i in range(embedding_dim):
            angle = pos / np.power(10000, (2 * (i // 2)) / embedding_dim)
            if i % 2 == 0:
                pos_enc[pos, i] = np.sin(angle)
            else:
                pos_enc[pos, i] = np.cos(angle)
    return tf.convert_to_tensor(pos_enc, dtype=tf.float32)  # shape: (context_size, embedding_dim)

# Generate the positional encoding matrix
positional_encoding = get_positional_encoding(context_size, embedding_dim)

# embedded_output shape: (batch_size, context_size, embedding_dim)
# positional_encoding shape: (context_size, embedding_dim)

# Add positional encoding to each sample in the batch
embedded_with_pos = embedded_output + positional_encoding  # broadcasting will work automatically

print("Final shape after adding positional encoding:", embedded_with_pos.shape)

Final shape after adding positional encoding: (24483, 6, 64)


In [21]:
embedded_with_pos

<tf.Tensor: shape=(24483, 6, 64), dtype=float32, numpy=
array([[[-0.04231769,  0.9930403 , -0.02362796, ...,  1.0078261 ,
          0.04626682,  1.016004  ],
        [ 0.7992058 ,  0.5107065 ,  0.7165937 , ...,  0.971073  ,
         -0.04439668,  1.0391986 ],
        [ 0.8666766 , -0.3948039 ,  1.017414  , ...,  0.97898245,
         -0.03196312,  1.0035844 ],
        [ 0.16636828, -0.9833316 ,  0.77613145, ...,  0.9754976 ,
         -0.01313755,  1.0156125 ],
        [-0.7740335 , -0.6281607 ,  0.18687765, ...,  0.9664325 ,
         -0.04185368,  0.9993491 ],
        [-0.9939509 ,  0.2959976 , -0.5392092 , ...,  0.9628466 ,
         -0.01092751,  1.0201747 ]],

       [[-0.04226515,  0.9704042 ,  0.03503232, ...,  0.971073  ,
         -0.04453003,  1.0391986 ],
        [ 0.7988502 ,  0.5616452 ,  0.70149535, ...,  0.9789825 ,
         -0.03209648,  1.0035845 ],
        [ 0.9345457 , -0.40948594,  0.9953389 , ...,  0.97549766,
         -0.0132709 ,  1.0156125 ],
        [ 0.12388903, -0

In [22]:
embedded_with_pos.shape[-1]

64

In [23]:
#transformer block

head_dim = embedding_dim  # For single head, often D = d_k = d_v

# Initialize trainable weights for Q, K, V
W_q = tf.Variable(tf.random.normal([embedding_dim, head_dim]), name='W_q')
W_k = tf.Variable(tf.random.normal([embedding_dim, head_dim]), name='W_k')
W_v = tf.Variable(tf.random.normal([embedding_dim, head_dim]), name='W_v')

In [None]:
def compute_attention(embedded_with_pos):
    # Shape: (batch, T, D)
    Q = tf.einsum('bij,jk->bik', embedded_with_pos, W_q)  
    K = tf.einsum('bij,jk->bik', embedded_with_pos, W_k)
    V = tf.einsum('bij,jk->bik', embedded_with_pos, W_v)
    
    return Q, K, V

In [None]:
def scaled_dot_product_attention(Q, K, V):
    d_k = tf.cast(tf.shape(K)[-1], tf.float32)

    # Attention scores: Q @ Kᵗ
    scores = tf.einsum('bij,bkj->bik', Q, K)  
    scores /= tf.math.sqrt(d_k)

    # Softmax across keys
    attention_weights = tf.nn.softmax(scores, axis=-1)  

    # Weighted sum of values
    output = tf.einsum('bij,bjk->bik', attention_weights, V) 
    
    return output, attention_weights

In [26]:
Q, K, V = compute_attention(embedded_with_pos)
attended_output, attn_weights = scaled_dot_product_attention(Q, K, V)

In [27]:
last_token_output = attended_output[:, -1, :]

In [None]:
output_layer = layers.Dense(vocab_size)
logits = output_layer(last_token_output) 

In [29]:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [30]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

In [None]:
# Converting y to tensor
y_tensor = tf.convert_to_tensor(y, dtype=tf.int32)

# Training hyperparameters
epochs = 30
batch_size = 64

# Training loop
for epoch in range(epochs):
    epoch_loss = 0
    num_batches = int(np.ceil(len(X) / batch_size))
    
    for i in range(0, len(X), batch_size):
        X_batch = X[i:i+batch_size]
        y_batch = y_tensor[i:i+batch_size]

        with tf.GradientTape() as tape:
            X_tensor = tf.convert_to_tensor(X_batch, dtype=tf.int32)
            embedded_output = embedding_layer(X_tensor)
            embedded_with_pos = embedded_output + positional_encoding

            Q, K, V = compute_attention(embedded_with_pos)
            attended_output, _ = scaled_dot_product_attention(Q, K, V)
            last_token_output = attended_output[:, -1, :]
            logits = output_layer(last_token_output)

            loss = loss_fn(y_batch, logits)

        gradients = tape.gradient(loss, [embedding_layer.trainable_variables[0], 
                                         W_q, W_k, W_v, 
                                         output_layer.trainable_variables[0], 
                                         output_layer.trainable_variables[1]])
        
        optimizer.apply_gradients(zip(gradients, 
                                      [embedding_layer.trainable_variables[0], 
                                       W_q, W_k, W_v, 
                                       output_layer.trainable_variables[0], 
                                       output_layer.trainable_variables[1]]))

        epoch_loss += loss.numpy()

    print(f"Epoch {epoch+1}/{epochs} - Loss: {epoch_loss/num_batches:.4f}")

Epoch 1/30 - Loss: 3.5952
Epoch 2/30 - Loss: 3.5134
Epoch 3/30 - Loss: 3.4514
Epoch 4/30 - Loss: 3.4036
Epoch 5/30 - Loss: 3.3661
Epoch 6/30 - Loss: 3.3350
Epoch 7/30 - Loss: 3.3112
Epoch 8/30 - Loss: 3.2893
Epoch 9/30 - Loss: 3.2721
Epoch 10/30 - Loss: 3.2576
Epoch 11/30 - Loss: 3.2455
Epoch 12/30 - Loss: 3.2341
Epoch 13/30 - Loss: 3.2240
Epoch 14/30 - Loss: 3.2159
Epoch 15/30 - Loss: 3.2070
Epoch 16/30 - Loss: 3.1990
Epoch 17/30 - Loss: 3.1915
Epoch 18/30 - Loss: 3.1852
Epoch 19/30 - Loss: 3.1796
Epoch 20/30 - Loss: 3.1731
Epoch 21/30 - Loss: 3.1695
Epoch 22/30 - Loss: 3.1619
Epoch 23/30 - Loss: 3.1560
Epoch 24/30 - Loss: 3.1528
Epoch 25/30 - Loss: 3.1472
Epoch 26/30 - Loss: 3.1433
Epoch 27/30 - Loss: 3.1387
Epoch 28/30 - Loss: 3.1358
Epoch 29/30 - Loss: 3.1325
Epoch 30/30 - Loss: 3.1322


In [58]:
def sample_from_logits(logits, temperature=0.7):
    logits = logits / temperature
    probs = tf.nn.softmax(logits).numpy().flatten()
    return np.random.choice(len(probs), p=probs)

def generate_text(seed_text, num_words=20, temperature=0.7):
    generated = seed_text.lower().split()
    for _ in range(num_words):
        context = generated[-context_size:]
        if len(context) < context_size:
            context = [''] * (context_size - len(context)) + context
        context_ids = [word_to_idx.get(w, 0) for w in context]
        X_input = tf.convert_to_tensor([context_ids], dtype=tf.int32)

        embedded_output = embedding_layer(X_input)
        embedded_with_pos = embedded_output + positional_encoding

        Q, K, V = compute_attention(embedded_with_pos)
        attended_output, _ = scaled_dot_product_attention(Q, K, V)
        last_token_output = attended_output[:, -1, :]
        logits = output_layer(last_token_output)

        next_id = sample_from_logits(logits, temperature)
        next_word = idx_to_word[next_id]
        generated.append(next_word)

    return ' '.join(generated)

In [59]:
print(generate_text(" Love is ", num_words=10))

love is there even worked out int murphs bedroom twilight murph ten
