In [24]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers
import string

In [25]:
# Read the whole screenplay into memory as a single UTF-8 string.
script_path = '/Users/sanjayashastry/Downloads/interstellar_script.txt'
with open(script_path, 'r', encoding='utf-8') as script_file:
    text = script_file.read()

In [26]:
# Lower-case the script, strip every line, drop the blank ones,
# and collapse what remains onto a single space-separated line.
lines = [stripped for stripped in map(str.strip, text.lower().splitlines()) if stripped]
text = ' '.join(lines)

In [27]:
# Keep only ASCII letters, digits and spaces; any other character is
# dropped outright (it is not replaced by a space).
allowed = {*string.ascii_letters, *string.digits, ' '}
text = ''.join(filter(allowed.__contains__, text))

In [28]:
# Tokenization: whitespace-separated words, with ids assigned in sorted order.
tokens = text.split()
vocab = sorted(set(tokens))
vocab_size = len(vocab)
idx_to_word = dict(enumerate(vocab))
word_to_idx = {word: idx for idx, word in idx_to_word.items()}

In [29]:
# Encode the token stream as an array of vocabulary ids.
token_ids = np.array(list(map(word_to_idx.__getitem__, tokens)))

In [30]:
# Prepare training data: every sample is `context_size` consecutive ids,
# and its label is the id that immediately follows that window.
context_size = 6
window_ends = range(context_size, len(token_ids))
X = np.array([token_ids[end - context_size:end] for end in window_ends])
y = np.array([token_ids[end] for end in window_ends])

In [31]:
# Positional encoding
def get_positional_encoding(x, y):
    """Build a sinusoidal positional-encoding table.

    Args:
        x: number of positions (rows of the table).
        y: encoding width (columns of the table).

    Returns:
        A float32 ``tf.Tensor`` of shape ``(x, y)``. Entry ``(i, j)`` is
        ``sin(i / 10000 ** (2 * (j // 2) / y))`` for even ``j`` and the
        cosine of the same angle for odd ``j``.
    """
    positions = np.arange(x, dtype=np.float64)[:, np.newaxis]  # (x, 1)
    columns = np.arange(y)                                     # (y,)
    # The frequency term depends only on the column, so compute it once per
    # column instead of once per (position, column) pair.
    denominators = np.power(10000, (2 * (columns // 2)) / y)   # (y,)
    angles = positions / denominators                          # broadcasts to (x, y)
    # Even columns take the sine, odd columns the cosine.
    encoding = np.where(columns % 2 == 0, np.sin(angles), np.cos(angles))
    return tf.convert_to_tensor(encoding, dtype=tf.float32)

# Basically, we want x = context size and y = embed dim, so the output is a 6 by 64 matrix in which each row is the positional encoding for one word position in each element of X (uff, that's a lot).

In [32]:
# Preview the table for 6 positions and a width of 64; as the cell's last
# expression, the returned tensor is displayed as the cell output.
get_positional_encoding(6,64)

<tf.Tensor: shape=(6, 64), dtype=float32, numpy=
array([[ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  1.00000000e+00,
       

In [33]:
# Model
class MiniLLM(tf.keras.Model):
    """Single-head self-attention next-word predictor.

    Token ids are embedded, offset by a fixed sinusoidal positional encoding,
    passed through one scaled dot-product attention step, flattened, and
    projected to one logit per vocabulary word.
    """

    def __init__(self, vocab_size, context_size, embed_dim):
        """Create the embedding, the Q/K/V projections and the output layer."""
        super().__init__()
        self.embed = layers.Embedding(vocab_size, embed_dim)  # token id -> embed_dim vector
        # Square query / key / value projection matrices, created in that order.
        projection_kwargs = dict(shape=(embed_dim, embed_dim), initializer='random_normal', trainable=True)
        self.W_q = self.add_weight(**projection_kwargs)
        self.W_k = self.add_weight(**projection_kwargs)
        self.W_v = self.add_weight(**projection_kwargs)
        self.out = layers.Dense(vocab_size)
        # Fixed (non-trainable) table of shape (context_size, embed_dim).
        self.pos_enc = get_positional_encoding(context_size, embed_dim)
        self.context_size = context_size
        self.embed_dim = embed_dim

    def build(self, input_shape):
        """Build the output layer for the flattened attention width."""
        # NOTE(review): presumably done by hand because the reshape in `call`
        # leaves the flattened width statically unknown — TODO confirm.
        flat_width = self.context_size * self.embed_dim
        self.out.build((None, flat_width))
        super().build(input_shape)

    def call(self, x):
        """Return next-word logits for a batch of token-id windows."""
        tokens = self.embed(x) + self.pos_enc  # (batch, context, dim)
        queries = tf.matmul(tokens, self.W_q)
        keys = tf.matmul(tokens, self.W_k)
        values = tf.matmul(tokens, self.W_v)
        # Scaled dot-product attention over the positions of the window.
        scores = tf.matmul(queries, keys, transpose_b=True) / tf.math.sqrt(tf.cast(self.embed_dim, tf.float32))
        attended = tf.matmul(tf.nn.softmax(scores, axis=-1), values)
        # Concatenate every position's vector into one row per sample.
        batch = tf.shape(tokens)[0]
        return self.out(tf.reshape(attended, (batch, -1)))

In [34]:
# Training
embed_dim = 64
model = MiniLLM(vocab_size, context_size, embed_dim)
# The model's final Dense layer has no activation, so the loss consumes logits.
sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(loss=sparse_ce, optimizer='adam', metrics=['accuracy'])
model.fit(X, y, batch_size=64, epochs=20)

Epoch 1/20
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.0474 - loss: 6.8021
Epoch 2/20
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.0669 - loss: 6.0769
Epoch 3/20
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.0987 - loss: 5.7438
Epoch 4/20
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 15ms/step - accuracy: 0.1318 - loss: 5.3671
Epoch 5/20
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.1525 - loss: 5.0687
Epoch 6/20
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.1786 - loss: 4.7781
Epoch 7/20
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.2019 - loss: 4.5466
Epoch 8/20
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.2172 - loss: 4.2674
Epoch 9/20
[1m383/383[0m [32m

<keras.src.callbacks.history.History at 0x13ea3b820>

In [37]:
# Text generation
def generate(model, prompt, num_words, temperature=0.7):
    """Extend `prompt` by sampling one word at a time from `model`.

    Args:
        model: callable that maps an integer array of shape
            ``(1, context_size)`` to logits whose first row has one entry
            per vocabulary id.
        prompt: seed text; it is lower-cased and split on whitespace.
        num_words: number of words to append.
        temperature: the logits are divided by this value before sampling.
            ``0`` selects the most likely word deterministically instead of
            dividing by zero.

    Returns:
        The lower-cased prompt words followed by the generated words, joined
        by single spaces.
    """
    result = prompt.lower().split()
    for _ in range(num_words):
        # Words missing from the vocabulary fall back to id 0, the same id
        # that is used to left-pad a context shorter than `context_size`.
        context = [word_to_idx.get(w, 0) for w in result[-context_size:]]
        context = [0] * (context_size - len(context)) + context
        input_tensor = np.array(context).reshape(1, -1)
        # Work in float64: np.random.choice rejects probability vectors whose
        # sum drifts from 1, which a float32 softmax can do.
        logits = np.asarray(model(input_tensor)[0], dtype=np.float64)
        if temperature == 0:
            next_id = int(np.argmax(logits))
        else:
            scaled = logits / temperature
            scaled = scaled - scaled.max()  # numerically stable softmax
            probs = np.exp(scaled)
            probs = probs / probs.sum()
            next_id = int(np.random.choice(len(probs), p=probs))
        result.append(idx_to_word[next_id])
    return ' '.join(result)

In [40]:
# Try the sampler: continue the prompt "Love is " by 10 words at temperature 0.7.
print(generate(model, "Love is ", num_words=10, temperature=0.7))

love is about people to find on the ship hits a hot
