# Imports and Setup

In [24]:
import re, numpy as np, tensorflow as tf, matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy("mixed_float16")


# Category I – War and Peace

## Load & Preprocess Text

In [25]:
url_wp = "https://cs.stanford.edu/people/karpathy/char-rnn/warpeace_input.txt"
text_wp = tf.keras.utils.get_file("warpeace.txt", url_wp)
with open(text_wp, 'r', encoding='utf-8') as f:
    wp_text = f.read().lower()
wp_text = re.sub('[^a-zA-Z0-9 \.]', ' ', wp_text)
wp_tokens = wp_text.split()


Downloading data from https://cs.stanford.edu/people/karpathy/char-rnn/warpeace_input.txt
[1m3258246/3258246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


## Vocabulary and Token Sequences

In [26]:
vocab_wp = sorted(set(wp_tokens))
stoi_wp = {w:i for i,w in enumerate(vocab_wp)}
itos_wp = {i:w for w,i in stoi_wp.items()}
print("Vocab size:", len(vocab_wp))

from collections import Counter
c_wp = Counter(wp_tokens)
print("Top 10:", c_wp.most_common(10))
print("Bottom 10:", list(c_wp.items())[-10:])

def tokens_to_sequences(tokens, stoi, context=5, max_pairs=None):
    X,y=[],[]
    for i in range(context,len(tokens)):
        ctx = [stoi.get(w,0) for w in tokens[i-context:i]]
        X.append(ctx); y.append(stoi.get(tokens[i],0))
        if max_pairs and len(X)>=max_pairs: break
    return np.array(X), np.array(y)

CONTEXT = 5
X_wp, y_wp = tokens_to_sequences(wp_tokens, stoi_wp, context=CONTEXT)


Vocab size: 22982
Top 10: [('the', 34544), ('and', 22221), ('to', 16637), ('of', 14870), ('a', 10541), ('he', 9885), ('in', 8920), ('that', 8115), ('his', 7973), ('was', 7323)]
Bottom 10: [('firmament', 1), ('joshua', 1), ('nun.', 1), ('defenders', 2), ('conceptions.', 1), ('uninvited', 1), ('strengthens', 1), ('erected.', 1), ('immovability', 1), ('unreal', 1)]


## Train/Validation Split

In [27]:
X_wp_train, X_wp_val, y_wp_train, y_wp_val = train_test_split(X_wp, y_wp, test_size=0.1, random_state=42)


## Model Builder

In [28]:
def build_mlp(vocab_size, context_len, embed_dim, hidden_layers, hidden_units, activation):
    i = tf.keras.Input(shape=(context_len,))
    x = tf.keras.layers.Embedding(vocab_size, embed_dim)(i)
    x = tf.keras.layers.Flatten()(x)
    for _ in range(hidden_layers):
        x = tf.keras.layers.Dense(hidden_units, activation=activation)(x)
    o = tf.keras.layers.Dense(vocab_size, activation='softmax', dtype='float32')(x)
    m = tf.keras.Model(i,o)
    m.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
    return m

def prepare_tfds(X,y):
    ds = tf.data.Dataset.from_tensor_slices((X,y))
    return ds.shuffle(10000).batch(512).prefetch(tf.data.AUTOTUNE)


## Train Models

In [None]:
EPOCHS = 200
variants_wp = [
    {"embed_dim":32,"hidden_layers":1,"hidden_units":1024,"activation":"relu"},
    {"embed_dim":64,"hidden_layers":1,"hidden_units":1024,"activation":"relu"},
    {"embed_dim":64,"hidden_layers":2,"hidden_units":1024,"activation":"tanh"},
]

def train_variants_wp(tag,Xt,yt,Xv,yv,vocab):
    ds_t,ds_v=prepare_tfds(Xt,yt),prepare_tfds(Xv,yv)
    res={}
    for v in variants_wp:
        name=f"{tag}_emb{v['embed_dim']}_h{v['hidden_layers']}_{v['activation']}"
        m=build_mlp(len(vocab),CONTEXT,**v)
        ck=f"/kaggle/working/{name}.h5"
        cb=[tf.keras.callbacks.ModelCheckpoint(ck,save_best_only=True,monitor='val_loss'),
            tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',factor=0.5,patience=6)]
        h=m.fit(ds_t,validation_data=ds_v,epochs=EPOCHS,verbose=1,callbacks=cb)
        m=tf.keras.models.load_model(ck)
        vl,va=m.evaluate(ds_v,verbose=0)
        res[name]={"model":m,"history":h.history,"val_loss":vl,"val_acc":va}
    return res

res_wp=train_variants_wp("WARPEACE",X_wp_train,y_wp_train,X_wp_val,y_wp_val,vocab_wp)


Epoch 1/200
[1m1006/1006[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 19ms/step - loss: 10.0426 - sparse_categorical_accuracy: 2.6470e-05 - val_loss: 10.0426 - val_sparse_categorical_accuracy: 5.2431e-05 - learning_rate: 0.0010
Epoch 2/200
[1m1006/1006[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 16ms/step - loss: 10.0426 - sparse_categorical_accuracy: 2.6623e-05 - val_loss: 10.0426 - val_sparse_categorical_accuracy: 5.2431e-05 - learning_rate: 0.0010
Epoch 3/200
[1m1006/1006[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 16ms/step - loss: 10.0426 - sparse_categorical_accuracy: 2.7264e-05 - val_loss: 10.0426 - val_sparse_categorical_accuracy: 5.2431e-05 - learning_rate: 0.0010
Epoch 4/200
[1m1006/1006[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 16ms/step - loss: 10.0426 - sparse_categorical_accuracy: 2.6385e-05 - val_loss: 10.0426 - val_sparse_categorical_accuracy: 5.2431e-05 - learning_rate: 0.0010
Epoch 5/200
[1m1006/1006[0m [32m━━━━━━━━━

## Plot Curves

In [None]:
plt.figure(figsize=(14,5))
plt.subplot(1,2,1)
for n,r in res_wp.items():
    plt.plot(r["history"]["loss"]); plt.plot(r["history"]["val_loss"],'--')
plt.title("War & Peace: Loss"); plt.xlabel("Epochs")
plt.subplot(1,2,2)
for n,r in res_wp.items():
    plt.plot([v*100 for v in r["history"]["val_sparse_categorical_accuracy"]])
plt.title("War & Peace: Val Accuracy (%)"); plt.xlabel("Epochs")
plt.show()


## t-SNE + Sample

In [None]:
best_wp=min(res_wp.items(),key=lambda x:x[1]['val_loss'])[1]['model']
emb=best_wp.layers[1].get_weights()[0]
idx=np.arange(min(200,len(vocab_wp)))
emb2=TSNE(n_components=2,perplexity=30).fit_transform(emb[idx])
plt.figure(figsize=(10,8))
plt.scatter(emb2[:,0],emb2[:,1],c=np.random.rand(len(idx),3))
for i,w in enumerate(list(vocab_wp)[:len(idx)]): plt.text(emb2[i,0],emb2[i,1],w,fontsize=8)
plt.title("t-SNE War & Peace"); plt.show()

def gen_text(model,stoi,itos,seed,n=30,temp=0.8):
    ctx=seed.split()
    for _ in range(n):
        x=np.array([[stoi.get(w,0) for w in ctx[-CONTEXT:]]])
        p=model.predict(x,verbose=0)[0]
        p=np.log(p+1e-9)/temp; p=np.exp(p)/np.sum(np.exp(p))
        nxt=np.random.choice(len(p),p=p); ctx.append(itos[nxt])
    return " ".join(ctx)

print(gen_text(best_wp,stoi_wp,itos_wp,"the prince said",30,0.8))


# Category II – Linux Code

## Load & Preprocess

In [None]:
url_lx = "https://cs.stanford.edu/people/karpathy/char-rnn/linux_input.txt"
text_lx = tf.keras.utils.get_file("linux.txt", url_lx)
with open(text_lx,'r',encoding='utf-8') as f: lx_text=f.read().lower()
lx_tokens = re.findall(r"[a-zA-Z0-9_]+|[{}();,<>!=+\-*/#]|<NL>", lx_text.replace('\n',' <NL> '))


## Vocabulary + Sequences

In [None]:
vocab_lx = sorted(set(lx_tokens))
stoi_lx = {w:i for i,w in enumerate(vocab_lx)}
itos_lx = {i:w for w,i in stoi_lx.items()}
print("Vocab size:", len(vocab_lx))
c_lx = Counter(lx_tokens)
print("Top 10:", c_lx.most_common(10))
print("Bottom 10:", list(c_lx.items())[-10:])

X_lx, y_lx = tokens_to_sequences(lx_tokens, stoi_lx, context=CONTEXT)
X_lx_train, X_lx_val, y_lx_train, y_lx_val = train_test_split(X_lx, y_lx, test_size=0.1, random_state=42)


## Train Models

In [None]:
variants_lx = [
    {"embed_dim":32,"hidden_layers":1,"hidden_units":1024,"activation":"relu"},
    {"embed_dim":64,"hidden_layers":1,"hidden_units":1024,"activation":"relu"},
    {"embed_dim":64,"hidden_layers":2,"hidden_units":1024,"activation":"tanh"},
]

def train_variants_lx(tag,Xt,yt,Xv,yv,vocab):
    ds_t,ds_v=prepare_tfds(Xt,yt),prepare_tfds(Xv,yv)
    res={}
    for v in variants_lx:
        name=f"{tag}_emb{v['embed_dim']}_h{v['hidden_layers']}_{v['activation']}"
        m=build_mlp(len(vocab),CONTEXT,**v)
        ck=f"/kaggle/working/{name}.h5"
        cb=[tf.keras.callbacks.ModelCheckpoint(ck,save_best_only=True,monitor='val_loss'),
            tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',factor=0.5,patience=6)]
        h=m.fit(ds_t,validation_data=ds_v,epochs=EPOCHS,verbose=1,callbacks=cb)
        m=tf.keras.models.load_model(ck)
        vl,va=m.evaluate(ds_v,verbose=0)
        res[name]={"model":m,"history":h.history,"val_loss":vl,"val_acc":va}
    return res

res_lx=train_variants_lx("LINUX",X_lx_train,y_lx_train,X_lx_val,y_lx_val,vocab_lx)


## Plot Curves

In [None]:
plt.figure(figsize=(14,5))
plt.subplot(1,2,1)
for n,r in res_lx.items():
    plt.plot(r["history"]["loss"]); plt.plot(r["history"]["val_loss"],'--')
plt.title("Linux Code: Loss"); plt.xlabel("Epochs")
plt.subplot(1,2,2)
for n,r in res_lx.items():
    plt.plot([v*100 for v in r["history"]["val_sparse_categorical_accuracy"]])
plt.title("Linux Code: Val Accuracy (%)"); plt.xlabel("Epochs")
plt.show()


## t-SNE + Sample

In [None]:
best_lx=min(res_lx.items(),key=lambda x:x[1]['val_loss'])[1]['model']
emb=best_lx.layers[1].get_weights()[0]
idx=np.arange(min(200,len(vocab_lx)))
emb2=TSNE(n_components=2,perplexity=30).fit_transform(emb[idx])
plt.figure(figsize=(10,8))
plt.scatter(emb2[:,0],emb2[:,1],c=np.random.rand(len(idx),3))
for i,w in enumerate(list(vocab_lx)[:len(idx)]): plt.text(emb2[i,0],emb2[i,1],w,fontsize=8)
plt.title("t-SNE Linux Code"); plt.show()

print(gen_text(best_lx,stoi_lx,itos_lx,"int main (",40,0.7))
