In [31]:
import pandas as pd
import re
import random
import numpy as np

In [6]:
# Load the poem metadata; the path is relative to the kernel's working directory.
DATA_PATH = "PoetryFoundationData.csv"
df = pd.read_csv(DATA_PATH)

In [82]:
# Number of rows (titles) in the raw dataset.
df["Title"].shape[0]

13854

In [9]:
def clean_text(text):
    """Normalise a raw title for character-level modelling.

    The text is lower-cased, every character other than ``a-z``, whitespace
    and the punctuation ``. , ! ? ; : ' " -`` is removed (this includes digits
    and all non-ASCII letters), and each run of whitespace (spaces, tabs,
    newlines) is collapsed to a single space.

    Args:
        text: The raw title. Non-string values (for example ``None`` or the
            float NaN that stands in for a missing cell) are treated as empty.

    Returns:
        The cleaned string with no leading/trailing whitespace; ``""`` when
        ``text`` is not a string.
    """
    if not isinstance(text, str):
        # `.lower()` does not exist on NaN/None; an empty title is simply
        # dropped later by the `len(seq) < 2` filter in create_sequences.
        return ""

    lowered = text.lower()
    # Keep letters, whitespace, and select punctuation
    kept = re.sub(r"[^a-z\s.,!?;:'\"-]", '', lowered)
    # Collapse any whitespace run (not just spaces) into one space
    collapsed = re.sub(r'\s+', ' ', kept)
    return collapsed.strip()


In [14]:
# Store the normalised title next to the raw one, element by element.
df['cleaned_title'] = df['Title'].map(clean_text)


In [114]:
# Pull the cleaned titles out of the frame as a plain list; `processed_titles`
# is an independent shallow copy, so editing one list leaves the other intact.
titles = list(df['cleaned_title'])
processed_titles = titles[:]


In [115]:
# Corpus used only to derive the vocabulary: one title per line, with a
# trailing newline so '\n' is always part of the character set.
all_text = '\n'.join(processed_titles) + '\n'
chars = sorted({ch for ch in all_text})
vocab_size = len(chars)

# Two-way lookup between characters and their integer ids (ids follow the
# sorted order of `chars`).
idx_to_char = dict(enumerate(chars))
char_to_idx = {ch: i for i, ch in idx_to_char.items()}



In [116]:
#encoding the text
def encode_text(text, vocab=None):
    """Translate a string into a list of integer character ids.

    Args:
        text: The string to encode (any iterable of characters works).
        vocab: Optional character -> id mapping. When omitted, the notebook's
            module-level ``char_to_idx`` is looked up at call time, which is
            the original behaviour.

    Returns:
        A list with one id per character of ``text``, in order.

    Raises:
        KeyError: If a character is missing from the mapping.
    """
    mapping = char_to_idx if vocab is None else vocab
    return [mapping[ch] for ch in text]

In [117]:
# Encode every title into its list of character ids.
encoded_titles = list(map(encode_text, processed_titles))


In [118]:
def create_sequences(data_list):
    """Build next-token (input, target) pairs from encoded sequences.

    For each sequence the input is everything but the last element and the
    target is everything but the first, so ``targets[k][t]`` is the element
    that follows ``inputs[k][t]``. Sequences shorter than two elements cannot
    form a pair and are skipped.

    Returns:
        A tuple ``(inputs, targets)`` of two equally long lists.
    """
    usable = [seq for seq in data_list if len(seq) >= 2]
    inputs = [seq[:-1] for seq in usable]   # drop the final element
    targets = [seq[1:] for seq in usable]   # drop the first element
    return inputs, targets


In [119]:
# Next-character training pairs for every title with at least two characters.
X, Y = create_sequences(data_list=encoded_titles)
print(f"Number of sequences: {len(X)}")

Number of sequences: 13823


In [120]:
def batch_generator(X, Y, batch_size, pad_token=0):
    """Yield shuffled, right-padded mini-batches forever.

    The sample order is reshuffled (with the ``random`` module) at the start
    of every pass over the data; the last batch of a pass may be smaller than
    ``batch_size``. Within a batch every sequence is padded on the right with
    ``pad_token`` up to the length of the longest *input* in that batch.
    The sequences in ``X`` and ``Y`` are not modified.

    Args:
        X: List of input sequences (each a list of ids).
        Y: List of target sequences, aligned index-by-index with ``X``.
        batch_size: Maximum number of sequences per batch; must be >= 1.
        pad_token: Id appended to short sequences.

    Yields:
        ``(x_batch_padded, y_batch_padded)`` — two lists of equal-length lists.

    Raises:
        ValueError: On the first ``next()`` if ``X`` is empty, ``X`` and ``Y``
            differ in length, or ``batch_size`` is not positive. Without
            these checks an empty ``X`` (or a negative ``batch_size``) makes
            the endless loop spin without ever yielding.
    """
    n = len(X)
    if n == 0:
        raise ValueError("batch_generator requires at least one sequence; got an empty X")
    if len(Y) != n:
        raise ValueError(f"X and Y must have the same length (got {n} and {len(Y)})")
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer (got {batch_size})")

    indices = list(range(n))
    while True:
        random.shuffle(indices)
        for start in range(0, n, batch_size):
            batch_idx = indices[start:start + batch_size]
            x_batch = [X[i] for i in batch_idx]
            y_batch = [Y[i] for i in batch_idx]

            # Pad only up to the longest input of this batch, not of the dataset.
            max_len = max(len(seq) for seq in x_batch)

            x_batch_padded = [seq + [pad_token] * (max_len - len(seq)) for seq in x_batch]
            y_batch_padded = [seq + [pad_token] * (max_len - len(seq)) for seq in y_batch]

            yield x_batch_padded, y_batch_padded


In [121]:
class RNN:
    """Character-level vanilla RNN (tanh hidden layer, softmax output) in NumPy.

    One sequence is processed at a time: ``forward`` caches the activations,
    ``lossandgrads`` back-propagates through time over that cache, and
    ``update_params`` applies a plain SGD step. ``h_prev`` holds the hidden
    state that the next ``forward``/``sample`` call starts from; callers that
    treat sequences as independent must reset it themselves.
    """

    def __init__(self, vocab_size,hidden_size=512):
        """Create randomly initialised weights.

        Args:
            vocab_size: Number of distinct token ids (size of the one-hot input
                and of the softmax output).
            hidden_size: Width of the recurrent hidden state.
        """
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

        # He-style scaling: standard-normal draws times sqrt(2 / fan_in).
        self.Wxh = np.random.randn(hidden_size, vocab_size) * np.sqrt(2.0/vocab_size)
        self.Whh = np.random.randn(hidden_size, hidden_size) * np.sqrt(2.0/hidden_size)
        self.Why = np.random.randn(vocab_size, hidden_size) * np.sqrt(2.0/hidden_size)

        self.bh = np.zeros((hidden_size, 1))
        self.by = np.zeros((vocab_size, 1))

        # Hidden state carried from one forward()/sample() call to the next.
        self.h_prev = np.zeros((hidden_size, 1))

    def one_hot_encode(self, idx):
        """Convert index to one-hot encoded vector."""
        x = np.zeros((self.vocab_size, 1))
        x[idx] = 1
        return x

    def forward(self, inputs):
        """Run the network over ``inputs`` starting from ``self.h_prev``.

        Args:
            inputs: Sequence of token ids.

        Returns:
            ``(xs, hs, ys, ps)`` — dicts keyed by timestep holding the one-hot
            inputs, hidden states (``hs[-1]`` is the initial state), logits and
            softmax probabilities. The same tuple is stored in
            ``self.last_cache`` and ``self.h_prev`` is advanced to the final
            hidden state.
        """
        xs, hs, ys, ps = {}, {}, {}, {}
        hs[-1] = np.copy(self.h_prev)
        for t in range(len(inputs)):
            xs[t] = self.one_hot_encode(inputs[t])
            hs[t] = np.tanh(np.dot(self.Wxh, xs[t]) + np.dot(self.Whh, hs[t-1]) + self.bh)
            ys[t] = np.dot(self.Why, hs[t]) + self.by
            # Subtract the max logit before exponentiating to avoid overflow.
            e_y = np.exp(ys[t] - np.max(ys[t]))
            ps[t] = e_y / np.sum(e_y)

        self.last_cache = (xs, hs, ys, ps)
        # For empty input this is hs[-1], i.e. the state is left unchanged.
        self.h_prev = hs[len(inputs) - 1]

        return xs, hs, ys, ps

    def sample(self, seed_idx, length, temperature=0.8):
        """Generate ``length`` token ids following ``seed_idx``.

        Sampling starts from a copy of ``self.h_prev`` (so it continues from
        whatever sequence was processed last) and does not modify it. The seed
        itself is not included in the returned list.

        Args:
            seed_idx: Id of the first input token.
            length: Number of ids to generate.
            temperature: Logits are divided by this value before the softmax;
                values below 1 sharpen the distribution, above 1 flatten it.

        Returns:
            List of sampled ids.

        Raises:
            ValueError: If ``temperature`` is not positive.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be positive (got {temperature})")

        x = self.one_hot_encode(seed_idx)
        h = np.copy(self.h_prev)
        output = []
        for _ in range(length):
            h = np.tanh(np.dot(self.Wxh, x) + np.dot(self.Whh, h) + self.bh)
            y = np.dot(self.Why, h) + self.by
            y = y / temperature  # Apply temperature
            e_y = np.exp(y - np.max(y))
            p = e_y / np.sum(e_y)
            idx = np.random.choice(range(self.vocab_size), p=p.ravel())
            output.append(idx)
            x = self.one_hot_encode(idx)
        return output

    def lossandgrads(self, targets, pad_token=0):
        """Cross-entropy loss and BPTT gradients for the last ``forward`` call.

        Timesteps whose target equals ``pad_token`` contribute neither loss nor
        an output-layer gradient. They are still part of the recurrent chain,
        so the gradient flowing back from later timesteps is propagated through
        them; trailing padding (nothing to propagate yet) is skipped outright.

        Args:
            targets: Target id per timestep of the cached forward pass.
            pad_token: Target id to mask out.

        Returns:
            Mean loss over the non-masked timesteps (0.0 if there are none).
            The summed, element-wise clipped gradients are stored in
            ``self.grads`` as ``(dWxh, dWhh, dWhy, dbh, dby)``.
        """
        xs, hs, ys, ps = self.last_cache
        loss = 0.0
        dWxh, dWhh, dWhy = np.zeros_like(self.Wxh), np.zeros_like(self.Whh), np.zeros_like(self.Why)
        dbh, dby = np.zeros_like(self.bh), np.zeros_like(self.by)
        # Shaped like a hidden state; built from bh so an empty cache is fine.
        dh_next = np.zeros_like(self.bh)

        valid_timesteps = 0  # count non-padding tokens

        for t in reversed(range(len(xs))):
            if targets[t] == pad_token:
                if not dh_next.any():
                    continue  # trailing padding: no gradient to carry back
                # Masked step in the middle: only the recurrent path matters.
                dh = dh_next
            else:
                valid_timesteps += 1
                loss += -np.log(ps[t][targets[t], 0] + 1e-9)  # add epsilon to avoid log(0)

                # d(loss)/d(logits) for softmax + cross-entropy is p - one_hot.
                dy = np.copy(ps[t])
                dy[targets[t]] -= 1

                dWhy += np.dot(dy, hs[t].T)
                dby += dy
                dh = np.dot(self.Why.T, dy) + dh_next

            dh_raw = (1 - hs[t] * hs[t]) * dh  # back through tanh
            dbh += dh_raw
            dWxh += np.dot(dh_raw, xs[t].T)
            dWhh += np.dot(dh_raw, hs[t - 1].T)
            dh_next = np.dot(self.Whh.T, dh_raw)

        # Clip gradients element-wise to [-1, 1]
        for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
            np.clip(dparam, -1, 1, out=dparam)

        self.grads = (dWxh, dWhh, dWhy, dbh, dby)

        # Normalize loss by valid (non-padding) timesteps
        return loss / max(valid_timesteps, 1)

    def update_params(self, learning_rate=1e-3):
        """Apply one in-place SGD step using the gradients in ``self.grads``."""
        dWxh, dWhh, dWhy, dbh, dby = self.grads
        for param, dparam in zip([self.Wxh, self.Whh, self.Why, self.bh, self.by],[dWxh, dWhh, dWhy, dbh, dby]):
            param -= learning_rate * dparam

    def train_step(self, inputs, targets, learning_rate=1e-3):
        """Forward pass, loss/gradient computation and SGD update for one sequence.

        Returns:
            The mean loss reported by ``lossandgrads`` (computed before the update).
        """
        self.forward(inputs)
        loss = self.lossandgrads(targets)
        self.update_params(learning_rate)
        return loss


In [122]:
# Instantiate the baseline RNN with a 1024-unit hidden state (weights are random).
model = RNN(vocab_size=vocab_size, hidden_size=1024)

In [123]:
# Sanity check before training: sample from the freshly initialised model.
# The printed text is what follows the seed; the seed itself is not included.
seed_char = "t"
sample_length = 30
seed_idx = char_to_idx[seed_char]
sampled_indices = model.sample(seed_idx, sample_length)
sampled_text = ''.join(map(idx_to_char.__getitem__, sampled_indices))
print(f"Sampled text starting with '{seed_char}': {sampled_text}")

Sampled text starting with 't': ''wen,xy
c eczumzw.-q;"px;rr:.


In [124]:
# Plain-SGD training. Each iteration draws ONE shuffled mini-batch and performs
# one weight update per sequence in it, so "epoch" here counts batches rather
# than full passes over the data.
batch_gen = batch_generator(X, Y, 32)
for epoch in range(10000):
    x_batch, y_batch = next(batch_gen)

    total_loss = 0
    for x, y in zip(x_batch, y_batch):
        # Every title starts from a zero hidden state.
        model.h_prev = np.zeros((model.hidden_size, 1))
        loss = model.train_step(x, y, learning_rate=1e-5)
        total_loss += loss

    # Report (and sample) only every 10th iteration.
    if epoch % 10 != 0:
        continue

    avg_loss = total_loss / len(x_batch)
    print(f"Epoch {epoch}, Loss: {avg_loss:.6f}")  # More precision

    # Sample with different seeds; seeds missing from the vocabulary are skipped.
    for seed_char in ['<', 't', 'a']:
        if seed_char not in char_to_idx:
            continue
        seed_idx = char_to_idx[seed_char]
        sample_idxs = model.sample(seed_idx, 50, temperature=0.5)
        generated = ''.join(idx_to_char[i] for i in sample_idxs)
        print(f"Seed '{seed_char}': {generated}")


Epoch 0, Loss: 3.773973
Seed 't': tbdra'
trv'fbvs"ts"' d"yb katpskitua:.oqdkv,nlw?z!
Seed 'a': zbhg:yxttdna
r,-;uuyrcrygw!vmghbrde'l-lwr.ro?o qtg
Epoch 10, Loss: 3.789886
Seed 't': 't-ttem'azkm at;upriimqczdtt'.lbn d ru .  'wh!odve
Seed 'a': byzmpttz;ut'nw
rtmtmaaazyhvtiuma"wqfvnr"x kn
!gl!w
Epoch 20, Loss: 3.652901
Seed 't': g-etgmsdpv';?aroxo .;q?? ej.ltct:iiyavox q.ccj is?
Seed 'a': "bcth.-r eam t""tj 
jbqkr?ywgwqzlr!a !no!s?o f.!mi
Epoch 30, Loss: 3.511940
Seed 't': mr"or j.eie!as  a  v.  e  io ee  ?s ehp  e?s s  ke
Seed 'a': zrmzb o,kd r  oe jo'e sie eoe m
  b e he          
Epoch 40, Loss: 3.394418
Seed 't':  m tr  e e e oshs se  a   eeebs          e e   e t
Seed 'a':   e  auo e lu a erateshti e sst    ae eanba  mhqoo
Epoch 50, Loss: 3.244720
Seed 't': ebo iurtiie eern tie,ie oeos e eera seh  seatcs or
Seed 'a': beete m bee e o s  n  enltlm mi  e aoet  rse  itcl
Epoch 60, Loss: 3.163949
Seed 't':  o e  to eioed hro  soeoaree cod on rotie  ir  ie 
Seed 'a': iiono  inern er  iaer t

KeyboardInterrupt: 

In [None]:
# Preview the first 20 cleaned titles.
df["cleaned_title"].iloc[:20]

0       objects used to prop open a window
1                           the new church
2                              look for me
3                                wild life
4                                 umbrella
5                                   sunday
6                           invisible fish
7             dont bother the earth spirit
8      the one thing that can save america
9     "hour in which i consider hydrangea"
10                                   stung
11                     nothing but good...
12                               how quiet
13                               porcupine
14                           summer apples
15               visiting the neighborhood
16                                   scars
17                        what remains two
18                          west of myself
19                                     yes
Name: cleaned_title, dtype: object

In [74]:
import numpy as np
import random

class ImprovedRNN:
    """Character-level vanilla RNN in NumPy, trained with the Adam optimiser.

    Works on one sequence at a time: ``forward`` caches activations,
    ``lossandgrads`` back-propagates through time over that cache, and
    ``update_params_adam`` applies the update. ``h_prev`` is the hidden state
    the next ``forward``/``sample`` call starts from; it is only reset if the
    caller does so.
    """

    def __init__(self, vocab_size, seq_length, hidden_size=128):
        """Create randomly initialised weights and zeroed Adam moment buffers.

        Args:
            vocab_size: Number of distinct token ids.
            seq_length: Stored on the instance as ``seq_length``; no method of
                this class reads it.
            hidden_size: Width of the recurrent hidden state.
        """
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.seq_length = seq_length

        # He-style initialisation: standard-normal draws scaled by
        # sqrt(2 / fan_in). (This is not the Xavier/Glorot formula.)
        self.Wxh = np.random.randn(hidden_size, vocab_size) * np.sqrt(2.0 / vocab_size)
        self.Whh = np.random.randn(hidden_size, hidden_size) * np.sqrt(2.0 / hidden_size)
        self.Why = np.random.randn(vocab_size, hidden_size) * np.sqrt(2.0 / hidden_size)
        self.bh = np.zeros((hidden_size, 1))
        self.by = np.zeros((vocab_size, 1))
        self.h_prev = np.zeros((hidden_size, 1))

        # Adam first-moment (m*) and second-moment (v*) estimates, one per parameter.
        self.mWxh, self.mWhh, self.mWhy = np.zeros_like(self.Wxh), np.zeros_like(self.Whh), np.zeros_like(self.Why)
        self.mbh, self.mby = np.zeros_like(self.bh), np.zeros_like(self.by)
        self.vWxh, self.vWhh, self.vWhy = np.zeros_like(self.Wxh), np.zeros_like(self.Whh), np.zeros_like(self.Why)
        self.vbh, self.vby = np.zeros_like(self.bh), np.zeros_like(self.by)

    def one_hot_encode(self, idx):
        """Return a ``(vocab_size, 1)`` column vector with a 1 at ``idx``."""
        x = np.zeros((self.vocab_size, 1))
        x[idx] = 1
        return x

    def forward(self, inputs):
        """Run the network over ``inputs`` starting from ``self.h_prev``.

        Returns:
            ``(xs, hs, ys, ps)`` — dicts keyed by timestep with the one-hot
            inputs, hidden states (``hs[-1]`` is the initial state), logits and
            softmax probabilities. The tuple is also kept in
            ``self.last_cache`` and ``self.h_prev`` becomes the last hidden state.
        """
        xs, hs, ys, ps = {}, {}, {}, {}
        hs[-1] = np.copy(self.h_prev)

        for t in range(len(inputs)):
            xs[t] = self.one_hot_encode(inputs[t])
            hs[t] = np.tanh(np.dot(self.Wxh, xs[t]) + np.dot(self.Whh, hs[t-1]) + self.bh)
            ys[t] = np.dot(self.Why, hs[t]) + self.by
            ps[t] = self.softmax(ys[t])

        self.last_cache = (xs, hs, ys, ps)
        # For empty input this is hs[-1], i.e. the state is left unchanged.
        self.h_prev = hs[len(inputs) - 1]
        return xs, hs, ys, ps

    def softmax(self, x):
        """Softmax over all entries of ``x``, shifted by the max for stability."""
        exp_x = np.exp(x - np.max(x))
        return exp_x / np.sum(exp_x)

    def sample(self, seed_idx, length, temperature=1.0):
        """Sample ``length`` token ids following ``seed_idx``.

        Starts from a copy of ``self.h_prev`` and leaves it untouched; the seed
        is not part of the returned list. Logits are divided by ``temperature``
        before the softmax.

        Raises:
            ValueError: If ``temperature`` is not positive.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be positive (got {temperature})")

        x = self.one_hot_encode(seed_idx)
        h = np.copy(self.h_prev)
        output = []

        for _ in range(length):
            h = np.tanh(np.dot(self.Wxh, x) + np.dot(self.Whh, h) + self.bh)
            y = np.dot(self.Why, h) + self.by

            # Apply temperature
            y = y / temperature
            p = self.softmax(y)

            idx = np.random.choice(range(self.vocab_size), p=p.ravel())
            output.append(idx)
            x = self.one_hot_encode(idx)

        return output

    def lossandgrads(self, targets, pad_token=None):
        """Cross-entropy loss and BPTT gradients for the last ``forward`` call.

        Args:
            targets: Target id per timestep of the cached forward pass.
            pad_token: Optional target id to mask. With the default ``None``
                every timestep counts (the original behaviour). When given,
                timesteps whose target equals it add no loss and no
                output-layer gradient, but gradient from later timesteps is
                still propagated through them.

        Returns:
            The loss summed (not averaged) over the counted timesteps; 0.0 for
            an empty cache. Element-wise clipped gradients are stored in
            ``self.grads`` as ``(dWxh, dWhh, dWhy, dbh, dby)``.
        """
        xs, hs, ys, ps = self.last_cache
        loss = 0.0

        dWxh, dWhh, dWhy = np.zeros_like(self.Wxh), np.zeros_like(self.Whh), np.zeros_like(self.Why)
        dbh, dby = np.zeros_like(self.bh), np.zeros_like(self.by)
        # Shaped like a hidden state; built from bh so an empty cache is fine.
        dh_next = np.zeros_like(self.bh)

        for t in reversed(range(len(xs))):
            masked = pad_token is not None and targets[t] == pad_token
            if masked:
                if not dh_next.any():
                    continue  # trailing padding: no gradient to carry back
                dh = dh_next
            else:
                loss += -np.log(ps[t][targets[t], 0] + 1e-8)  # Add small epsilon for numerical stability
                # d(loss)/d(logits) for softmax + cross-entropy is p - one_hot.
                dy = np.copy(ps[t])
                dy[targets[t]] -= 1

                dWhy += np.dot(dy, hs[t].T)
                dby += dy

                dh = np.dot(self.Why.T, dy) + dh_next

            dh_raw = (1 - hs[t] * hs[t]) * dh  # back through tanh
            dbh += dh_raw

            dWxh += np.dot(dh_raw, xs[t].T)
            dWhh += np.dot(dh_raw, hs[t - 1].T)
            dh_next = np.dot(self.Whh.T, dh_raw)

        # Gradient clipping (element-wise, to [-5, 5])
        for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
            np.clip(dparam, -5, 5, out=dparam)

        self.grads = (dWxh, dWhh, dWhy, dbh, dby)
        return loss

    def update_params_adam(self, learning_rate=1e-3, beta1=0.9, beta2=0.999, epsilon=1e-8, t=1):
        """Apply one in-place Adam update using the gradients in ``self.grads``.

        Args:
            learning_rate: Step size.
            beta1: Decay rate of the first-moment estimate.
            beta2: Decay rate of the second-moment estimate.
            epsilon: Added to the denominator to avoid division by zero.
            t: 1-based count of updates performed so far, used for bias
                correction.

        Raises:
            ValueError: If ``t`` is smaller than 1 (the bias-correction
                denominators ``1 - beta ** t`` would be zero at ``t == 0``).
        """
        if t < 1:
            raise ValueError(f"t must be >= 1 for Adam bias correction (got {t})")

        dWxh, dWhh, dWhy, dbh, dby = self.grads

        params = [self.Wxh, self.Whh, self.Why, self.bh, self.by]
        grads = [dWxh, dWhh, dWhy, dbh, dby]
        m_params = [self.mWxh, self.mWhh, self.mWhy, self.mbh, self.mby]
        v_params = [self.vWxh, self.vWhh, self.vWhy, self.vbh, self.vby]

        for param, grad, m, v in zip(params, grads, m_params, v_params):
            # In-place updates so the buffers stored on self are modified.
            m *= beta1
            m += (1 - beta1) * grad
            v *= beta2
            v += (1 - beta2) * (grad ** 2)

            m_corrected = m / (1 - beta1 ** t)
            v_corrected = v / (1 - beta2 ** t)

            param -= learning_rate * m_corrected / (np.sqrt(v_corrected) + epsilon)

    def train_step(self, inputs, targets, learning_rate=1e-3, t=1, pad_token=None):
        """Forward pass, loss/gradients and one Adam update for one sequence.

        Args:
            inputs: Input token ids.
            targets: Target token ids, one per input.
            learning_rate: Adam step size.
            t: 1-based global update counter for Adam's bias correction.
            pad_token: Forwarded to ``lossandgrads``; ``None`` disables masking.

        Returns:
            The summed loss from ``lossandgrads`` (computed before the update).
        """
        self.forward(inputs)
        loss = self.lossandgrads(targets, pad_token=pad_token)
        self.update_params_adam(learning_rate, t=t)
        return loss

# Better training loop with learning rate scheduling and monitoring
def train_improved_rnn(model, X, Y, batch_size=32, epochs=100, initial_lr=1e-3):
    """Train ``model`` on (X, Y) with Adam, step-wise LR decay and periodic sampling.

    Each of the ``epochs`` iterations draws ONE shuffled mini-batch from
    ``batch_generator`` (so an "epoch" here is a batch, not a full pass over
    the data) and performs one optimiser update per sequence in it. The
    learning rate is multiplied by 0.95 every 10 iterations. Every 5th
    iteration the mean per-sequence loss and two 100-character samples
    (temperature 0.5 and 1.0, seeded with 't') are printed.

    Args:
        model: Object exposing ``h_prev``, ``train_step(x, y, learning_rate=, t=)``
            and ``sample(seed_idx, length, temperature=)``.
        X: List of input id sequences.
        Y: List of target id sequences aligned with ``X``.
        batch_size: Sequences per mini-batch.
        epochs: Number of mini-batches to train on.
        initial_lr: Learning rate before decay.

    Returns:
        None. Reads the notebook-level ``char_to_idx`` / ``idx_to_char`` for
        sampling.
    """
    batch_gen = batch_generator(X, Y, batch_size)
    step = 0  # global update counter, feeds Adam's bias correction

    # (header printed before the sample, sampling temperature)
    sampling_plan = (
        ("Temperature 0.5 (conservative):", 0.5),
        ("\nTemperature 1.0 (balanced):", 1.0),
    )

    for epoch in range(epochs):
        x_batch, y_batch = next(batch_gen)
        total_loss = 0

        # Learning rate decay
        lr = initial_lr * (0.95 ** (epoch // 10))

        for x, y in zip(x_batch, y_batch):
            step += 1
            # Titles are independent sequences: start each one from a zero
            # hidden state instead of the state left behind by the previous
            # (unrelated, padded) title.
            model.h_prev = np.zeros_like(model.h_prev)
            loss = model.train_step(x, y, learning_rate=lr, t=step)
            total_loss += loss

        # More frequent monitoring
        if epoch % 5 != 0:
            continue

        avg_loss = total_loss / len(x_batch)
        print(f"Epoch {epoch}, Loss: {avg_loss:.4f}, LR: {lr:.6f}")

        # Sample with different temperatures
        seed_idx = char_to_idx.get('t', 0)  # Fallback to 0 if 't' not found

        for header, temperature in sampling_plan:
            print(header)
            sample_idxs = model.sample(seed_idx, 100, temperature=temperature)
            generated = ''.join(idx_to_char[i] for i in sample_idxs)
            print(generated[:200])  # at most the first 200 chars

        print("-" * 60)

# Usage
# model = ImprovedRNN(vocab_size=len(char_to_idx), seq_length=100, hidden_size=256)  # Larger hidden size
# train_improved_rnn(model, X, Y, batch_size=32, epochs=200, initial_lr=1e-3)

In [76]:
# `seq_length` is not defined in any visible cell of this notebook, so the
# original call fails with NameError on a fresh kernel. ImprovedRNN only stores
# the value, so use the length of the longest training sequence.
max_seq_length = max(len(seq) for seq in X)
new_model = ImprovedRNN(vocab_size, max_seq_length, hidden_size=256)  # Larger hidden size
train_improved_rnn(new_model, X, Y, batch_size=32, epochs=2000, initial_lr=0.01)


Epoch 0, Loss: 422.1057, LR: 0.010000
Temperature 0.5 (conservative):
q?', tux";!i :.u"?'dmb.b;?j: :c""?jf cy.zr
oeug.";is b.."?!!m:g.q?iombu.q;j! ka."?j! :u."?zzweu."?!'

Temperature 1.0 (balanced):
a?z:deu??sjjg!-mo?'itlbux-?ombu.;mi?olak"
zotbd"";cnmbu."qnormqa?z,'ltx,-?j, bu.-q!jptdc;dvom,spa?zj
------------------------------------------------------------
Epoch 5, Loss: 325.2590, LR: 0.010000
Temperature 0.5 (conservative):
eeeonshheigirseategeldnato o s

tneopstf io mshhsluo 
ht eeoaettt nir f ig m thhianrssd
tre  hhttaor

Temperature 1.0 (balanced):
'or awu-ir m 
w
niela hy
gq  bfxw?eotethanoanjc-ht anilarr e e 
ygte hoho nopdpjfm be aroonersc wcel
------------------------------------------------------------
Epoch 10, Loss: 315.2380, LR: 0.009500
Temperature 0.5 (conservative):
stte
agmartti opdt e
on sde 
o bst
ere aitre
s fptt  egrss e
osfet
et ens o , efntherogf t irondatte

Temperature 1.0 (balanced):
ath tyu sehorlonfl
lhtcost et orpidolasiit ane pnsgesolf r weosus
g
e c cud

KeyboardInterrupt: 

In [83]:
# Mean number of characters per cleaned title.
title_lengths = df['cleaned_title'].str.len()
avg_title_length = title_lengths.mean()
print(f"Average title length: {avg_title_length}")

Average title length: 19.56691208315288
