In [1]:
import numpy as np

In [35]:
file_data = open('data_cllm1.txt','r').read()
data = list(file_data)
vocab = set(file_data)
data_size , vocab_size = len(data) , len(vocab)
print('length of data : ',data_size)
print('Vocabulary size : ',vocab_size)
print('Vocabulary : ',vocab)

length of data :  130
Vocabulary size :  24
Vocabulary :  {'b', 'h', '.', 't', 'm', ',', 'i', 'a', 'p', 'g', ' ', 'd', 's', 'r', 'n', 'y', 'k', 'u', 'e', 'v', 'o', 'w', 'l', 'T'}


In [36]:
ix_to_chr = {ix : ch for ix,ch in enumerate(vocab)}
chr_to_ix = {ch : ix for ix,ch in enumerate(vocab)}

In [37]:
# defining the rnn model with 100 neurons in the hidden layer
hidden_neurons = 30
sequence_length = 5
# hyperparameters : Normalization
wxh = np.random.randn(hidden_neurons,vocab_size) * 0.01
whh = np.random.randn(hidden_neurons,hidden_neurons) * 0.01
why = np.random.randn(vocab_size,hidden_neurons) * 0.01
bh = np.zeros((hidden_neurons,1))
by = np.zeros((vocab_size,1))

In [38]:
def feed_and_loss(inputs,targets,h_previous):
    xs = {}
    hs = {}
    ys = {}
    ps = {}
    hs[-1] = h_previous  # initial hidden state before time stamp = 1
    # feeding the inputs
    loss = 0
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size,1))
        xs[t][inputs[t]] = 1
        hs[t] = np.tanh(np.dot(wxh,xs[t]) + np.dot(whh,hs[t-1]) + bh)
        ys[t] = np.dot(why,hs[t]) + by
        # probability distributions of the output :  softmax
        ps[t] = np.exp(ys[t])/np.sum(np.exp(ys[t]))
        # summing up losses at each time stamp
        # cross entropy loss
        loss += -np.log(ps[t][targets[t],0])

        # computing loss
    dwxh = np.zeros_like(wxh)
    dwhh = np.zeros_like(whh)
    dwhy = np.zeros_like(why)
    dbh = np.zeros_like(bh)
    dby = np.zeros_like(by)
    #losses of the future time stamps as the loss wrt hidden state also influence the future hidden states 
    dhnext = np.zeros_like(bh) 
    for t in reversed(range(len(inputs))):
        # calculating loss wrt output
        # dy = dL/dy
        dy = np.copy(ps[t])  # same as the probabilities if not target class
        dy[targets[t]] -= 1 # Loss = probability - 1 if target class  
        dwhy += np.dot(dy,hs[t].T) # dL/dwhy = dL/dy * dy/dwhy
        dby += dy                 # dL/dby = dL/dy * dy/dby
        # calculating loss wrt hidden state
        # dh(t) = dL/dy * dy/dh  + gradient wrt the future time stamps 
        dh = np.dot(why.T,dy) + dhnext
        # backpropagating in the tanh non-linearity in hidden states
        # dh(z)/dp = (1 - h(z)^2) * dz/dp  where z is the input to the hidden state
        dhraw = (1 - hs[t]**2) * dh
        dbh += dhraw
        dwhh += np.dot(dhraw,hs[t-1].T)
        dwxh += np.dot(dhraw,xs[t].T)
        # updating the gradients wrt the future hidden states
        # dL/dh(t+1)
        dhnext += np.dot(whh.T,dhraw)
    # clipping to prevent exploding gradients
    for dpara in [dwxh,dwhh,dwhy,dbh,dby]:
        np.clip(dpara,-5,5,out=dpara)
    return loss,dwxh,dwhh,dwhy,dbh,dby,hs[len(inputs)-1]  # last hidden states the sequence input
        

In [39]:
def sample(hidden_state,seed,n):
    x = np.zeros((vocab_size,1))
    x[seed] = 1
    out = [ix_to_chr[seed]]
    # generating output : 
    for t in range(n):
        hidden_state = np.tanh(np.dot(wxh,x) + np.dot(whh,hidden_state) + bh)
        y_out = np.dot(why,hidden_state) + by
        y_out_p = np.exp(y_out)/np.sum(np.exp(y_out))
        # find selection of the next character : 
        ix = np.random.choice(range(vocab_size),p = y_out_p.ravel())  # ravel is used for flatting
        x = np.zeros_like(x)
        x[ix] = 1
        out.append(ix_to_chr[ix])
    return out

In [40]:
# epochs = 1000
# epoch = 0
p = 0  # data pointer initially set to 0 since we'll move from left to right in the data for training
iteration = 0 # iteration counter
# defining memory of gradients for each hyperparametres as used in adagrad optimization : 
mwxh = np.zeros_like(wxh)
mwhh = np.zeros_like(whh)
mwhy = np.zeros_like(why)
mbh = np.zeros_like(bh)
mby = np.zeros_like(by)
smooth_loss = -np.log(1/vocab_size)*sequence_length

while True and smooth_loss >= 0.009:
    # check if we reached the end of data
    if p+sequence_length >= data_size or iteration == 0: 
        # Reset the RNN model i.e, it's hidden states
        p = 0
        h_previous = np.zeros((hidden_neurons,1))
        # epoch += 1
    # Prepare the inputs and targets
    inputs = [chr_to_ix[ch] for ch in data[p:p+sequence_length]]
    targets = [chr_to_ix[ch] for ch in data[p+1:p+sequence_length+1]]

    if iteration % 1000 == 0:
        # sample : 
        sample_ix = sample(h_previous,inputs[0],50)
        txt = ''.join(sample_ix)
        print('---\n',txt,'---\n')

    # calculate the loss after feeding the inputs to the rnn:
    loss,dwxh,dwhh,dwhy,dbh,dby,h_previous = feed_and_loss(inputs,targets,h_previous)
    # exponentially moving average of the loss
    smooth_loss = smooth_loss * 0.999 + loss*0.001
    if iteration % 1000 == 0:
        print(f'--iter--{iteration}\t\t--loss--{smooth_loss}')
    # updating the hyper parametres using adagrad optimization : 
    for para , dpara, mem_para in zip([wxh,whh,why,bh,by],[dwxh,dwhh,dwhy,dbh,dby],[mwxh,mwhh,mwhy,mbh,mby]) : 
        # updation  in memory : 
        mem_para += dpara**2
        para -= 0.1 * dpara / np.sqrt(mem_para + 1e-8) # adagrad
    p += sequence_length
    iteration += 1

---
 TTvwt.ae,ws,tm,g,yguaiovthuyyhayrhsee.aohte,taaaigh ---

--iter--0		--loss--15.890269026945385
---
 Twin le like de riamove the wi whar ih tThid w woa  ---

--iter--1000		--loss--9.47450519527242
---
 Twinkle statdeikw the starere  whake, limoee kw ae  ---

--iter--2000		--loss--4.043688710504603
---
 Twinkle little star, how ihwtnd w wwake ar, how in  ---

--iter--3000		--loss--1.6975580774825012
---
 Tamond arld so voth, like a diamond in the swprp ar ---

--iter--4000		--loss--0.7628061938531507
---
 Twinkle twikwl mowhi  i he s whyw nkle little star, ---

--iter--5000		--loss--0.6248169399951619
---
 Twinkle little star, how i what you are, up above t ---

--iter--6000		--loss--0.35524066449877695
---
 Twinkle twinkle little star, how i wonder what you  ---

--iter--7000		--loss--0.2525159016044389
---
 Twinkle twinkle like, up atar, hohh s winkle twinkl ---

--iter--8000		--loss--0.2111693600398737
---
 Twinkle twinkle twinde,rlate, uperohw i wonder ntat ---

--iter--9000		--

In [58]:
sample_ix = sample(np.zeros((hidden_neurons,1)),chr_to_ix['T'],111)
txt = ''.join(sample_ix)
print(txt)

Twinkle twinkle little star, how i wonder what you are, up above the world so high, like a diamond in the sky...
