In [1]:
import numpy as np

In [2]:
# Read the whole training corpus. The context manager closes the file handle
# as soon as the text is in memory instead of leaving it open for the rest
# of the kernel session.
with open('data_cllm1.txt','r') as corpus_file:
    file_data = corpus_file.read()
data = list(file_data)   # the corpus as a list of single characters
vocab = set(file_data)   # the distinct characters that occur in the corpus
data_size , vocab_size = len(data) , len(vocab)
print('length of data : ',data_size)
print('Vocabulary size : ',vocab_size)
print('Vocabulary : ',vocab)

length of data :  130
Vocabulary size :  24
Vocabulary :  {'e', 'p', 'u', 'd', 's', 'h', 'T', 'a', '.', 't', 'm', 'g', 'v', 'w', 'b', ',', 'l', 'k', 'i', 'y', 'n', 'r', 'o', ' '}


In [3]:
# Enumerate the vocabulary in sorted order so that every character receives
# the same index on every run; iterating the raw set of strings can yield a
# different order from one interpreter session to the next.
ordered_vocab = sorted(vocab)
ix_to_chr = dict(enumerate(ordered_vocab))
# Built as the exact inverse of ix_to_chr so the two maps cannot disagree.
chr_to_ix = {ch : ix for ix,ch in ix_to_chr.items()}

In [4]:
# Size of the recurrent layer and number of characters consumed per update.
hidden_neurons = 30
sequence_length = 5
# Trainable parameters. The weight matrices start as small Gaussian noise
# (standard normal draws scaled by 0.01); both bias vectors start at zero.
w_xh = 0.01 * np.random.randn(hidden_neurons,vocab_size)      # input  -> hidden
w_hh = 0.01 * np.random.randn(hidden_neurons,hidden_neurons)  # hidden -> hidden
w_hy = 0.01 * np.random.randn(vocab_size,hidden_neurons)      # hidden -> output
b_h = np.zeros((hidden_neurons,1))   # hidden bias, column vector
b_y = np.zeros((vocab_size,1))       # output bias, column vector

In [17]:
def feed_and_loss(inputs,targets,h_previous):
    """Run one forward and one backward pass of the character RNN.

    Reads the module-level parameters ``w_xh``, ``w_hh``, ``w_hy``, ``b_h``,
    ``b_y`` and ``vocab_size``; none of them is modified here.

    Parameters
    ----------
    inputs : sequence of int
        Character indices fed to the network, one per time step.
    targets : sequence of int
        Index of the character expected at each time step; must provide at
        least ``len(inputs)`` entries.
    h_previous : ndarray
        Hidden state that precedes the first time step (a column vector
        shaped like ``b_h``).

    Returns
    -------
    tuple
        ``(loss, dw_xh, dw_hh, dw_hy, db_h, db_y, h_last)`` where ``loss`` is
        the cross-entropy summed over all time steps, the five gradients are
        clipped element-wise to ``[-5, 5]``, and ``h_last`` is the hidden
        state after the final input (``h_previous`` itself when ``inputs`` is
        empty).
    """
    x_states = {}
    h_states = {-1: h_previous}  # hidden state before the first time step
    prob_logits = {}
    loss = 0

    # ---- forward pass ----
    for t in range(len(inputs)):
        # one-hot encoding of the current input character
        x_states[t] = np.zeros((vocab_size,1))
        x_states[t][inputs[t]] = 1
        h_states[t] = np.tanh(np.dot(w_xh,x_states[t]) + np.dot(w_hh,h_states[t-1]) + b_h)
        y_t = np.dot(w_hy,h_states[t]) + b_y
        # Softmax over the output scores. Subtracting the maximum leaves the
        # probabilities unchanged but keeps np.exp from overflowing to inf
        # (which would turn every probability into NaN).
        exp_scores = np.exp(y_t - np.max(y_t))
        prob_logits[t] = exp_scores/np.sum(exp_scores)
        # cross-entropy loss, summed over the time steps
        loss += -np.log(prob_logits[t][targets[t],0])

    # ---- backward pass (backpropagation through time) ----
    dw_xh = np.zeros_like(w_xh)
    dw_hh = np.zeros_like(w_hh)
    dw_hy = np.zeros_like(w_hy)
    db_h = np.zeros_like(b_h)
    db_y = np.zeros_like(b_y)
    # gradient reaching h(t) through h(t+1); zero after the last time step
    dhnext = np.zeros_like(b_h)
    for t in reversed(range(len(inputs))):
        # dL/dy for softmax + cross-entropy: the probabilities, minus 1 at the target class
        dy = np.copy(prob_logits[t])
        dy[targets[t]] -= 1
        dw_hy += np.dot(dy,h_states[t].T)  # dL/dw_hy = dL/dy * dy/dw_hy
        db_y += dy                         # dL/db_y  = dL/dy * dy/db_y
        # dL/dh(t) = contribution through the output + contribution through h(t+1)
        dh = np.dot(w_hy.T,dy) + dhnext
        # back through tanh: d tanh(z)/dz = 1 - tanh(z)^2
        dhraw = (1 - h_states[t]**2) * dh
        db_h += dhraw
        dw_hh += np.dot(dhraw,h_states[t-1].T)
        dw_xh += np.dot(dhraw,x_states[t].T)
        # Gradient handed to step t-1. This must REPLACE the previous value:
        # dhraw already contains everything that flowed in from later steps,
        # so accumulating here would count those steps more than once.
        dhnext = np.dot(w_hh.T,dhraw)
    # clipping to prevent exploding gradients
    for dpara in [dw_xh,dw_hh,dw_hy,db_h,db_y]:
        np.clip(dpara,-5,5,out=dpara)
    return loss,dw_xh,dw_hh,dw_hy,db_h,db_y,h_states[len(inputs)-1]  # last hidden state of the sequence
        

In [18]:
def Generate_sequence(hidden_state,seed,n):
    """Sample a sequence of characters from the network.

    Reads the module-level parameters ``w_xh``, ``w_hh``, ``w_hy``, ``b_h``,
    ``b_y`` as well as ``vocab_size`` and ``ix_to_chr``. Sampling draws from
    NumPy's global random state.

    Parameters
    ----------
    hidden_state : ndarray
        Hidden state to start from; the caller's array is not modified.
    seed : int
        Index of the first character; it is fed to the network as the first
        input and is also the first element of the result.
    n : int
        Number of characters to sample after the seed.

    Returns
    -------
    list of str
        ``n + 1`` characters: the seed character followed by the samples.
    """
    # one-hot encoding of the seed character
    x = np.zeros((vocab_size,1))
    x[seed] = 1
    out = [ix_to_chr[seed]]
    for _ in range(n):
        hidden_state = np.tanh(np.dot(w_xh,x) + np.dot(w_hh,hidden_state) + b_h)
        y_out = np.dot(w_hy,hidden_state) + b_y
        # Softmax shifted by the maximum score: same probabilities, but
        # np.exp cannot overflow and hand NaN probabilities to the sampler.
        exp_scores = np.exp(y_out - np.max(y_out))
        y_out_p = exp_scores/np.sum(exp_scores)
        # draw the next character index according to the predicted distribution
        ix = np.random.choice(vocab_size,p = y_out_p.ravel())  # ravel flattens the column vector
        # the sampled character becomes the next one-hot input
        x = np.zeros_like(x)
        x[ix] = 1
        out.append(ix_to_chr[ix])
    return out

In [19]:
# Cursor into `data` (chunks are consumed left to right) and number of
# parameter updates performed so far.
pointer = 0
iteration = 0
# Adagrad memory: one accumulator of squared gradients per parameter array.
mwxh, mwhh, mwhy = np.zeros_like(w_xh), np.zeros_like(w_hh), np.zeros_like(w_hy)
mbh, mby = np.zeros_like(b_h), np.zeros_like(b_y)
# The running loss starts at the cross-entropy of a uniform guess over one chunk.
smooth_loss = -np.log(1/vocab_size)*sequence_length

# Keep training until the smoothed loss drops below 0.009.
while smooth_loss >= 0.009:
    # On the very first pass, or when a full chunk plus its shifted targets no
    # longer fits in the data, rewind to the start and clear the hidden state.
    if iteration == 0 or pointer+sequence_length >= data_size:
        pointer = 0
        h_previous = np.zeros((hidden_neurons,1))

    # The targets are the inputs shifted one character to the right.
    chunk_end = pointer+sequence_length
    inputs = [chr_to_ix[symbol] for symbol in data[pointer:chunk_end]]
    targets = [chr_to_ix[symbol] for symbol in data[pointer+1:chunk_end+1]]

    # Every 1000 iterations, show a 50-character sample and the smoothed loss.
    report_now = iteration % 1000 == 0
    if report_now:
        sample_ix = Generate_sequence(h_previous,inputs[0],50)
        txt = ''.join(sample_ix)
        print('---\n',txt,'---\n')

    # Forward/backward pass over the chunk; the returned hidden state is
    # carried into the next chunk.
    loss,dw_xh,dw_hh,dw_hy,db_h,db_y,h_previous = feed_and_loss(inputs,targets,h_previous)
    # exponentially moving average of the loss
    smooth_loss = smooth_loss * 0.999 + loss*0.001
    if report_now:
        print(f'--iter--{iteration}\t\t--loss--{smooth_loss}')

    # Adagrad step. Both statements work in place, so the module-level
    # parameter arrays and their accumulators are what actually change.
    for para, dpara, mem_para in (
        (w_xh, dw_xh, mwxh),
        (w_hh, dw_hh, mwhh),
        (w_hy, dw_hy, mwhy),
        (b_h, db_h, mbh),
        (b_y, db_y, mby),
    ):
        mem_para += dpara**2
        para -= 0.1 * dpara / np.sqrt(mem_para + 1e-8)

    pointer += sequence_length
    iteration += 1

---
 Tnmuoggotk,m,evt,mv knpkoehwvlbTilbwlop,hrtm,vys.ah ---

--iter--0		--loss--15.890268530596474
---
 Twitwotwinwtat y................................... ---

--iter--1000		--loss--9.261540183866181
---
 Twinkle littt.ondee why............................ ---

--iter--2000		--loss--3.982216913179254
---
 Twinkle thiwhnkle little star, hoa diamond in  ite  ---

--iter--3000		--loss--1.8313867181335064
---
 Twinkle twinkle lideattbove adero high, like lditt  ---

--iter--4000		--loss--0.8772979656197974
---
 Twinkle twinkle twinkle t.inkthe lia lit yky....... ---

--iter--5000		--loss--0.49518371132665595
---
 Twinkle little st igve lherlyt sky................. ---

--iter--6000		--loss--0.3137277541019518
---
 Twinkle litle stwoow i wonder wha dore .habore t yo ---

--iter--7000		--loss--0.21469563782771633
---
 Twinkle twinkle litle star, how i sowibwon lee s... ---

--iter--8000		--loss--0.1597420472040857
---
 Twinkle twind hotle wiat you diamond in the soatwbo ---

--iter--9000		

In [21]:
# Sample 111 characters after the seed 'T', starting from an all-zero hidden state.
initial_hidden = np.zeros((hidden_neurons,1))
sample_ix = Generate_sequence(initial_hidden,chr_to_ix['T'],111)
txt = ''.join(sample_ix)
print(txt)

Twinkle twinkle little star, how i wonder what you are, up above the world so high, like a diamond in the sky...
