In [3]:
"""
Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy), with minor adjustments
(to make it more Python 3.6-like) made by me.
BSD License
"""
import numpy as np
from pathlib import Path

# data I/O
DATA_PATH = Path('data')
# The corpus should be a simple plain text file. read_text() opens, reads and
# closes the file, so no handle is left dangling.
# NOTE(review): assumes the corpus is UTF-8 encoded -- confirm for other data sets.
data = (DATA_PATH / 'tode_oigus.txt').read_text(encoding='utf-8')
# Sort the vocabulary: iteration order of a set of strings can differ between
# interpreter runs, which would make the char <-> index mapping unstable.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print(f'data has {data_size} characters, {vocab_size} unique.')
char_to_ix = {ch: i for i, ch in enumerate(chars)}  # character -> integer index
ix_to_char = dict(enumerate(chars))  # integer index -> character

# hyperparameters
hidden_size = 100    # number of neurons in the hidden layer
seq_length = 25      # how many time steps the RNN is unrolled for per update
learning_rate = 0.1  # step size used by the parameter update

# model parameters: weights start as small Gaussian noise, biases start at zero
# (the three weight matrices are drawn in this order: Wxh, Whh, Why)
Wxh = 0.01 * np.random.randn(hidden_size, vocab_size)   # input -> hidden weights
Whh = 0.01 * np.random.randn(hidden_size, hidden_size)  # hidden -> hidden (recurrent) weights
Why = 0.01 * np.random.randn(vocab_size, hidden_size)   # hidden -> output weights
bh = np.zeros(shape=(hidden_size, 1))  # bias added to the hidden pre-activation
by = np.zeros(shape=(vocab_size, 1))   # bias added to the output scores


def lossFun(inputs, targets, hprev):
    """
    Run one forward and backward pass of the RNN over a single sequence.

    inputs, targets are both lists of integers (character indices) of equal
    length; targets[t] is the index that should follow inputs[t].
    hprev is the Hx1 array holding the initial hidden state; it is copied,
    not modified.

    Uses the module-level parameters Wxh, Whh, Why, bh, by and vocab_size.

    Returns (loss, dWxh, dWhh, dWhy, dbh, dby, hlast): the cross-entropy loss
    summed over the sequence, the gradients on the model parameters (each
    clipped elementwise to [-5, 5]) and the last hidden state. For an empty
    sequence the loss is 0, the gradients are all zero and hlast is a copy
    of hprev.
    """
    xs, hs, ps = {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0
    # forward pass
    for t, ix in enumerate(inputs):
        xs[t] = np.zeros((vocab_size, 1))  # encode in 1-of-k representation
        xs[t][ix] = 1
        hs[t] = np.tanh((Wxh @ xs[t]) + (Whh @ hs[t - 1]) + bh)  # hidden state
        y = (Why @ hs[t]) + by  # unnormalized log probabilities for next chars
        # Softmax is unchanged by subtracting a constant from every score;
        # shifting by the max keeps np.exp from overflowing to inf (-> NaN).
        exp_y = np.exp(y - np.max(y))
        ps[t] = exp_y / np.sum(exp_y)  # probabilities for next chars
        loss += -np.log(ps[t][targets[t], 0])  # softmax (cross-entropy loss)
    # backward pass: compute gradients going backwards
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    # hs[-1] always exists (hs[0] does not when the sequence is empty)
    dhnext = np.zeros_like(hs[-1])
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1  # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
        dWhy += dy @ hs[t].T
        dby += dy
        dh = (Why.T @ dy) + dhnext  # backprop into h
        dhraw = (1 - hs[t] * hs[t]) * dh  # backprop through tanh nonlinearity
        dbh += dhraw
        dWxh += dhraw @ xs[t].T
        dWhh += dhraw @ hs[t - 1].T
        dhnext = Whh.T @ dhraw  # gradient flowing into the previous time step
    for dparam in (dWxh, dWhh, dWhy, dbh, dby):
        np.clip(dparam, -5, 5, out=dparam)  # clip to mitigate exploding gradients
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs) - 1]


def sample(h, seed_ix, n):
    """
    Sample a sequence of n integers (character indices) from the model.

    h is the Hx1 memory (hidden) state to start from; it is rebound locally,
    so the caller's array is not modified.
    seed_ix is the index of the seed letter fed in at the first time step.
    n is the number of indices to generate.

    Uses the module-level parameters Wxh, Whh, Why, bh, by and vocab_size,
    and draws from NumPy's global random state.

    Returns the list of n sampled indices (empty when n is 0).
    """
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    for _ in range(n):
        h = np.tanh(Wxh @ x + Whh @ h + bh)
        y = Why @ h + by
        # Shift the scores by their max before exponentiating: the softmax is
        # unchanged, but np.exp can no longer overflow and yield NaN
        # probabilities (which np.random.choice rejects).
        exp_y = np.exp(y - np.max(y))
        p = exp_y / np.sum(exp_y)
        ix = np.random.choice(vocab_size, p=p.ravel())
        # feed the sampled character back in as the next one-hot input
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes


# Training loop: sweep over the data in chunks of seq_length characters and
# update the parameters with Adagrad after every chunk.
n, p = 0, 0  # n: iteration counter, p: data pointer (start of the current chunk)
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)  # memory variables for Adagrad
smooth_loss = -np.log(1.0 / vocab_size) * seq_length  # loss at iteration 0

iters = 100000
for _ in range(iters):  # `_` instead of `iter`, which would shadow the builtin
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    if p + seq_length + 1 >= len(data) or n == 0:
        hprev = np.zeros((hidden_size, 1))  # reset RNN memory
        p = 0  # go from start of data
    inputs = [char_to_ix[ch] for ch in data[p:p + seq_length]]
    # targets are the inputs shifted one character to the right
    targets = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]

    report = n % 10000 == 0  # report progress now and then

    # sample from the model (using the state before this chunk is processed)
    if report:
        sample_ix = sample(hprev, inputs[0], 200)
        txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        print(f'----\n {txt} \n----')

    # forward seq_length characters through the net and fetch gradient
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001  # exponential moving average
    if report:
        print(f'iter {n}, loss: {smooth_loss}')

    # perform parameter update with Adagrad; the in-place `+=` is essential,
    # because `param` and `mem` are the module-level arrays themselves
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                  [dWxh, dWhh, dWhy, dbh, dby],
                                  [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update

    p += seq_length  # move data pointer
    n += 1  # iteration counter

data has 13868 characters, 59 unique.
----
 P–trpAL)!!E)
iSnigAüj![-Ln.iõU,:Mte!um––dõ-]iaH(mÜUNu1r„!Js: õäEm)ÄVH!KV
…o
ÜlEg:l–üHaS:-.UiöknvÜNIglMLÄJPölm“b1Av;üEOPÜOöE,„öT1h)
TvVÄs ?
HT;Õ…[;(ElhvLh(!.gÄj
norgÄ)NÜUg1N1äU1JmbhtO;sPdNõvtNdrah[aN!Õ 
----
iter 0, loss: 101.93843413844186
----
 eleld ja ltine paadas omuve olis poope maikpie: üonsis elega moja õrvik ta päres koüvg. ridil õiku, na väemeed aada. Nolel veida ja tulivena saine milta, vehmaret tuina.“

Vaekum hoel lepvadarõsti veg 
----
iter 10000, loss: 52.503336684034025
----
 ud – mau täisest ind põllu peaes. Jõlt onke mäga, pela, ela ävid õädal pealis koli jutta, kõi umavarvigi koju päkarel Jii vahtaest. Siis estaks neistis askama sena sii tuid, vätuvad mänd nüüdanud ütna 
----
iter 20000, loss: 47.200038578298724
----
 tebiramad kätjapäga sune tuid.

„Aga õuja ja vargu oksud sõub püünm ikigemääe sedilt möödid olte ja oli emas karjatat vastulde ja ju henamma. Soom kasstud õikus mene sadas nagoks olne põlbaks vaalaksa 
----
ite