# RNN from scratch

In [1]:
import glob
import torch
from importlib import reload

%load_ext autoreload
%autoreload 2
from utils.files import *

%load_ext autoreload
%autoreload 2
from src.RNN import RNN

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Data preparation

In [2]:
# Sort the matches: glob returns paths in an arbitrary, filesystem-dependent
# order, so without sorting, positional lookups such as corpus[1] (and
# sequences[0] further down) can refer to different articles on different machines.
article_paths = sorted(glob.glob("data/articles/*.txt"))

# read_file is presumably provided by the `utils.files` star import and
# presumably returns the file contents as a str -- TODO confirm.
corpus = [
    read_file(article_path) for article_path in article_paths
]

# Wikipedia loves non-ASCII characters; the vocab would skyrocket if it were not
# limited to ASCII. Characters are also lowercased, which shrinks the vocab further.
corpus = [
    ''.join(c.lower() for c in article if ord(c) < 128) for article in corpus
]
print(corpus[1])
print("number of articles:", len(corpus))

the railroad commission cases, 116 u.s. 307 (1886), is a united states supreme court case concerning the power of states to set transportation charges of railroad companies. the court held that the fixing of freight and passenger rates in railroad transportation was a permissible exercise of state police power. in 1884, the legislature of mississippi passed a statute which established a state commission with the power to impose transportation rates on private railroad companies. the companies had signed a charter contract with the state, authorizing them to set their own rates. the farmers' loan & trust company, a new york corporation, brought suit against the commission on behalf of mobile & ohio railroad company, to enjoin enforcement of the statute. in upholding the statute, the court held that the state charter did not divest the state of the ultimate power to determine reasonable rates. in stone v. mississippi (1879), the court had said that rate regulation fell outside of the sta

In [3]:
# Concatenate every article into a single string, then collect its distinct
# characters in sorted order to form the character vocabulary.
full_text = ''.join(corpus)
unique_characters = sorted(set(full_text))

In [4]:
# Pick an end-of-sequence token and make sure it does not collide with a
# character that actually occurs in the corpus.
eos = "<eos>"
# Test against the corpus characters rather than `unique_characters`: the latter
# already contains `eos` once this cell has run, so checking it made a second
# run of the cell report "not suitable" for a token that is in fact fine.
if eos in set(full_text):
    print("not suitable")
else:
    print("suitable")
    # Guarded append keeps the cell idempotent across re-runs.
    if eos not in unique_characters:
        unique_characters.append(eos)

suitable


In [5]:
# Vocabulary size: the number of entries in unique_characters (the distinct
# corpus characters, plus <eos> when the cell above appended it).
# The model-construction cell below recomputes this same length for both
# input_size and output_size.
input_size = len(unique_characters)
print(f"Vocab size: {input_size}")

Vocab size: 70


In [6]:
# Two-way lookup between vocabulary entries and integer ids; an entry's id is
# its position in unique_characters.
index_to_token = dict(enumerate(unique_characters))
token_to_index = {token: index for index, token in index_to_token.items()}

In [7]:
# Encode every article as a list of token ids, terminated by the id of the
# end-of-sequence token.
sequences = []
for article in corpus:
    sequence = [token_to_index[char] for char in article] + [token_to_index[eos]]
    sequences.append(sequence)

print(sequences[0])

[17, 16, 23, 21, 0, 46, 43, 50, 47, 52, 39, 12, 0, 54, 56, 53, 60, 47, 57, 47, 53, 52, 39, 50, 0, 42, 43, 57, 47, 45, 52, 39, 58, 47, 53, 52, 0, 17, 25, 18, 22, 0, 57, 41, 12, 0, 47, 57, 0, 39, 0, 57, 58, 53, 52, 63, 0, 43, 53, 57, 0, 39, 57, 58, 43, 56, 53, 47, 42, 0, 44, 56, 53, 51, 0, 58, 46, 43, 0, 53, 59, 58, 43, 56, 0, 56, 43, 45, 47, 53, 52, 57, 0, 53, 44, 0, 58, 46, 43, 0, 39, 57, 58, 43, 56, 53, 47, 42, 0, 40, 43, 50, 58, 12, 0, 39, 54, 54, 56, 53, 62, 47, 51, 39, 58, 43, 50, 63, 0, 19, 20, 0, 49, 47, 50, 53, 51, 43, 58, 43, 56, 57, 0, 8, 18, 17, 0, 51, 47, 50, 43, 57, 9, 0, 47, 52, 0, 42, 47, 39, 51, 43, 58, 43, 56, 14, 0, 47, 58, 0, 61, 39, 57, 0, 42, 47, 57, 41, 53, 60, 43, 56, 43, 42, 0, 53, 52, 0, 18, 25, 0, 57, 43, 54, 58, 43, 51, 40, 43, 56, 0, 17, 25, 18, 22, 12, 0, 40, 63, 0, 39, 57, 58, 56, 53, 52, 53, 51, 43, 56, 0, 45, 56, 47, 45, 53, 56, 63, 0, 52, 43, 59, 48, 51, 47, 52, 0, 39, 58, 0, 58, 46, 43, 0, 57, 47, 51, 43, 47, 64, 0, 53, 40, 57, 43, 56, 60, 39, 58, 53, 5

In [8]:
# Build the RNN. Input and output widths both equal the vocabulary size; the
# model is configured with one_hot=True and no embedding table
# (index_to_embedding=None).
hidden_size = 100
input_size = output_size = len(unique_characters)
model = RNN(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    token_to_index=token_to_index,
    index_to_token=index_to_token,
    index_to_embedding=None,
    one_hot=True,
    eos=eos,
)

In [None]:
# Train on the encoded articles for 2000 epochs. The `checkpoints` argument is
# presumably the list of epochs at which RNN.train snapshots the model, and the
# return value presumably holds those snapshots -- TODO confirm against src/RNN.
checkpoint_epochs = [500, 1000, 2000]
checkpoints = model.train(
    sequences=sequences,
    epochs=2000,
    checkpoints=checkpoint_epochs,
)

Epoch:   0%|          | 0/2000 [00:00<?, ?it/s]

In [None]:
# Restore each saved checkpoint in turn and print what the model generates from
# the first 100 token ids of the first article.

# NOTE(review): text_example is never passed to the model; it is kept (hoisted
# out of the loop, since it does not change) in case a later cell reads it.
text_example = "The U.S is the root of all evil in the"

# Iterate over however many checkpoints training returned instead of a
# hard-coded 3, so this cell stays in sync with the training cell.
for i in range(len(checkpoints)):
    model.load_checkpoint(checkpoint=checkpoints[i])
    # A fresh slice is taken on every iteration so each prediction starts from
    # its own copy of the prompt, whatever predict() does with its argument.
    output = model.predict(input_sequence=sequences[0][:100])
    # `i` is the checkpoint's position in `checkpoints`, not an iteration
    # count, so the label says "checkpoint" rather than "{i} iterations".
    print(f"Experiment with checkpoint {i}:\n{output}")

# Word tokens