In [None]:
import numpy as np
import matplotlib.pyplot as plt
import math
from IPython import display

try:
  import torch


except ModuleNotFoundError:
  %pip install -qq torch
  import torch
from torch import nn
from torch.nn import functional as F
from torch.utils import data

import collections
import re
import random
import requests
import hashlib
import time




import os

# Seed every random number generator this notebook draws from so that
# Restart Kernel -> Run All gives the same results each time. Python's own
# `random` module is included because the random-sampling data iterator uses
# `random.randint` and `random.shuffle`.
random.seed(1)
np.random.seed(seed=1)
torch.manual_seed(seed=1)

# Create the output directory for figures. `exist_ok=True` keeps this cell
# idempotent (no error when the directory is already there) and, unlike a
# shell `mkdir`, behaves the same on every platform.
os.makedirs("figures", exist_ok=True)

**Data**

In [None]:
class SeqDataLoader:
  """An iterator to load sequence data.

  Loads the corpus and vocabulary once via `load_corpus_time_machine` and,
  every time the loader is iterated, delegates to one of the two module-level
  sampling functions.

  Args:
    batch_size: forwarded to the sampling function on each iteration.
    num_steps: forwarded to the sampling function on each iteration.
    use_random_iter: if truthy use `seq_data_iter_random`, otherwise
      `seq_data_iter_sequential`.
    max_tokens: forwarded to `load_corpus_time_machine`.

  Attributes:
    corpus, vocab: the pair returned by `load_corpus_time_machine`.
    data_iter_fn: the selected sampling function.
  """

  def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
    # Both samplers are plain module-level functions, not methods of this
    # class, so they must be looked up without `self.`.
    if use_random_iter:
      self.data_iter_fn = seq_data_iter_random
    else:
      self.data_iter_fn = seq_data_iter_sequential

    self.corpus, self.vocab = load_corpus_time_machine(max_tokens)
    self.batch_size, self.num_steps = batch_size, num_steps

  def __iter__(self):
    """Start a fresh pass over the corpus with the selected sampler."""
    return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)


class Vocab:
  """Vocabulary for text: a two-way mapping between tokens and integer indices.

  Index 0 is always "<unk>", followed by `reserved_tokens` in the order given,
  followed by the corpus tokens whose frequency is at least `min_freq`, ordered
  by descending frequency.

  Args:
    tokens: passed straight to `count_corpus`; defaults to an empty list.
    min_freq: tokens counted fewer times than this are left out of the
      vocabulary (lookups for them return the unknown index).
    reserved_tokens: extra tokens placed right after "<unk>"; defaults to none.

  Attributes:
    token_freqs: list of (token, count) pairs sorted by descending count.
    unk: index of the unknown token (0).
    idx_to_token: list mapping index -> token.
    token_to_idx: dict mapping token -> index.
  """

  def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
    if tokens is None:
      tokens = []
    if reserved_tokens is None:
      reserved_tokens = []
    counter = count_corpus(tokens)
    self.token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    self.unk = 0
    uniq_tokens = ["<unk>"] + list(reserved_tokens)
    # Set lookup instead of scanning the list for every corpus token. Only the
    # special tokens need checking because the counted tokens are already
    # unique among themselves.
    special = set(uniq_tokens)
    uniq_tokens += [
        token for token, freq in self.token_freqs
        if freq >= min_freq and token not in special
    ]

    self.idx_to_token, self.token_to_idx = [], dict()
    for token in uniq_tokens:
      self.idx_to_token.append(token)
      self.token_to_idx[token] = len(self.idx_to_token) - 1

  def __len__(self):
    """Number of entries in the vocabulary, special tokens included."""
    return len(self.idx_to_token)

  def __getitem__(self, tokens):
    """Map a token, or a (possibly nested) list/tuple of tokens, to indices.

    Tokens that are not in the vocabulary map to `self.unk`.
    """
    if not isinstance(tokens, (list, tuple)):
      return self.token_to_idx.get(tokens, self.unk)
    return [self.__getitem__(token) for token in tokens]

  def to_tokens(self, indices):
    """Map an index, or a list/tuple of indices, back to token(s)."""
    if not isinstance(indices, (list, tuple)):
      return self.idx_to_token[indices]
    return [self.idx_to_token[index] for index in indices]


In [None]:
def tokenize(lines, token='word'):
  """Break every text line into a list of tokens.

  Args:
    lines: iterable of strings.
    token: 'word' splits each line on whitespace; 'char' splits each line
      into its individual characters.

  Returns:
    One token list per input line. For any other `token` value an error
    message is printed and None is returned.
  """
  if token == 'word':
    word_lists = []
    for text in lines:
      word_lists.append(text.split())
    return word_lists

  if token == 'char':
    char_lists = []
    for text in lines:
      char_lists.append(list(text))
    return char_lists

  # Unrecognised granularity: report it; the caller receives None.
  print("ERROR: unknown token type: " + token)
  return None


def count_corpus(tokens):
  """Tally how many times each token occurs.

  Args:
    tokens: either a flat list of tokens or a list of token lists. The input
      is treated as nested when it is empty or its first element is a list.

  Returns:
    A `collections.Counter` keyed by token.
  """
  looks_nested = len(tokens) == 0 or isinstance(tokens[0], list)
  if not looks_nested:
    return collections.Counter(tokens)

  # Flatten one level: a list of token lists becomes a single token list.
  flattened = []
  for line in tokens:
    flattened.extend(line)
  return collections.Counter(flattened)


def seq_data_iter_random(corpus, batch_size, num_steps):
  """Generate mini-batches of subsequences using random sampling.

  Args:
    corpus: sliceable sequence of token indices (e.g. a list of ints).
    batch_size: number of subsequences per mini-batch.
    num_steps: length of every subsequence.

  Yields:
    `(X, Y)` pairs of tensors built from `batch_size` subsequences of length
    `num_steps`. Each row of `Y` is the corresponding row of `X` shifted one
    position later in the corpus (the next-token labels). Subsequences that
    do not fill a complete batch are dropped.
  """
  # Start with a random offset (inclusive of `num_steps - 1`) to partition a
  # sequence
  corpus = corpus[random.randint(0, num_steps - 1):]
  # Subtract 1 since we need to account for labels
  num_subseqs = (len(corpus) - 1) // num_steps
  # The starting indices for subsequences of length `num_steps`
  initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
  # Shuffle so that consecutive batch rows are not adjacent in the corpus
  random.shuffle(initial_indices)

  def subseq(pos):
    # Return a sequence of length `num_steps` starting from `pos`
    return corpus[pos: pos + num_steps]

  num_batches = num_subseqs // batch_size
  for i in range(0, batch_size * num_batches, batch_size):
    # Here, `initial_indices` contains randomized starting indices for
    # subsequences
    initial_indices_per_batch = initial_indices[i: i + batch_size]
    X = [subseq(j) for j in initial_indices_per_batch]
    Y = [subseq(j + 1) for j in initial_indices_per_batch]
    yield torch.tensor(X), torch.tensor(Y)











In [None]:
def seq_data_iter_random(corpus, batch_size, num_steps):
  """Generate mini-batches of subsequences using random sampling.

  NOTE(review): an earlier cell defines a function with this same name.
  Because this cell runs later, this is the definition callers get after
  Run All; keep only one of the two.

  Args:
    corpus: sliceable sequence of token indices (e.g. a list of ints).
    batch_size: number of subsequences per mini-batch.
    num_steps: length of every subsequence.

  Yields:
    `(X, Y)` pairs of tensors built from `batch_size` subsequences of length
    `num_steps`. Each row of `Y` is the corresponding row of `X` shifted one
    position later in the corpus. Subsequences that do not fill a complete
    batch are dropped.
  """
  # Random starting offset in [0, num_steps - 1] so that different passes
  # partition the corpus differently.
  corpus = corpus[random.randint(0, num_steps - 1):]
  # Subtract 1 because every input token needs a following label token.
  num_subseqs = (len(corpus) - 1) // num_steps
  initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
  random.shuffle(initial_indices)

  num_batches = num_subseqs // batch_size
  for i in range(0, batch_size * num_batches, batch_size):
    batch_starts = initial_indices[i: i + batch_size]
    X = [corpus[j: j + num_steps] for j in batch_starts]
    Y = [corpus[j + 1: j + 1 + num_steps] for j in batch_starts]
    yield torch.tensor(X), torch.tensor(Y)


def seq_data_iter_sequential(corpus, batch_size, num_steps):
  """Generate mini-batches of subsequences using sequential partitioning.

  The corpus (after a random starting offset) is cut into `batch_size`
  contiguous rows; successive mini-batches walk along those rows, so row `r`
  of one batch continues exactly where row `r` of the previous batch ended.

  Args:
    corpus: sliceable sequence of token indices (e.g. a list of ints).
    batch_size: number of rows per mini-batch.
    num_steps: number of tokens per row in each mini-batch.

  Yields:
    `(X, Y)` tensor pairs of shape `(batch_size, num_steps)`; `Y` is `X`
    shifted one position later in the corpus. Yields nothing when the corpus
    is too short for a single full batch.
  """
  # Random starting offset so that different passes see different partitions.
  offset = random.randint(0, num_steps)
  # Largest multiple of `batch_size` that still leaves one trailing token for
  # the labels; clamped at 0 for corpora shorter than the offset.
  num_tokens = max(0, (len(corpus) - offset - 1) // batch_size * batch_size)
  tokens_per_row = num_tokens // batch_size
  # Explicit column count (rather than -1) so an empty corpus reshapes cleanly.
  Xs = torch.tensor(corpus[offset: offset + num_tokens]).reshape(batch_size, tokens_per_row)
  Ys = torch.tensor(corpus[offset + 1: offset + 1 + num_tokens]).reshape(batch_size, tokens_per_row)

  num_batches = tokens_per_row // num_steps
  for i in range(0, num_steps * num_batches, num_steps):
    yield Xs[:, i: i + num_steps], Ys[:, i: i + num_steps]


def load_corpus_time_machine(max_tokens=None):
  """Placeholder for the corpus loader used by `SeqDataLoader`.

  `SeqDataLoader.__init__` calls this as `load_corpus_time_machine(max_tokens)`
  and unpacks the result into `(corpus, vocab)`, so the eventual implementation
  must accept that argument and return such a pair.

  Args:
    max_tokens: value forwarded by `SeqDataLoader`. It has a default only so
      that the previous zero-argument call form is still accepted.
      TODO: define its meaning together with the implementation.

  Raises:
    NotImplementedError: always, until the loader is written. Failing loudly
      here replaces the confusing TypeError the argument-less stub produced
      when called with `max_tokens`.
  """
  raise NotImplementedError(
      "load_corpus_time_machine is a placeholder: it must return (corpus, vocab)")



def count_corpus(tokens):
  """Count token frequencies.

  NOTE(review): an earlier cell defines a function with this same name.
  Because this cell runs later, this is the definition callers get after
  Run All; keep only one of the two.

  Args:
    tokens: either a flat list of tokens or a list of token lists. The input
      is treated as nested when it is empty or its first element is a list.

  Returns:
    A `collections.Counter` keyed by token.
  """
  if len(tokens) == 0 or isinstance(tokens[0], list):
    # Flatten a list of token lists into a single list of tokens.
    tokens = [token for line in tokens for token in line]
  return collections.Counter(tokens)