## Char-RNN

In this tutorial, we will build a char-rnn model for natural language generation. The training text is tokenized as a sequence of characters. After training, the model is able to output a probability distribution over the alphabet for the next character, thereby "predicting" it. By iterating this process, one can generate text snippets.

Char-RNN processes text sequences of arbitrary length, and the loss function makes use of ordinary Scala control-flow features during the training phase. Therefore, it is an instance of a dynamic neural network.

This implementation of Char-RNN is inspired by Andrej Karpathy's excellent blog post [The Unreasonable Effectiveness of Recurrent Neural Networks](https://karpathy.github.io/2015/05/21/rnn-effectiveness/) and his [Python/numpy implementation](https://gist.github.com/karpathy/d4dee566867f8291f086).

## Importing dependencies

In [1]:
import $ivy.`org.nd4j:nd4j-native-platform:0.8.0`
import $ivy.`com.thoughtworks.deeplearning::plugins-builtins:2.0.0`
import $ivy.`org.plotly-scala::plotly-jupyter-scala:0.3.2`

import scala.math
import collection.immutable.IndexedSeq
import scala.io.Source
import scala.concurrent.ExecutionContext.Implicits.global
import scalaz.concurrent.Task
import scalaz.std.iterable._
import scalaz.syntax.all._
import com.thoughtworks.future._
import scala.concurrent.Await
import scala.concurrent.duration.Duration
import org.nd4j.linalg.factory.Nd4j
import org.nd4j.linalg.api.ndarray.INDArray
import org.nd4j.linalg.ops.transforms.Transforms
import org.nd4j.linalg.api.ops.impl.indexaccum.IMax
import com.thoughtworks.deeplearning.plugins.DoubleLiterals
import com.thoughtworks.deeplearning.plugins.INDArrayLiterals
import com.thoughtworks.deeplearning.plugins.CumulativeDoubleLayers
import com.thoughtworks.deeplearning.plugins.DoubleTraining
import com.thoughtworks.deeplearning.plugins.CumulativeINDArrayLayers
import com.thoughtworks.deeplearning.plugins.INDArrayWeights
import com.thoughtworks.deeplearning.plugins.Operators
import com.thoughtworks.deeplearning.plugins.Logging
import com.thoughtworks.deeplearning.plugins.Builtins
import com.thoughtworks.feature.Factory
import plotly._
import plotly.element._
import plotly.layout._
import plotly.JupyterScala._

[32mimport [39m[36m$ivy.$                                    
[39m
[32mimport [39m[36m$ivy.$                                                      
[39m
[32mimport [39m[36m$ivy.$                                             

[39m
[32mimport [39m[36mscala.math
[39m
[32mimport [39m[36mcollection.immutable.IndexedSeq
[39m
[32mimport [39m[36mscala.io.Source
[39m
[32mimport [39m[36mscala.concurrent.ExecutionContext.Implicits.global
[39m
[32mimport [39m[36mscalaz.concurrent.Task
[39m
[32mimport [39m[36mscalaz.std.iterable._
[39m
[32mimport [39m[36mscalaz.syntax.all._
[39m
[32mimport [39m[36mcom.thoughtworks.future._
[39m
[32mimport [39m[36mscala.concurrent.Await
[39m
[32mimport [39m[36mscala.concurrent.duration.Duration
[39m
[32mimport [39m[36morg.nd4j.linalg.factory.Nd4j
[39m
[32mimport [39m[36morg.nd4j.linalg.api.ndarray.INDArray
[39m
[32mimport [39m[36morg.nd4j.linalg.ops.transforms.Transforms
[39m
[32mimport [39m[36morg.nd

## Preparing the corpus, setting up plugins & parameters

In [2]:
// Training corpus and its length in characters.
val data = "DeepLearning.scala"
val dataSize = data.length

// Vocabulary: the distinct characters of the corpus, addressable by index,
// plus the inverse lookup from character to index.
val ixToChar = data.toSet.toArray
val charToIx = ixToChar.zipWithIndex.toMap
val vocabSize = ixToChar.length

/** Encodes `c` as a `vocabSize x 1` column of zeros with a 1 at the index `charToIx` assigns to `c`.
  *
  * Throws if `c` is not a key of `charToIx`.
  */
def oneOfK(c: Char): INDArray = {
  val column = Nd4j.zeros(vocabSize, 1)
  column.putScalar(charToIx(c), 1)
}

[36mdata[39m: [32mString[39m = [32m"DeepLearning.scala"[39m
[36mdataSize[39m: [32mInt[39m = [32m18[39m
[36mixToChar[39m: [32mArray[39m[[32mChar[39m] = [33mArray[39m([32m'e'[39m, [32m's'[39m, [32m'n'[39m, [32m'.'[39m, [32m'a'[39m, [32m'i'[39m, [32m'L'[39m, [32m'g'[39m, [32m'l'[39m, [32m'p'[39m, [32m'c'[39m, [32m'r'[39m, [32m'D'[39m)
[36mcharToIx[39m: [32mMap[39m[[32mChar[39m, [32mInt[39m] = [33mMap[39m(
  [32m'e'[39m -> [32m0[39m,
  [32m's'[39m -> [32m1[39m,
  [32m'n'[39m -> [32m2[39m,
  [32m'.'[39m -> [32m3[39m,
  [32m'a'[39m -> [32m4[39m,
  [32m'i'[39m -> [32m5[39m,
  [32m'L'[39m -> [32m6[39m,
  [32m'g'[39m -> [32m7[39m,
  [32m'l'[39m -> [32m8[39m,
  [32m'p'[39m -> [32m9[39m,
  [32m'c'[39m -> [32m10[39m,
[33m...[39m
[36mvocabSize[39m: [32mInt[39m = [32m13[39m
defined [32mfunction[39m [36moneOfK[39m

In [3]:
/** Plugin that multiplies every INDArray weight delta by a fixed learning rate. */
trait LearningRate extends INDArrayWeights {

  /** Factor applied to the delta produced by the parent optimizer. */
  val learningRate: Double

  trait INDArrayOptimizerApi extends super.INDArrayOptimizerApi { this: INDArrayOptimizer =>
    override def delta: INDArray = {
      val unscaled = super.delta
      unscaled.mul(learningRate)
    }
  }

  override type INDArrayOptimizer <: INDArrayOptimizerApi with Optimizer
}

/** Plugin that rescales each INDArray weight delta by the square root of the
  * running sum of squared deltas kept on the weight (Adagrad-style update).
  */
trait Adagrad extends INDArrayWeights {

  /** Constant added to `sqrt(cache)` in the denominator of the rescaled delta. */
  val eps: Double

  trait INDArrayWeightApi extends super.INDArrayWeightApi { this: INDArrayWeight =>
    /** Running sum of squared deltas for this weight; `None` until the first update. */
    var cache: Option[INDArray] = None
  }

  override type INDArrayWeight <: INDArrayWeightApi with Weight

  trait INDArrayOptimizerApi extends super.INDArrayOptimizerApi { this: INDArrayOptimizer =>
    // Memoised so that reading `delta` several times on the same optimizer
    // instance updates the weight's cache only once.
    private lazy val deltaLazy: INDArray = {
      import org.nd4s.Implicits._
      import weight._
      val delta0 = super.delta
      // Bind the new accumulator once and reuse it, instead of writing it to
      // `cache` and immediately reading it back with `Option.get`.
      val accumulated = cache.getOrElse(Nd4j.zeros(delta0.shape: _*)) + delta0 * delta0
      cache = Some(accumulated)
      delta0 / (Transforms.sqrt(accumulated) + eps)
    }
    override def delta = deltaLazy
  }

  override type INDArrayOptimizer <: INDArrayOptimizerApi with Optimizer
}

defined [32mtrait[39m [36mLearningRate[39m
defined [32mtrait[39m [36mAdagrad[39m

In [5]:
// Creates `hyperparameters`: a Factory-built instance mixing the Adagrad and
// LearningRate plugins into Builtins, with learningRate = 0.05 and eps = 1e-8.
// NOTE(review): the definition is passed to `interp.load` as a source string
// rather than written inline — presumably a notebook/macro workaround; confirm.
interp.load("""
  val hyperparameters = Factory[Adagrad with LearningRate with Builtins].newInstance(learningRate = 0.05, eps=1e-8)
""")

In [6]:
import hyperparameters.INDArrayWeight
import hyperparameters.DoubleLayer
import hyperparameters.INDArrayLayer
import hyperparameters.implicits._

[32mimport [39m[36mhyperparameters.INDArrayWeight
[39m
[32mimport [39m[36mhyperparameters.DoubleLayer
[39m
[32mimport [39m[36mhyperparameters.INDArrayLayer
[39m
[32mimport [39m[36mhyperparameters.implicits._[39m

In [7]:
// Width of the recurrent hidden state and number of characters per training batch.
val hiddenSize = 100
val seqLength = 25

// Weight matrices start as standard-normal samples scaled by 0.01.
// input -> hidden
val wxh = INDArrayWeight(Nd4j.randn(hiddenSize, vocabSize).mul(0.01))

// hidden -> hidden
val whh = INDArrayWeight(Nd4j.randn(hiddenSize, hiddenSize).mul(0.01))

// hidden -> output
val why = INDArrayWeight(Nd4j.randn(vocabSize, hiddenSize).mul(0.01))

// Biases start at zero: `bh` for the hidden state, `by` for the output.
val bh = INDArrayWeight(Nd4j.zeros(hiddenSize, 1))
val by = INDArrayWeight(Nd4j.zeros(vocabSize, 1))

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.


[36mhiddenSize[39m: [32mInt[39m = [32m100[39m
[36mseqLength[39m: [32mInt[39m = [32m25[39m
[36mwxh[39m: [32mObject[39m with [32mhyperparameters[39m.[32mINDArrayWeightApi[39m with [32mhyperparameters[39m.[32mWeightApi[39m with [32mhyperparameters[39m.[32mWeightApi[39m with [32mhyperparameters[39m.[32mINDArrayWeightApi[39m = Weight[fullName=$sess.cmd6Wrapper.Helper.wxh]
[36mwhh[39m: [32mObject[39m with [32mhyperparameters[39m.[32mINDArrayWeightApi[39m with [32mhyperparameters[39m.[32mWeightApi[39m with [32mhyperparameters[39m.[32mWeightApi[39m with [32mhyperparameters[39m.[32mINDArrayWeightApi[39m = Weight[fullName=$sess.cmd6Wrapper.Helper.whh]
[36mwhy[39m: [32mObject[39m with [32mhyperparameters[39m.[32mINDArrayWeightApi[39m with [32mhyperparameters[39m.[32mWeightApi[39m with [32mhyperparameters[39m.[32mWeightApi[39m with [32mhyperparameters[39m.[32mINDArrayWeightApi[39m = Weight[fullName=$sess.cmd6Wrapper.Helper.why

## Implementing the neural network

In [8]:
/** Hyperbolic tangent of a layer, computed as `(e^x - e^-x) / (e^x + e^-x)`
  * from `hyperparameters.exp` and layer arithmetic.
  */
def tanh(x: INDArrayLayer): INDArrayLayer = {
  val positiveExp = hyperparameters.exp(x)
  val negativeExp = hyperparameters.exp(-x)
  val numerator = positiveExp - negativeExp
  val denominator = positiveExp + negativeExp
  numerator / denominator
}

defined [32mfunction[39m [36mtanh[39m

In [9]:
/** One step of the recurrent network.
  *
  * @param x     input column (callers in this notebook pass `oneOfK(char)`)
  * @param y     target column, multiplied element-wise with the predicted probabilities
  * @param hprev hidden state from the previous step
  * @return `(loss, prob, hnext)`: the negative log of the summed `prob * y`,
  *         the normalised exponentials of the output, and the new hidden state
  */
def charRNN(x: INDArray, y: INDArray, hprev: INDArrayLayer): (DoubleLayer, INDArrayLayer, INDArrayLayer) = {
  // New hidden state from the current input and the previous hidden state.
  val hnext = tanh(wxh.dot(x) + whh.dot(hprev) + bh)

  // Unnormalised output scores, then exponentials normalised by their sum.
  val scores = why.dot(hnext) + by
  val expScores = hyperparameters.exp(scores)
  val prob = expScores / expScores.sum

  val targetProb = (prob * y).sum
  val loss = -hyperparameters.log(targetProb)
  (loss, prob, hnext)
}

defined [32mfunction[39m [36mcharRNN[39m

In [11]:
// Pair every character with its successor, then cut the pairs into chunks of
// at most `seqLength` elements.
val batches = {
  val nextCharPairs = data.zip(data.tail)
  nextCharPairs.grouped(seqLength).toVector
}

// A value carried together with a hidden-state layer.
type WithHiddenLayer[A] = (A, INDArrayLayer)
// One chunk of (current char, next char) pairs.
type Batch = IndexedSeq[(Char, Char)]
// History of loss values.
type Losses = Vector[Double]

/** Runs `charRNN` over every (input, target) pair of a batch, threading the
  * hidden state through the steps and summing the per-step losses.
  *
  * @param batch the character pairs together with the hidden state to start from
  * @return the summed loss layer and the hidden state after the last pair
  */
def singleBatch(batch: WithHiddenLayer[Batch]): WithHiddenLayer[DoubleLayer] = {
  val (pairs, startHidden) = batch
  val zero: WithHiddenLayer[DoubleLayer] = (DoubleLayer(0.0.forward), startHidden)
  pairs.foldLeft(zero) { (acc: WithHiddenLayer[DoubleLayer], pair: (Char, Char)) =>
    val (total, hidden) = acc
    val (input, target) = pair
    val (stepLoss, _, nextHidden) = charRNN(oneOfK(input), oneOfK(target), hidden)
    (total + stepLoss, nextHidden)
  }
}

/** A fresh all-zero `hiddenSize x 1` hidden state wrapped as a layer. */
def initH = {
  val zeroState = Nd4j.zeros(hiddenSize, 1)
  INDArrayLayer(zeroState.forward)
}

/** Trains once on every batch, in order, starting from a zero hidden state.
  *
  * After each batch the smoothed loss `previous * 0.999 + batchLoss * 0.001`
  * is appended to the history, and the batch's final hidden state is handed
  * to the next batch.
  *
  * @param initprevloss loss history so far; must be non-empty because its last
  *                     element seeds the smoothing
  * @return the history extended by one entry per batch
  */
def singleRound(initprevloss: Losses): Future[Losses] =
  (batches.foldLeftM((initprevloss, initH)) {
    (bstate: WithHiddenLayer[Losses], batch: Batch) =>
      val (prevloss, hprev) = bstate
      // Pass the tuple explicitly instead of relying on argument auto-tupling.
      val (bloss, hnext) = singleBatch((batch, hprev))
      bloss.train.map { (blossval: Double) =>
        val smoothed = prevloss.last * 0.999 + blossval * 0.001
        (prevloss :+ smoothed, hnext)
      }
  }).map { (fstate: WithHiddenLayer[Losses]) => fstate._1 }

/** Chains 2048 training rounds, feeding each round the loss history produced
  * by the previous one. The history starts with the single value
  * `-log(1 / vocabSize) * seqLength`.
  */
def allRounds: Future[Losses] = {
  val initialLoss = -math.log(1.0 / vocabSize) * seqLength
  (0 until 2048).foldLeftM(Vector(initialLoss)) { (history: Losses, round: Int) =>
    singleRound(history)
  }
}

[36mbatches[39m: [32mVector[39m[[32mIndexedSeq[39m[([32mChar[39m, [32mChar[39m)]] = [33mVector[39m(
  [33mVector[39m(
    ([32m'D'[39m, [32m'e'[39m),
    ([32m'e'[39m, [32m'e'[39m),
    ([32m'e'[39m, [32m'p'[39m),
    ([32m'p'[39m, [32m'L'[39m),
    ([32m'L'[39m, [32m'e'[39m),
    ([32m'e'[39m, [32m'a'[39m),
    ([32m'a'[39m, [32m'r'[39m),
    ([32m'r'[39m, [32m'n'[39m),
    ([32m'n'[39m, [32m'i'[39m),
    ([32m'i'[39m, [32m'n'[39m),
[33m...[39m
defined [32mtype[39m [36mWithHiddenLayer[39m
defined [32mtype[39m [36mBatch[39m
defined [32mtype[39m [36mLosses[39m
defined [32mfunction[39m [36msingleBatch[39m
defined [32mfunction[39m [36minitH[39m
defined [32mfunction[39m [36msingleRound[39m
defined [32mfunction[39m [36mallRounds[39m

## Training the model and using it to generate text

In [12]:
/** Blocks the calling thread, with no timeout, until `f` completes and returns its result. */
def unsafePerformFuture[A](f: Future[A]): A = {
  val scalaFuture = f.toScalaFuture
  Await.result(scalaFuture, Duration.Inf)
}

// Run every training round and block until the full loss history is available.
val losses = unsafePerformFuture(allRounds)

// Initialise the plotly Jupyter integration before plotting.
plotly.JupyterScala.init()

// Plot each smoothed loss value against its position in the history.
Scatter(losses.indices, losses).plot(title = "Smooth loss by time")

defined [32mfunction[39m [36munsafePerformFuture[39m
[36mlosses[39m: [32mLosses[39m = [33mVector[39m(
  [32m64.12373393653841[39m,
  [32m64.10321652752525[39m,
  [32m64.10335427716902[39m,
  [32m64.21191326651677[39m,
  [32m64.2612733831522[39m,
  [32m64.27077250715435[39m,
  [32m64.28899924603446[39m,
  [32m64.28159751948861[39m,
  [32m64.25882909768758[39m,
  [32m64.23143043005247[39m,
  [32m64.20029390599379[39m,
[33m...[39m
[36mres11_3[39m: [32mString[39m = [32m"plot-1602067463"[39m

In [13]:
/** Index of the largest entry of `v`, obtained by executing ND4J's `IMax` op. */
def genIdx(v: INDArray): Int = {
  val argMax = new IMax(v)
  Nd4j.getExecutioner().execAndReturn(argMax).getFinalResult()
}

/** Generates text greedily: starting from `seed`, repeatedly feeds the last
  * character to `charRNN` and appends the character whose predicted
  * probability is largest.
  *
  * @param seed first character of the result; must be a key of `charToIx`
  * @param n    number of characters to append
  * @return `seed` followed by `n` generated characters
  */
def generate(seed: Char, n: Int): Future[String] = ((0 until n).foldLeftM((seed.toString, initH)) {
  (st: (String, INDArrayLayer), i: Int) =>
    val (text, hprev) = st
    val x = oneOfK(text.last)
    // The target argument only feeds the loss, which is discarded here, so the
    // input is passed a second time as a placeholder.
    val (_, prob, hnext) = charRNN(x, x, hprev)
    // `map` replaces `flatMap` + `Future.now`; the tuple is built explicitly
    // instead of relying on argument auto-tupling.
    prob.predict.map { (probv: INDArray) =>
      val nextChar = ixToChar(genIdx(probv))
      (text + nextChar.toString, hnext)
    }
}).map { (st: (String, INDArrayLayer)) => st._1 }

defined [32mfunction[39m [36mgenIdx[39m
defined [32mfunction[39m [36mgenerate[39m

In [14]:
// Generate 128 characters after the seed 'D' and block until the text is ready.
unsafePerformFuture(generate('D', 128))

[36mres13[39m: [32mString[39m = [32m"DeepLearning.scalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalalal"[39m