diff --git a/monai/networks/nets/autoencoder.py b/monai/networks/nets/autoencoder.py index b7dc309b71..f4a0451dc7 100644 --- a/monai/networks/nets/autoencoder.py +++ b/monai/networks/nets/autoencoder.py @@ -23,7 +23,69 @@ class AutoEncoder(nn.Module): """ - Base class for the architecture implementing :py:class:`monai.networks.nets.VarAutoEncoder`. + Simple definition of an autoencoder and base class for the architecture implementing + :py:class:`monai.networks.nets.VarAutoEncoder`. The network is composed of an encode sequence of blocks, followed + by an intermediary sequence of blocks, and finally a decode sequence of blocks. The encode and decode blocks are + default :py:class:`monai.networks.blocks.Convolution` instances with the encode blocks having the given stride + and the decode blocks having transpose convolutions with the same stride. If `num_res_units` is given residual + blocks are used instead. + + By default the intermediary sequence is empty but if `inter_channels` is given to specify the output channels of + blocks then this will become a sequence of Convolution blocks or of residual blocks if `num_inter_units` is + given. The optional parameter `inter_dilations` can be used to specify the dilation values of the convolutions in + these blocks, this allows a network to use dilated kernels in this middle section. Since the intermediary section + isn't meant to change the size of the output the strides for all these kernels are 1. + + Args: + spatial_dims: number of spatial dimensions. + in_channels: number of input channels. + out_channels: number of output channels. + channels: sequence of channels. Top block first. The length of `channels` should be no less than 2. + strides: sequence of convolution strides. The length of `stride` should equal to `len(channels) - 1`. + kernel_size: convolution kernel size, the value(s) should be odd. If sequence, + its length should equal to dimensions. Defaults to 3. 
+ up_kernel_size: upsampling convolution kernel size, the value(s) should be odd. If sequence, + its length should equal to dimensions. Defaults to 3. + num_res_units: number of residual units. Defaults to 0. + inter_channels: sequence of channels defining the blocks in the intermediate layer between encode and decode. + inter_dilations: defines the dilation value for each block of the intermediate layer. Defaults to 1. + num_inter_units: number of residual units for each block of the intermediate layer. Defaults to 0. + act: activation type and arguments. Defaults to PReLU. + norm: feature normalization type and arguments. Defaults to instance norm. + dropout: dropout ratio. Defaults to no dropout. + bias: whether to have a bias term in convolution blocks. Defaults to True. + According to `Performance Tuning Guide `_, + if a conv layer is directly followed by a batch norm layer, bias should be False. + + .. deprecated:: 0.6.0 + ``dimensions`` is deprecated, use ``spatial_dims`` instead. + + Examples:: + + from monai.networks.nets import AutoEncoder + + # 3 layers each down/up sampling their inputs by a factor 2 with no intermediate layer + net = AutoEncoder( + spatial_dims=2, + in_channels=1, + out_channels=1, + channels=(2, 4, 8), + strides=(2, 2, 2) + ) + + # 1 layer downsampling by 2, followed by a sequence of residual units with 2 convolutions defined by + # progressively increasing dilations, then final upsample layer + net = AutoEncoder( + spatial_dims=2, + in_channels=1, + out_channels=1, + channels=(4,), + strides=(2,), + inter_channels=(8, 8, 8), + inter_dilations=(1, 2, 4), + num_inter_units=2 + ) + """ @deprecated_arg( @@ -48,13 +110,6 @@ def __init__( bias: bool = True, dimensions: Optional[int] = None, ) -> None: - """ - Initialize the AutoEncoder. - - .. deprecated:: 0.6.0 - ``dimensions`` is deprecated, use ``spatial_dims`` instead. 
- - """ super().__init__() self.dimensions = spatial_dims if dimensions is None else dimensions @@ -87,6 +142,9 @@ def __init__( def _get_encode_module( self, in_channels: int, channels: Sequence[int], strides: Sequence[int] ) -> Tuple[nn.Sequential, int]: + """ + Returns the encode part of the network by building up a sequence of layers returned by `_get_encode_layer`. + """ encode = nn.Sequential() layer_channels = in_channels @@ -98,6 +156,10 @@ def _get_encode_module( return encode, layer_channels def _get_intermediate_module(self, in_channels: int, num_inter_units: int) -> Tuple[nn.Module, int]: + """ + Returns the intermediate block of the network which accepts input from the encoder and whose output goes + to the decoder. + """ # Define some types intermediate: nn.Module unit: nn.Module @@ -145,6 +207,9 @@ def _get_intermediate_module(self, in_channels: int, num_inter_units: int) -> Tu def _get_decode_module( self, in_channels: int, channels: Sequence[int], strides: Sequence[int] ) -> Tuple[nn.Sequential, int]: + """ + Returns the decode part of the network by building up a sequence of layers returned by `_get_decode_layer`. + """ decode = nn.Sequential() layer_channels = in_channels @@ -156,7 +221,9 @@ def _get_decode_module( return decode, layer_channels def _get_encode_layer(self, in_channels: int, out_channels: int, strides: int, is_last: bool) -> nn.Module: - + """ + Returns a single layer of the encoder part of the network. + """ mod: nn.Module if self.num_res_units > 0: mod = ResidualUnit( @@ -187,7 +254,9 @@ def _get_encode_layer(self, in_channels: int, out_channels: int, strides: int, i return mod def _get_decode_layer(self, in_channels: int, out_channels: int, strides: int, is_last: bool) -> nn.Sequential: - + """ + Returns a single layer of the decoder part of the network. 
+ """ decode = nn.Sequential() conv = Convolution( diff --git a/monai/networks/nets/classifier.py b/monai/networks/nets/classifier.py index 92fee4f566..a1f913ea23 100644 --- a/monai/networks/nets/classifier.py +++ b/monai/networks/nets/classifier.py @@ -25,6 +25,19 @@ class Classifier(Regressor): Defines a classification network from Regressor by specifying the output shape as a single dimensional tensor with size equal to the number of classes to predict. The final activation function can also be specified, eg. softmax or sigmoid. + + Args: + in_shape: tuple of integers stating the dimension of the input tensor (minus batch dimension) + classes: integer stating the dimension of the final output tensor + channels: tuple of integers stating the output channels of each convolutional layer + strides: tuple of integers stating the stride (downscale factor) of each convolutional layer + kernel_size: integer or tuple of integers stating size of convolutional kernels + num_res_units: integer stating number of convolutions in residual units, 0 means no residual units + act: name or type defining activation layers + norm: name or type defining normalization layers + dropout: optional float value in range [0, 1] stating dropout probability for layers, None for no dropout + bias: boolean stating if convolution layers should have a bias component + last_act: name defining the last activation layer """ def __init__( @@ -41,20 +54,6 @@ def __init__( bias: bool = True, last_act: Optional[str] = None, ) -> None: - """ - Args: - in_shape: tuple of integers stating the dimension of the input tensor (minus batch dimension) - classes: integer stating the dimension of the final output tensor - channels: tuple of integers stating the output channels of each convolutional layer - strides: tuple of integers stating the stride (downscale factor) of each convolutional layer - kernel_size: integer or tuple of integers stating size of convolutional kernels - num_res_units: integer stating 
number of convolutions in residual units, 0 means no residual units - act: name or type defining activation layers - norm: name or type defining normalization layers - dropout: optional float value in range [0, 1] stating dropout probability for layers, None for no dropout - bias: boolean stating if convolution layers should have a bias component - last_act: name defining the last activation layer - """ super().__init__(in_shape, (classes,), channels, strides, kernel_size, num_res_units, act, norm, dropout, bias) if last_act is not None: @@ -68,6 +67,18 @@ class Discriminator(Classifier): """ Defines a discriminator network from Classifier with a single output value and sigmoid activation by default. This is meant for use with GANs or other applications requiring a generic discriminator network. + + Args: + in_shape: tuple of integers stating the dimension of the input tensor (minus batch dimension) + channels: tuple of integers stating the output channels of each convolutional layer + strides: tuple of integers stating the stride (downscale factor) of each convolutional layer + kernel_size: integer or tuple of integers stating size of convolutional kernels + num_res_units: integer stating number of convolutions in residual units, 0 means no residual units + act: name or type defining activation layers + norm: name or type defining normalization layers + dropout: optional float value in range [0, 1] stating dropout probability for layers, None for no dropout + bias: boolean stating if convolution layers should have a bias component + last_act: name defining the last activation layer """ def __init__( @@ -83,19 +94,6 @@ def __init__( bias: bool = True, last_act=Act.SIGMOID, ) -> None: - """ - Args: - in_shape: tuple of integers stating the dimension of the input tensor (minus batch dimension) - channels: tuple of integers stating the output channels of each convolutional layer - strides: tuple of integers stating the stride (downscale factor) of each convolutional 
layer - kernel_size: integer or tuple of integers stating size of convolutional kernels - num_res_units: integer stating number of convolutions in residual units, 0 means no residual units - act: name or type defining activation layers - norm: name or type defining normalization layers - dropout: optional float value in range [0, 1] stating dropout probability for layers, None for no dropout - bias: boolean stating if convolution layers should have a bias component - last_act: name defining the last activation layer - """ super().__init__(in_shape, 1, channels, strides, kernel_size, num_res_units, act, norm, dropout, bias, last_act) @@ -104,6 +102,17 @@ class Critic(Classifier): Defines a critic network from Classifier with a single output value and no final activation. The final layer is `nn.Flatten` instead of `nn.Linear`, the final result is computed as the mean over the first dimension. This is meant to be used with Wasserstein GANs. + + Args: + in_shape: tuple of integers stating the dimension of the input tensor (minus batch dimension) + channels: tuple of integers stating the output channels of each convolutional layer + strides: tuple of integers stating the stride (downscale factor) of each convolutional layer + kernel_size: integer or tuple of integers stating size of convolutional kernels + num_res_units: integer stating number of convolutions in residual units, 0 means no residual units + act: name or type defining activation layers + norm: name or type defining normalization layers + dropout: optional float value in range [0, 1] stating dropout probability for layers, None for no dropout + bias: boolean stating if convolution layers should have a bias component """ def __init__( @@ -118,18 +127,6 @@ def __init__( dropout: Optional[float] = 0.25, bias: bool = True, ) -> None: - """ - Args: - in_shape: tuple of integers stating the dimension of the input tensor (minus batch dimension) - channels: tuple of integers stating the output channels of each 
convolutional layer - strides: tuple of integers stating the stride (downscale factor) of each convolutional layer - kernel_size: integer or tuple of integers stating size of convolutional kernels - num_res_units: integer stating number of convolutions in residual units, 0 means no residual units - act: name or type defining activation layers - norm: name or type defining normalization layers - dropout: optional float value in range [0, 1] stating dropout probability for layers, None for no dropout - bias: boolean stating if convolution layers should have a bias component - """ super().__init__(in_shape, 1, channels, strides, kernel_size, num_res_units, act, norm, dropout, bias, None) def _get_final_layer(self, in_shape: Sequence[int]): diff --git a/monai/networks/nets/fullyconnectednet.py b/monai/networks/nets/fullyconnectednet.py index b906bab015..19197bd58d 100644 --- a/monai/networks/nets/fullyconnectednet.py +++ b/monai/networks/nets/fullyconnectednet.py @@ -30,9 +30,24 @@ def _get_adn_layer( class FullyConnectedNet(nn.Sequential): """ - Plain full-connected layer neural network + Simple full-connected layer neural network composed of a sequence of linear layers with PReLU activation and + dropout. The network accepts input with `in_channels` channels, has output with `out_channels` channels, and + hidden layer output channels given in `hidden_channels`. If `bias` is True then linear units have a bias term. + + Args: + in_channels: number of input channels. + out_channels: number of output channels. + hidden_channels: number of output channels for each hidden layer. + dropout: dropout ratio. Defaults to no dropout. + act: activation type and arguments. Defaults to PReLU. + bias: whether to have a bias term in linear units. Defaults to True. + adn_ordering: order of operations in :py:class:`monai.networks.blocks.ADN`. 
+ + Examples:: + + # accepts 4 values and infers 3 values as output, has 3 hidden layers with 10, 20, 10 values as output + net = FullyConnectedNet(4, 3, [10, 20, 10], dropout=0.2) - The network uses dropout and, by default, PReLU activation """ def __init__( @@ -53,8 +68,11 @@ def __init__( self.in_channels = in_channels self.out_channels = out_channels self.hidden_channels = list(hidden_channels) + self.act = act + self.dropout = dropout + self.adn_ordering = adn_ordering + self.add_module("flatten", nn.Flatten()) - self.adn_layer = _get_adn_layer(act, dropout, adn_ordering) prev_channels = self.in_channels for i, c in enumerate(hidden_channels): @@ -64,13 +82,34 @@ def __init__( self.add_module("output", nn.Linear(prev_channels, out_channels, bias)) def _get_layer(self, in_channels: int, out_channels: int, bias: bool) -> nn.Sequential: - seq = nn.Sequential(nn.Linear(in_channels, out_channels, bias)) - seq.add_module("ADN", self.adn_layer) + seq = nn.Sequential( + nn.Linear(in_channels, out_channels, bias), _get_adn_layer(self.act, self.dropout, self.adn_ordering) + ) return seq class VarFullyConnectedNet(nn.Module): - """Variational fully-connected network.""" + """ + Variational fully-connected network. This is composed of an encode layer, reparameterization layer, and then a + decode layer. + + Args: + in_channels: number of input channels. + out_channels: number of output channels. + latent_size: number of latent variables to use. + encode_channels: number of output channels for each hidden layer of the encode half. + decode_channels: number of output channels for each hidden layer of the decode half. + dropout: dropout ratio. Defaults to no dropout. + act: activation type and arguments. Defaults to PReLU. + bias: whether to have a bias term in linear units. Defaults to True. + adn_ordering: order of operations in :py:class:`monai.networks.blocks.ADN`. 
+ + Examples:: + + # accepts inputs with 4 values, uses a latent space of 2 variables, and produces outputs of 3 values + net = VarFullyConnectedNet(4, 3, 2, [5, 10], [10, 5]) + + """ def __init__( self, diff --git a/monai/networks/nets/generator.py b/monai/networks/nets/generator.py index ea05787173..90aa26cd01 100644 --- a/monai/networks/nets/generator.py +++ b/monai/networks/nets/generator.py @@ -25,13 +25,35 @@ class Generator(nn.Module): """ Defines a simple generator network accepting a latent vector and through a sequence of convolution layers constructs an output tensor of greater size and high dimensionality. The method `_get_layer` is used to - create each of these layers, override this method to define layers beyond the default Convolution or - ResidualUnit layers. + create each of these layers, override this method to define layers beyond the default + :py:class:`monai.networks.blocks.Convolution` or :py:class:`monai.networks.blocks.ResidualUnit` layers. + + The layers are constructed using the values in the `channels` and `strides` arguments, the number being defined by + the length of these (which must match). Input is first passed through a :py:class:`torch.nn.Linear` layer to + convert the input vector to an image tensor with dimensions `start_shape`. This passes through the convolution + layers and is progressively upsampled if the `strides` values are greater than 1 using transpose convolutions. The + size of the final output is defined by the `start_shape` dimension and the amount of upsampling done through + strides. In the default definition the size of the output's spatial dimensions will be that of `start_shape` + multiplied by the product of `strides`, thus the example network below upsamples a starting size of (64, 8, 8) + to (1, 64, 64) since its `strides` are (2, 2, 2). 
+ + Args: + latent_shape: tuple of integers stating the dimension of the input latent vector (minus batch dimension) + start_shape: tuple of integers stating the dimension of the tensor to pass to convolution subnetwork + channels: tuple of integers stating the output channels of each convolutional layer + strides: tuple of integers stating the stride (upscale factor) of each convolutional layer + kernel_size: integer or tuple of integers stating size of convolutional kernels + num_res_units: integer stating number of convolutions in residual units, 0 means no residual units + act: name or type defining activation layers + norm: name or type defining normalization layers + dropout: optional float value in range [0, 1] stating dropout probability for layers, None for no dropout + bias: boolean stating if convolution layers should have a bias component + + Examples:: + + # 3 layers, latent input vector of shape (42, 24), output volume of shape (1, 64, 64) + net = Generator((42, 24), (64, 8, 8), (32, 16, 1), (2, 2, 2)) - For example, a generator accepting a latent vector if shape (42,24) and producing an output volume of - shape (1,64,64) can be constructed as: - - gen = Generator((42, 24), (64, 8, 8), (32, 16, 1), (2, 2, 2)) """ def __init__( @@ -47,26 +69,6 @@ def __init__( dropout: Optional[float] = None, bias: bool = True, ) -> None: - """ - Construct the generator network with the number of layers defined by `channels` and `strides`. In the - forward pass a `nn.Linear` layer relates the input latent vector to a tensor of dimensions `start_shape`, - this is then fed forward through the sequence of convolutional layers. The number of layers is defined by - the length of `channels` and `strides` which must match, each layer having the number of output channels - given in `channels` and an upsample factor given in `strides` (ie. a transpose convolution with that stride - size). 
- - Args: - latent_shape: tuple of integers stating the dimension of the input latent vector (minus batch dimension) - start_shape: tuple of integers stating the dimension of the tensor to pass to convolution subnetwork - channels: tuple of integers stating the output channels of each convolutional layer - strides: tuple of integers stating the stride (upscale factor) of each convolutional layer - kernel_size: integer or tuple of integers stating size of convolutional kernels - num_res_units: integer stating number of convolutions in residual units, 0 means no residual units - act: name or type defining activation layers - norm: name or type defining normalization layers - dropout: optional float value in range [0, 1] stating dropout probability for layers, None for no dropout - bias: boolean stating if convolution layers should have a bias component - """ super().__init__() self.in_channels, *self.start_shape = ensure_tuple(start_shape) diff --git a/monai/networks/nets/regressor.py b/monai/networks/nets/regressor.py index 0153014902..bc8feb7527 100644 --- a/monai/networks/nets/regressor.py +++ b/monai/networks/nets/regressor.py @@ -29,6 +29,30 @@ class Regressor(nn.Module): This defines a network for relating large-sized input tensors to small output tensors, ie. regressing large values to a prediction. An output of a single dimension can be used as value regression or multi-label classification prediction, an output of a single value can be used as a discriminator or critic prediction. + + The network is constructed as a sequence of layers, either :py:class:`monai.networks.blocks.Convolution` or + :py:class:`monai.networks.blocks.ResidualUnit`, with a final fully-connected layer resizing the output from the + blocks to the final size. Each block is defined with a stride value typically used to downsample the input using + strided convolutions. 
In this way each block progressively condenses information from the input into a deep + representation the final fully-connected layer relates to a final result. + + Args: + in_shape: tuple of integers stating the dimension of the input tensor (minus batch dimension) + out_shape: tuple of integers stating the dimension of the final output tensor (minus batch dimension) + channels: tuple of integers stating the output channels of each convolutional layer + strides: tuple of integers stating the stride (downscale factor) of each convolutional layer + kernel_size: integer or tuple of integers stating size of convolutional kernels + num_res_units: integer stating number of convolutions in residual units, 0 means no residual units + act: name or type defining activation layers + norm: name or type defining normalization layers + dropout: optional float value in range [0, 1] stating dropout probability for layers, None for no dropout + bias: boolean stating if convolution layers should have a bias component + + Examples:: + + # infers a 2-value result (eg. a 2D cartesian coordinate) from a 64x64 image + net = Regressor((1, 64, 64), (2,), (2, 4, 8), (2, 2, 2)) + """ def __init__( @@ -44,23 +68,6 @@ def __init__( dropout: Optional[float] = None, bias: bool = True, ) -> None: - """ - Construct the regressor network with the number of layers defined by `channels` and `strides`. Inputs are - first passed through the convolutional layers in the forward pass, the output from this is then pass - through a fully connected layer to relate them to the final output tensor. 
- - Args: - in_shape: tuple of integers stating the dimension of the input tensor (minus batch dimension) - out_shape: tuple of integers stating the dimension of the final output tensor - channels: tuple of integers stating the output channels of each convolutional layer - strides: tuple of integers stating the stride (downscale factor) of each convolutional layer - kernel_size: integer or tuple of integers stating size of convolutional kernels - num_res_units: integer stating number of convolutions in residual units, 0 means no residual units - act: name or type defining activation layers - norm: name or type defining normalization layers - dropout: optional float value in range [0, 1] stating dropout probability for layers, None for no dropout - bias: boolean stating if convolution layers should have a bias component - """ super().__init__() self.in_channels, *self.in_shape = ensure_tuple(in_shape) diff --git a/monai/networks/nets/unet.py b/monai/networks/nets/unet.py index 1dd52455d9..7d5f979330 100644 --- a/monai/networks/nets/unet.py +++ b/monai/networks/nets/unet.py @@ -26,6 +26,85 @@ @export("monai.networks.nets") @alias("Unet") class UNet(nn.Module): + """ + Enhanced version of UNet which has residual units implemented with the ResidualUnit class. + The residual part uses a convolution to change the input dimensions to match the output dimensions + if this is necessary but will use nn.Identity if not. + Refer to: https://link.springer.com/chapter/10.1007/978-3-030-12029-0_40. + + Each layer of the network has an encode and decode path with a skip connection between them. Data in the encode path + is downsampled using strided convolutions (if `strides` is given values greater than 1) and in the decode path + upsampled using strided transpose convolutions. These down or up sampling operations occur at the beginning of each + block rather than afterwards as is typical in UNet implementations. 
+ + To further explain this consider the first example network given below. This network has 3 layers with strides + of 2 for each of the middle layers (the last layer is the bottom connection which does not down/up sample). Input + data to this network is immediately reduced in the spatial dimensions by a factor of 2 by the first convolution of + the residual unit defining the first layer of the encode part. The last layer of the decode part will upsample its + input (data from the previous layer concatenated with data from the skip connection) in the first convolution. This + ensures the final output of the network has the same shape as the input. + + Padding values for the convolutions are chosen to ensure output sizes are even divisors/multiples of the input + sizes if the `strides` value for a layer is a factor of the input sizes. A typical case is to use `strides` values + of 2 and inputs that are multiples of powers of 2. An input can thus be downsampled evenly however many times its + dimensions can be divided by 2, so for the example network inputs would have to have dimensions that are multiples + of 4. In the second example network given below the input to the bottom layer will have shape (1, 64, 15, 15) for + an input of shape (1, 1, 240, 240) demonstrating the input being reduced in size spatially by 2**4. + + Args: + spatial_dims: number of spatial dimensions. + in_channels: number of input channels. + out_channels: number of output channels. + channels: sequence of channels. Top block first. The length of `channels` should be no less than 2. + strides: sequence of convolution strides. The length of `stride` should equal to `len(channels) - 1`. + kernel_size: convolution kernel size, the value(s) should be odd. If sequence, + its length should equal to dimensions. Defaults to 3. + up_kernel_size: upsampling convolution kernel size, the value(s) should be odd. If sequence, + its length should equal to dimensions. Defaults to 3. 
+ num_res_units: number of residual units. Defaults to 0. + act: activation type and arguments. Defaults to PReLU. + norm: feature normalization type and arguments. Defaults to instance norm. + dropout: dropout ratio. Defaults to no dropout. + bias: whether to have a bias term in convolution blocks. Defaults to True. + According to `Performance Tuning Guide `_, + if a conv layer is directly followed by a batch norm layer, bias should be False. + + Examples:: + + from monai.networks.nets import UNet + + # 3 layer network with down/upsampling by a factor of 2 at each layer with 2-convolution residual units + net = UNet( + spatial_dims=2, + in_channels=1, + out_channels=1, + channels=(4, 8, 16), + strides=(2, 2), + num_res_units=2 + ) + + # 5 layer network with simple convolution/normalization/dropout/activation blocks defining the layers + net=UNet( + spatial_dims=2, + in_channels=1, + out_channels=1, + channels=(4, 8, 16, 32, 64), + strides=(2, 2, 2, 2), + ) + + .. deprecated:: 0.6.0 + ``dimensions`` is deprecated, use ``spatial_dims`` instead. + + Note: The acceptable spatial size of input data depends on the parameters of the network, + to set appropriate spatial size, please check the tutorial for more details: + https://github.com/Project-MONAI/tutorials/blob/master/modules/UNet_input_size_constrains.ipynb. + Typically, when using a stride of 2 in down / up sampling, the output dimensions are either half of the + input when downsampling, or twice when upsampling. In this case with N numbers of layers in the network, + the inputs must have spatial dimensions that are all multiples of 2^N. + Usually, applying `resize`, `pad` or `crop` transforms can help adjust the spatial size of input data. + + """ + @deprecated_arg( name="dimensions", new_name="spatial_dims", since="0.6", msg_suffix="Please use `spatial_dims` instead." 
) @@ -45,42 +124,7 @@ def __init__( bias: bool = True, dimensions: Optional[int] = None, ) -> None: - """ - Enhanced version of UNet which has residual units implemented with the ResidualUnit class. - The residual part uses a convolution to change the input dimensions to match the output dimensions - if this is necessary but will use nn.Identity if not. - Refer to: https://link.springer.com/chapter/10.1007/978-3-030-12029-0_40. - Args: - spatial_dims: number of spatial dimensions. - in_channels: number of input channels. - out_channels: number of output channels. - channels: sequence of channels. Top block first. The length of `channels` should be no less than 2. - strides: sequence of convolution strides. The length of `stride` should equal to `len(channels) - 1`. - kernel_size: convolution kernel size, the value(s) should be odd. If sequence, - its length should equal to dimensions. Defaults to 3. - up_kernel_size: upsampling convolution kernel size, the value(s) should be odd. If sequence, - its length should equal to dimensions. Defaults to 3. - num_res_units: number of residual units. Defaults to 0. - act: activation type and arguments. Defaults to PReLU. - norm: feature normalization type and arguments. Defaults to instance norm. - dropout: dropout ratio. Defaults to no dropout. - bias: whether to have a bias term in convolution blocks. Defaults to True. - According to `Performance Tuning Guide `_, - if a conv layer is directly followed by a batch norm layer, bias should be False. - - .. deprecated:: 0.6.0 - ``dimensions`` is deprecated, use ``spatial_dims`` instead. - - Note: The acceptable spatial size of input data depends on the parameters of the network, - to set appropriate spatial size, please check the tutorial for more details: - https://github.com/Project-MONAI/tutorials/blob/master/modules/UNet_input_size_constrains.ipynb. 
- Typically, when using a stride of 2 in down / up sampling, the output dimensions are either half of the - input when downsampling, or twice when upsampling. In this case with N numbers of layers in the network, - the inputs must have spatial dimensions that are all multiples of 2^N. - Usually, applying `resize`, `pad` or `crop` transforms can help adjust the spatial size of input data. - - """ super().__init__() if len(channels) < 2: diff --git a/monai/networks/nets/varautoencoder.py b/monai/networks/nets/varautoencoder.py index a228efab07..b4ef8be93d 100644 --- a/monai/networks/nets/varautoencoder.py +++ b/monai/networks/nets/varautoencoder.py @@ -28,10 +28,36 @@ class VarAutoEncoder(AutoEncoder): """ Variational Autoencoder based on the paper - https://arxiv.org/abs/1312.6114 - .. code-block:: python + Args: + spatial_dims: number of spatial dimensions. + in_shape: shape of input data starting with channel dimension. + out_channels: number of output channels. + latent_size: size of the latent variable. + channels: sequence of channels. Top block first. The length of `channels` should be no less than 2. + strides: sequence of convolution strides. The length of `stride` should equal to `len(channels) - 1`. + kernel_size: convolution kernel size, the value(s) should be odd. If sequence, + its length should equal to dimensions. Defaults to 3. + up_kernel_size: upsampling convolution kernel size, the value(s) should be odd. If sequence, + its length should equal to dimensions. Defaults to 3. + num_res_units: number of residual units. Defaults to 0. + inter_channels: sequence of channels defining the blocks in the intermediate layer between encode and decode. + inter_dilations: defines the dilation value for each block of the intermediate layer. Defaults to 1. + num_inter_units: number of residual units for each block of the intermediate layer. Defaults to 0. + act: activation type and arguments. Defaults to PReLU. + norm: feature normalization type and arguments. 
Defaults to instance norm. + dropout: dropout ratio. Defaults to no dropout. + bias: whether to have a bias term in convolution blocks. Defaults to True. + According to `Performance Tuning Guide `_, + if a conv layer is directly followed by a batch norm layer, bias should be False. + + .. deprecated:: 0.6.0 + ``dimensions`` is deprecated, use ``spatial_dims`` instead. + + Examples:: from monai.networks.nets import VarAutoEncoder + # 3 layer network accepting images with dimensions (1, 32, 32) and using a latent vector with 2 values model = VarAutoEncoder( dimensions=2, in_shape=(32, 32), # image spatial shape @@ -44,10 +70,6 @@ class VarAutoEncoder(AutoEncoder): see also: - Variational autoencoder network with MedNIST Dataset https://github.com/Project-MONAI/tutorials/blob/master/modules/varautoencoder_mednist.ipynb - - .. deprecated:: 0.6.0 - ``dimensions`` is deprecated, use ``spatial_dims`` instead. - """ @deprecated_arg(