From 8994d4714b188f996a9a6dfe61b53d43269e07fc Mon Sep 17 00:00:00 2001
From: Todd Thomson <todd.thomson@me.com>
Date: Sat, 22 Jan 2022 20:06:18 -0800
Subject: [PATCH] Progress towards beta release #1

---
 Mila/Mila.nuspec                              |  1 +
 Mila/Mila.vcxproj                             | 14 ++--
 Mila/Mila.vcxproj.filters                     | 17 ++---
 Mila/{Mila.Core.ixx => Source/Core/Core.ixx}  |  2 +-
 .../Dnn/CuDNN/{MilaCuDnn.ixx => CuDnn.ixx}    |  2 +-
 Mila/Source/Dnn/Data/Data.ixx                 | 27 ++++++++
 ...oder.ixx => DataCategoryVectorEncoder.ixx} |  6 +-
 ...taset.ixx => DataCharDatasetGenerator.ixx} | 44 ++++++------
 ...{DataDatasetLoader.ixx => DataDataset.ixx} | 68 ++++++++++---------
 .../Dnn/Data/H5/DataH5DatasetWriter.ixx       | 16 +----
 Mila/Source/Dnn/Dnn.ixx                       |  8 +++
 Mila/Source/Dnn/MilaDnn.ixx                   |  3 -
 Mila/Source/Mila.ixx                          |  6 +-
 MilaTest/Dataset/Encoding.cpp                 | 21 ++++++
 MilaTest/Dataset/Generator.cpp                | 26 +++++++
 MilaTest/Dataset/Loader.cpp                   | 45 +++---------
 MilaTest/MilaTest.vcxproj                     |  2 +
 MilaTest/MilaTest.vcxproj.filters             |  2 +
 Samples/MilaPackage/MilaPackage.vcxproj       |  4 +-
 Samples/MilaPackage/packages.config           |  2 +-
 20 files changed, 186 insertions(+), 130 deletions(-)
 rename Mila/{Mila.Core.ixx => Source/Core/Core.ixx} (95%)
 rename Mila/Source/Dnn/CuDNN/{MilaCuDnn.ixx => CuDnn.ixx} (96%)
 create mode 100644 Mila/Source/Dnn/Data/Data.ixx
 rename Mila/Source/Dnn/Data/{DataCategoryToVectorEncoder.ixx => DataCategoryVectorEncoder.ixx} (94%)
 rename Mila/Source/Dnn/Data/{DataTextToDataset.ixx => DataCharDatasetGenerator.ixx} (84%)
 rename Mila/Source/Dnn/Data/{DataDatasetLoader.ixx => DataDataset.ixx} (74%)
 create mode 100644 Mila/Source/Dnn/Dnn.ixx
 delete mode 100644 Mila/Source/Dnn/MilaDnn.ixx
 create mode 100644 MilaTest/Dataset/Encoding.cpp
 create mode 100644 MilaTest/Dataset/Generator.cpp
diff --git a/Mila/Mila.nuspec b/Mila/Mila.nuspec
index d41d5d6..be1b302 100644
--- a/Mila/Mila.nuspec
+++ b/Mila/Mila.nuspec
@@ -19,6 +19,7 @@
 	<files>
 		<file src="build\x64\Debug\Mila.lib" target="build\native\x64\Debug" />
 		<file src="build\x64\Debug\Mila.pdb" target="build\native\x64\Debug" />
+		<!-- file src="build\x64\Debug\Mila.xml" target="build\native\x64\Debug" / -->
 		<file src="build\x64\Debug\*.ifc" target="build\native\x64\Debug" />
 		<file src="..\icon.png" target="images\" />
 		<!-- .targets -->
diff --git a/Mila/Mila.vcxproj b/Mila/Mila.vcxproj
index 3021f7c..00668c8 100644
--- a/Mila/Mila.vcxproj
+++ b/Mila/Mila.vcxproj
@@ -17,7 +17,7 @@
     <CudaCompile Include="Source\Dnn\CuDNN\init_data.cu" />
   </ItemGroup>
   <ItemGroup>
-    <ClCompile Include="Mila.Core.ixx" />
+    <ClCompile Include="Source\Core\Core.ixx" />
     <ClCompile Include="Source\Core\Core.Version.ixx" />
     <ClCompile Include="Source\Dnn\Cuda\Cuda.ixx" />
     <ClCompile Include="Source\Dnn\Cuda\CudaDevice.ixx" />
@@ -29,7 +29,7 @@
     <ClCompile Include="Source\Dnn\Cuda\CudaProfiler.ixx" />
     <ClCompile Include="Source\Dnn\Cuda\CudaStream.ixx" />
     <ClCompile Include="Source\Dnn\Cuda\CudaUniqueHandle.ixx" />
-    <ClCompile Include="Source\Dnn\CuDNN\MilaCuDnn.ixx" />
+    <ClCompile Include="Source\Dnn\CuDNN\CuDnn.ixx" />
     <ClCompile Include="Source\Dnn\CuDNN\CuDnnContext.ixx" />
     <ClCompile Include="Source\Dnn\CuDNN\CuDnnDescriptor.ixx" />
     <ClCompile Include="Source\Dnn\CuDNN\CuDnnError.ixx" />
@@ -40,12 +40,14 @@
     <ClCompile Include="Source\Dnn\CuDNN\CuDnnOpaqueHandle.ixx" />
     <ClCompile Include="Source\Dnn\CuDNN\CuDnnStatus.ixx" />
     <ClCompile Include="Source\Dnn\CuDNN\CuDnnUtils.ixx" />
+    <ClCompile Include="Source\Dnn\Data\Data.ixx" />
+    <ClCompile Include="Source\Dnn\Data\DataDataset.ixx" />
     <ClCompile Include="Source\Dnn\Data\H5\DataH5DataTypeMapper.ixx" />
     <ClCompile Include="Source\Dnn\DnnModel.ixx" />
     <ClCompile Include="Source\Dnn\DnnModelBuilder.ixx" />
     <ClCompile Include="Source\Dnn\DnnModelOptions.ixx" />
     <ClCompile Include="Source\Dnn\DnnDropoutDescriptor.ixx" />
-    <ClCompile Include="Source\Dnn\MilaDnn.ixx" />
+    <ClCompile Include="Source\Dnn\Dnn.ixx" />
     <ClCompile Include="Source\Dnn\DnnNeuralNetType.ixx" />
     <ClCompile Include="Source\Dnn\DnnRnnDataSetDescriptor.ixx" />
     <ClCompile Include="Source\Dnn\DnnRnnLayerCollection.ixx" />
@@ -55,13 +57,12 @@
     <ClCompile Include="Source\Dnn\DnnRnnOpDescriptor.ixx" />
     <ClCompile Include="Source\Dnn\DnnStateTensorDescriptor.ixx" />
     <ClCompile Include="Source\Dnn\DnnTensorDescriptor.ixx" />
-    <ClCompile Include="Source\Dnn\Data\DataDatasetLoader.ixx" />
     <ClCompile Include="Source\Dnn\Data\DataDatasetType.ixx" />
     <ClCompile Include="Source\Dnn\Data\FileStream.ixx" />
     <ClCompile Include="Source\Dnn\Data\H5\DataH5DatasetReader.ixx" />
     <ClCompile Include="Source\Dnn\Data\H5\DataH5DatasetWriter.ixx" />
-    <ClCompile Include="Source\Dnn\Data\DataCategoryToVectorEncoder.ixx" />
-    <ClCompile Include="Source\Dnn\Data\DataTextToDataset.ixx" />
+    <ClCompile Include="Source\Dnn\Data\DataCategoryVectorEncoder.ixx" />
+    <ClCompile Include="Source\Dnn\Data\DataCharDatasetGenerator.ixx" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="Source\Dnn\CuDNN\fp16_emu.h" />
@@ -129,6 +130,7 @@
       <PreprocessorDefinitions>_DEBUG;_LIB;%(PreprocessorDefinitions);WIN32;_WINDOWS</PreprocessorDefinitions>
       <ObjectFileName>$(IntDir)</ObjectFileName>
       <ModuleOutputFile>$(IntDir)%(FileName).ifc</ModuleOutputFile>
+      <GenerateXMLDocumentationFiles>false</GenerateXMLDocumentationFiles>
     </ClCompile>
     <ResourceCompile>
       <PreprocessorDefinitions>%(PreprocessorDefinitions);WIN32;_DEBUG;_WINDOWS;CMAKE_INTDIR=\"Debug\"</PreprocessorDefinitions>
diff --git a/Mila/Mila.vcxproj.filters b/Mila/Mila.vcxproj.filters
index 96c648c..23e9f38 100644
--- a/Mila/Mila.vcxproj.filters
+++ b/Mila/Mila.vcxproj.filters
@@ -29,7 +29,7 @@
     <ClCompile Include="Source\Dnn\CuDNN\fp16_emu.cpp">
       <Filter>Dnn\CuDnn</Filter>
     </ClCompile>
-    <ClCompile Include="Source\Dnn\CuDNN\MilaCuDnn.ixx">
+    <ClCompile Include="Source\Dnn\CuDNN\CuDnn.ixx">
       <Filter>Dnn\CuDnn</Filter>
     </ClCompile>
     <ClCompile Include="Source\Dnn\CuDNN\CuDnnContext.ixx">
@@ -101,7 +101,7 @@
     <ClCompile Include="Source\Dnn\DnnDropoutDescriptor.ixx">
       <Filter>Dnn</Filter>
     </ClCompile>
-    <ClCompile Include="Source\Dnn\MilaDnn.ixx">
+    <ClCompile Include="Source\Dnn\Dnn.ixx">
       <Filter>Dnn</Filter>
     </ClCompile>
     <ClCompile Include="Source\Dnn\DnnNeuralNetType.ixx">
@@ -137,31 +137,32 @@
     <ClCompile Include="Source\Dnn\Data\H5\DataH5DatasetWriter.ixx">
       <Filter>Dnn\Data\H5</Filter>
     </ClCompile>
-    <ClCompile Include="Source\Dnn\Data\DataDatasetLoader.ixx">
-      <Filter>Dnn\Data</Filter>
-    </ClCompile>
     <ClCompile Include="Source\Dnn\Data\DataDatasetType.ixx">
       <Filter>Dnn\Data</Filter>
     </ClCompile>
     <ClCompile Include="Source\Dnn\Data\FileStream.ixx">
       <Filter>Dnn\Data</Filter>
     </ClCompile>
-    <ClCompile Include="Source\Dnn\Data\DataCategoryToVectorEncoder.ixx">
+    <ClCompile Include="Source\Dnn\Data\DataCategoryVectorEncoder.ixx">
       <Filter>Dnn\Data</Filter>
     </ClCompile>
-    <ClCompile Include="Source\Dnn\Data\DataTextToDataset.ixx">
+    <ClCompile Include="Source\Dnn\Data\DataCharDatasetGenerator.ixx">
       <Filter>Dnn\Data</Filter>
     </ClCompile>
     <ClCompile Include="Source\Core\Core.Version.ixx">
       <Filter>Core</Filter>
     </ClCompile>
-    <ClCompile Include="Mila.Core.ixx">
+    <ClCompile Include="Source\Core\Core.ixx">
       <Filter>Core</Filter>
     </ClCompile>
     <ClCompile Include="Source\Mila.ixx" />
     <ClCompile Include="Source\Dnn\Data\H5\DataH5DataTypeMapper.ixx">
       <Filter>Dnn\Data\H5</Filter>
     </ClCompile>
+    <ClCompile Include="Source\Dnn\Data\DataDataset.ixx">
+      <Filter>Dnn\Data</Filter>
+    </ClCompile>
+    <ClCompile Include="Source\Dnn\Data\Data.ixx"><Filter>Dnn\Data</Filter></ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="Source\Dnn\CuDNN\fp16_emu.h">
diff --git a/Mila/Mila.Core.ixx b/Mila/Source/Core/Core.ixx
similarity index 95%
rename from Mila/Mila.Core.ixx
rename to Mila/Source/Core/Core.ixx
index 3141865..6240a6b 100644
--- a/Mila/Mila.Core.ixx
+++ b/Mila/Source/Core/Core.ixx
@@ -19,6 +19,6 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
-export module Mila.Core;
+export module Core;
 
 export import Core.Version;
\ No newline at end of file
diff --git a/Mila/Source/Dnn/CuDNN/MilaCuDnn.ixx b/Mila/Source/Dnn/CuDNN/CuDnn.ixx
similarity index 96%
rename from Mila/Source/Dnn/CuDNN/MilaCuDnn.ixx
rename to Mila/Source/Dnn/CuDNN/CuDnn.ixx
index 390fe5d..f041e41 100644
--- a/Mila/Source/Dnn/CuDNN/MilaCuDnn.ixx
+++ b/Mila/Source/Dnn/CuDNN/CuDnn.ixx
@@ -19,7 +19,7 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
-export module Mila.Cudnn;
+export module Cudnn;
 
 export import CuDnn.Context;
 export import CuDnn.Descriptor;
diff --git a/Mila/Source/Dnn/Data/Data.ixx b/Mila/Source/Dnn/Data/Data.ixx
new file mode 100644
index 0000000..0d66912
--- /dev/null
+++ b/Mila/Source/Dnn/Data/Data.ixx
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2021 Todd Thomson, Achilles Software.  All rights reserved.
+ *
+ * Please refer to the Mila end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+export module Data;
+
+export import Data.Dataset;
+export import Data.DatasetType;
+export import Data.CharDatasetGenerator;
+export import Data.CategoryVectorEncoder;
diff --git a/Mila/Source/Dnn/Data/DataCategoryToVectorEncoder.ixx b/Mila/Source/Dnn/Data/DataCategoryVectorEncoder.ixx
similarity index 94%
rename from Mila/Source/Dnn/Data/DataCategoryToVectorEncoder.ixx
rename to Mila/Source/Dnn/Data/DataCategoryVectorEncoder.ixx
index 1fcdf59..26edf21 100644
--- a/Mila/Source/Dnn/Data/DataCategoryToVectorEncoder.ixx
+++ b/Mila/Source/Dnn/Data/DataCategoryVectorEncoder.ixx
@@ -25,7 +25,7 @@ module;
 #include <iterator>
 #include <vector>
 
-export module Data.CategoryToVectorEncoder;
+export module Data.CategoryVectorEncoder;
 
 namespace Mila::Dnn::Data
 {
@@ -33,7 +33,7 @@ namespace Mila::Dnn::Data
     /// A category to vector (One-hot) encoder
     /// </summary>
     export template <typename TElement>
-    class CategoryToVectorEncoder
+    class CategoryVectorEncoder
     {
     public:
 
@@ -42,7 +42,7 @@ namespace Mila::Dnn::Data
         /// </summary>
         /// <param name="k">Size of vector</param>
         /// <param name="value">Numeric value to use for marking category</param>
-        CategoryToVectorEncoder( size_t k, TElement value )
+        CategoryVectorEncoder( size_t k, TElement value )
             : k_( k ), value_( value )
         {
             k_vector_ = std::vector<TElement>( k_, {} );
diff --git a/Mila/Source/Dnn/Data/DataTextToDataset.ixx b/Mila/Source/Dnn/Data/DataCharDatasetGenerator.ixx
similarity index 84%
rename from Mila/Source/Dnn/Data/DataTextToDataset.ixx
rename to Mila/Source/Dnn/Data/DataCharDatasetGenerator.ixx
index c036d61..8c1ed8d 100644
--- a/Mila/Source/Dnn/Data/DataTextToDataset.ixx
+++ b/Mila/Source/Dnn/Data/DataCharDatasetGenerator.ixx
@@ -49,8 +49,9 @@ module;
 #include <string>
 #include <map>
 
-export module Data.TextToDataset;
+export module Data.CharDatasetGenerator;
 
+import Data.DatasetType;
 import Data.H5DatasetWriter;
 
 namespace fs = std::filesystem;
@@ -61,18 +62,18 @@ namespace Mila::Dnn::Data
     /// <summary>
     /// A class to convert a text file to an H5 formatted dataset.
     /// </summary>
-    export class TextToDataset
+    export class CharDatasetGenerator
     {
     public:
 
         /// <summary>
         /// Creates a dataset from a text file with the specified test, validation
-        /// and training splits.
+        /// splits.
         /// </summary>
         /// <param name="file_path"></param>
         /// <param name="test_split"></param>
         /// <param name="validation_split"></param>
-        TextToDataset( const fs::path& file_path, float test_split = 0.1f, float validation_split = 0.1f )
+        CharDatasetGenerator( const fs::path& file_path, float test_split = 0.1f, float validation_split = 0.1f )
             : file_path_( file_path), test_split_( test_split ), validation_split_( validation_split )
         {
             if ( test_split + validation_split >= 1.0f )
@@ -87,9 +88,9 @@ namespace Mila::Dnn::Data
         }
             
         /// <summary>
-        /// 
+        /// Generates a dataset from the text file. 
         /// </summary>
-        void CreateDataset()
+        void GenerateDataset()
         {
             std::ifstream text_file( file_path_ );
 
@@ -99,8 +100,8 @@ namespace Mila::Dnn::Data
             }
 
             // First go through the file once to determine its size and to 
-            // build the vocabulary token map.
-            std::map<char, int> tokens = {};
+            // build the vocabulary map.
+            std::map<char, int> vocabulary = {};
             int total_chars = 0;
             int index = 1;
             std::string line;
@@ -111,19 +112,19 @@ namespace Mila::Dnn::Data
 
                 for ( char c : line )
                 {
-                    auto it = tokens.find( c );
-                    if ( it == tokens.end() )
+                    auto it = vocabulary.find( c );
+                    if ( it == vocabulary.end() )
                     {
-                        tokens.insert( {c, index++} );
+                        vocabulary.insert( {c, index++} );
                     }
                 }
             }
 
             std::cout << "File size: " << std::to_string( total_chars ) << std::endl;
-            std::cout << "Vocab size: " << std::to_string( tokens.size() ) << std::endl;
+            std::cout << "Vocab size: " << std::to_string( vocabulary.size() ) << std::endl;
 
-            vocabulary_size = tokens.size();
-            text_size = total_chars;
+            vocabulary_size_ = vocabulary.size();
+            text_size_ = total_chars;
 
             // Now we can figure out the split sizes
             int validation_size = validation_split_ * total_chars;
@@ -152,7 +153,7 @@ namespace Mila::Dnn::Data
 
                 for ( char& c : line )
                 {
-                    splits[ split_index ][ split_position++ ] = tokens[ c ];
+                    splits[ split_index ][ split_position++ ] = vocabulary[ c ];
 
                     if ( split_position >= splits[ split_index ].size() )
                     {
@@ -172,15 +173,14 @@ namespace Mila::Dnn::Data
             h5Writer.WriteDataset<char>( "validation_ds", splits[ VALIDATION_SET ] );
             h5Writer.WriteDataset<char>( "testing_ds", splits[ TESTING_SET ] );
             
-            // Convert the m
+            // Flatten the vocabulary map into a vector of alternating key, value entries
             std::vector<int> vocabulary_vector;
-
-            for ( const auto& [key, value ] : tokens ) {
+            for ( const auto& [key, value ] : vocabulary ) {
                 vocabulary_vector.push_back( key );
                 vocabulary_vector.push_back( value );
             }
 
-            h5Writer.WriteDataset<int>( "vocabulary_ds", vocabulary_vector );
+            h5Writer.WriteDataset<int>( to_string( DatasetType::vocabulary ), vocabulary_vector );
         }
 
     private:
@@ -190,8 +190,10 @@ namespace Mila::Dnn::Data
         float validation_split_ = 0.1f;
         float test_split_ = 0.1f;
 
-        size_t vocabulary_size = 0;
-        size_t text_size = 0;
+        std::map<int, int> vocabulary_ = {};
+
+        size_t vocabulary_size_ = 0;
+        size_t text_size_ = 0;
 
         const int TRAINING_SET = 0;
         const int VALIDATION_SET = 1;
diff --git a/Mila/Source/Dnn/Data/DataDatasetLoader.ixx b/Mila/Source/Dnn/Data/DataDataset.ixx
similarity index 74%
rename from Mila/Source/Dnn/Data/DataDatasetLoader.ixx
rename to Mila/Source/Dnn/Data/DataDataset.ixx
index 8d49c79..38b6668 100644
--- a/Mila/Source/Dnn/Data/DataDatasetLoader.ixx
+++ b/Mila/Source/Dnn/Data/DataDataset.ixx
@@ -22,13 +22,14 @@
 module;
 #include <string>
 #include <filesystem>
+#include <stdexcept>
 #include <map>
 #include <vector>
 
-export module Data.DatasetLoader;
+export module Data.Dataset;
 
-import Data.H5DatasetReader;
 import Data.DatasetType;
+import Data.H5DatasetReader;
 
 namespace fs = std::filesystem;
 
@@ -39,34 +40,41 @@ namespace Mila::Dnn::Data
     /// </summary>
     export using XYPair = std::pair<std::vector<char>, std::vector<char>>;
 
-    /// <summary>
-    /// Character dataset loader.
-    /// </summary>
-    export class DatasetLoader
+    export class Dataset
     {
     public:
 
-        /// <summary>
-        /// DatasetLoader constructor
-        /// </summary>
-        /// <param name="datasetType"></param>
-        /// <param name="datasetPath"></param>
-        /// <param name="batchSize"></param>
-        /// <param name="sequenceLength"></param>
-        DatasetLoader( const DatasetType datasetType, fs::path datasetPath, int batchSize, int sequenceLength )
+        Dataset( const fs::path& datasetPath, int batchSize, int sequenceLength )
+            : path_( datasetPath ), batch_size_( batchSize ), sequence_length_( sequenceLength )
         {
-            if ( !std::filesystem::exists( datasetPath ) )
+            if ( !fs::exists( path_ ) )
             {
                 throw std::invalid_argument( "Dataset file does not exist." );
             }
 
-            dataset_path_ = datasetPath;
-            batch_size_ = batchSize;
-            sequence_length_ = sequenceLength;
+            block_size_ = batch_size_ * sequence_length_;
 
-            block_size_ = batchSize * sequenceLength;
+            ReadVocabulary();
+        }
+
+        /// <summary>
+        /// Loads the specified dataset type.
+        /// </summary>
+        /// <param name="datasetType"></param>
+        void Load( const DatasetType datasetType )
+        {
+            H5::H5DatasetReader ds_reader = H5::H5DatasetReader( path_.string() );
 
-            ReadDataset( datasetType );
+            ds_reader.ReadDataset<char>( to_string( datasetType ), dataset_ );
+
+            // Reject a non-positive block size before it is used as a divisor below.
+            if ( block_size_ <= 0 ) throw std::invalid_argument( "Dataset block size must be positive." );
+            max_blocks_ = ( static_cast<int>( dataset_.size() ) - 1 ) / block_size_;
+        }
+
+        const std::map<int,int>& GetVocabulary() const
+        {
+            return vocabulary_; // filled by ReadVocabulary(), which the constructor calls
+        }
 
         /// <summary>
@@ -111,23 +119,21 @@ namespace Mila::Dnn::Data
 
     private:
 
-        int ReadDataset( const DatasetType datasetType )
+        void ReadVocabulary()
         {
-            H5::H5DatasetReader ds_reader = H5::H5DatasetReader( dataset_path_.string() );
-            
-            //ds_reader.ReadDataset<int>( "vocabulary_ds", dataset_);
-            ds_reader.ReadDataset<char>( to_string( datasetType ), dataset_ );
+            H5::H5DatasetReader ds_reader = H5::H5DatasetReader( path_.string() );
 
-            int dataset_size = dataset_.size();
-             
-            max_blocks_ = (dataset_size - 1) / block_size_;
+            std::vector<int> vocabulary_vector;
+            ds_reader.ReadDataset<int>( to_string( DatasetType::vocabulary ), vocabulary_vector );
 
-            return 0;
+            for ( size_t i = 0; i + 1 < vocabulary_vector.size(); i += 2 ) { // flat key, value pairs; a trailing unpaired entry is ignored
+                vocabulary_[ vocabulary_vector[ i ] ] = vocabulary_vector[ i + 1 ];
+            }
         }
 
     private:
 
-        fs::path dataset_path_;
+        fs::path path_;
 
         int batch_size_;
         int sequence_length_;
@@ -138,7 +144,7 @@ namespace Mila::Dnn::Data
         int max_blocks_ = 0;
         int block_size_ = 0;
 
-        std::map<char, int> vocabulary_;
+        std::map<int, int> vocabulary_;
         std::vector<char> dataset_;
     };
 }
\ No newline at end of file
diff --git a/Mila/Source/Dnn/Data/H5/DataH5DatasetWriter.ixx b/Mila/Source/Dnn/Data/H5/DataH5DatasetWriter.ixx
index b8acbe2..18a04d2 100644
--- a/Mila/Source/Dnn/Data/H5/DataH5DatasetWriter.ixx
+++ b/Mila/Source/Dnn/Data/H5/DataH5DatasetWriter.ixx
@@ -36,18 +36,6 @@ using namespace H5;
 
 namespace Mila::Dnn::Data::H5
 {
-    /*template<typename TElement>
-    const PredType& get_data_type();
-
-    template<>
-    const PredType& get_data_type<float>() { return PredType::NATIVE_FLOAT; }
-
-    template<>
-    const PredType& get_data_type<int>() { return PredType::NATIVE_INT; }
-
-    template<>
-    const PredType& get_data_type<char>() { return PredType::NATIVE_CHAR; }*/
-
     export class H5DatasetWriter
     {
     public:
@@ -67,7 +55,7 @@ namespace Mila::Dnn::Data::H5
                 DataSpace dataspace( 1, splits_dimsf );
 
                 //PredType predType = get_data_type<TElement>();
-                IntType datatype( get_data_type<TElement>() ); // PredType::NATIVE_UCHAR ); 
+                IntType datatype( get_data_type<TElement>() );
                 datatype.setOrder( H5T_ORDER_LE );
 
                 /*
@@ -81,7 +69,7 @@ namespace Mila::Dnn::Data::H5
                  * Write the data to the dataset using default memory space, file
                  * space, and transfer properties.
                  */
-                dataset.write( data.data(), get_data_type<TElement>() ); // PredType::NATIVE_UCHAR );
+                dataset.write( data.data(), get_data_type<TElement>() );
 
             } // end of try block
 
diff --git a/Mila/Source/Dnn/Dnn.ixx b/Mila/Source/Dnn/Dnn.ixx
new file mode 100644
index 0000000..803ff65
--- /dev/null
+++ b/Mila/Source/Dnn/Dnn.ixx
@@ -0,0 +1,8 @@
+export module Dnn;
+
+export import Dnn.Model;
+export import Dnn.ModelBuilder;
+export import Dnn.ModelOptions;
+
+export import Dnn.RnnOpDescriptor;
+export import Dnn.DropoutDescriptor;
\ No newline at end of file
diff --git a/Mila/Source/Dnn/MilaDnn.ixx b/Mila/Source/Dnn/MilaDnn.ixx
deleted file mode 100644
index f24335a..0000000
--- a/Mila/Source/Dnn/MilaDnn.ixx
+++ /dev/null
@@ -1,3 +0,0 @@
-export module Mila.Dnn;
-
-export void MyFunc();
\ No newline at end of file
diff --git a/Mila/Source/Mila.ixx b/Mila/Source/Mila.ixx
index f797da9..708ef00 100644
--- a/Mila/Source/Mila.ixx
+++ b/Mila/Source/Mila.ixx
@@ -21,8 +21,8 @@
 
 export module Mila;
 
-export import Mila.Core;
-export import Mila.Dnn;
+export import Core;
+export import Dnn;
 
 //export import Mila.Cudnn;
 
@@ -32,7 +32,7 @@ export namespace Mila {
 	/// Gets the Mila API version.
 	/// </summary>
 	export Core::Version GetAPIVersion() {
-		return Core::Version{0, 9, 3};
+		return Core::Version{0, 9, 4};
 	}
 }
 
diff --git a/MilaTest/Dataset/Encoding.cpp b/MilaTest/Dataset/Encoding.cpp
new file mode 100644
index 0000000..9310f08
--- /dev/null
+++ b/MilaTest/Dataset/Encoding.cpp
@@ -0,0 +1,21 @@
+#include "gtest/gtest.h"
+#include <iostream>
+#include <vector>
+
+
+import Data.CategoryVectorEncoder;
+
+using namespace Mila::Dnn::Data;
+
+namespace Mila::Test::Data
+{
+    TEST( Dataset, CategoryVector )
+    {
+        std::vector<int> input = { 2, 3, 1 };
+        auto transformer = CategoryVectorEncoder<float>( 3, 1.0 );
+
+        auto output = transformer.Convert( input );
+        // 3 inputs encoded with vector size k = 3; EXPECT_EQ reports both values on failure.
+        EXPECT_EQ( output.size(), 9u );
+    };
+}
\ No newline at end of file
diff --git a/MilaTest/Dataset/Generator.cpp b/MilaTest/Dataset/Generator.cpp
new file mode 100644
index 0000000..7343f35
--- /dev/null
+++ b/MilaTest/Dataset/Generator.cpp
@@ -0,0 +1,26 @@
+#include "gtest/gtest.h"
+#include <iostream>
+#include <filesystem>
+
+using std::filesystem::current_path;
+
+import Data.CharDatasetGenerator;
+import Data.Dataset;
+import Data.DatasetType;
+
+using namespace Mila::Dnn::Data;
+
+namespace Mila::Test::Data
+{
+    TEST( Dataset, GenerateDataset )
+    {
+        std::cout << "Current working directory: " << current_path() << std::endl;
+        // The input text file is looked up in a Data folder under the current working directory.
+        auto filepath = current_path();
+        filepath += "\\Data\\tiny-shakespeare.txt";
+
+        CharDatasetGenerator dataset_generator = CharDatasetGenerator( filepath );
+        // No explicit assertion is made; an exception from the generator is the only failure signal here.
+        dataset_generator.GenerateDataset();
+    };
+}
\ No newline at end of file
diff --git a/MilaTest/Dataset/Loader.cpp b/MilaTest/Dataset/Loader.cpp
index 94f8cf5..0d127ed 100644
--- a/MilaTest/Dataset/Loader.cpp
+++ b/MilaTest/Dataset/Loader.cpp
@@ -4,29 +4,14 @@
 
 using std::filesystem::current_path;
 
-import Data.TextToDataset;
-import Data.DatasetLoader;
-import Data.CategoryToVectorEncoder;
+import Data.Dataset;
+import Data.DatasetType;
 
 using namespace Mila::Dnn::Data;
 
 namespace Mila::Test::Data
 {
-    TEST( Dataset, TextToDataset )
-    {
-        std::cout << "Current working directory: " << current_path() << std::endl;
-
-        auto filepath = current_path();
-        filepath += "\\Data\\tiny-shakespeare.txt";
-
-        TextToDataset h5Converter = TextToDataset( filepath );
-
-        h5Converter.CreateDataset();
-
-        //EXPECT_TRUE( numErrors != 0 );
-    };
-
-    TEST( Dataset, SequenceLoader )
+    TEST( Dataset, Dataset_ReadsAllTrainingBlocks )
     {
         std::cout << "Current working directory: " << current_path() << std::endl;
 
@@ -36,30 +21,18 @@ namespace Mila::Test::Data
         int batch_size = 10;
         int sequence_length = 10;
 
-        DatasetLoader loader = DatasetLoader(
-            DatasetType::training, 
-            filepath,
-            batch_size, 
-            sequence_length );
+        Dataset dataset = Dataset( filepath, batch_size, sequence_length );
+
+        dataset.Load( DatasetType::training );
 
         int blocks_read = 0;
-        while (!loader.EndOfDataset())
+        while (!dataset.EndOfDataset())
         {
-            XYPair samples = loader.NextBlock();
+            XYPair samples = dataset.NextBlock();
 
             blocks_read++;
         }
 
-        EXPECT_EQ( blocks_read, loader.BlockCount() );
-    };
-
-    TEST( Dataset, CategoryVector )
-    {
-        std::vector<int> input = { 2, 3, 1 };
-        auto transformer = CategoryToVectorEncoder<float>( 3, 1.0 );
-
-        auto output = transformer.Convert( input );
-        
-        EXPECT_TRUE( output.size() == 9);
+        EXPECT_EQ( blocks_read, dataset.BlockCount() );
     };
 }
\ No newline at end of file
diff --git a/MilaTest/MilaTest.vcxproj b/MilaTest/MilaTest.vcxproj
index f5c1510..9e81991 100644
--- a/MilaTest/MilaTest.vcxproj
+++ b/MilaTest/MilaTest.vcxproj
@@ -37,6 +37,8 @@
   <ItemGroup>
     <ClCompile Include="Cuda\Device.cpp" />
     <ClCompile Include="Cudnn\Utils.cpp" />
+    <ClCompile Include="Dataset\Encoding.cpp" />
+    <ClCompile Include="Dataset\Generator.cpp" />
     <ClCompile Include="Dataset\Loader.cpp" />
     <ClCompile Include="Dnn\Context.cpp">
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
diff --git a/MilaTest/MilaTest.vcxproj.filters b/MilaTest/MilaTest.vcxproj.filters
index 00e747d..890af0d 100644
--- a/MilaTest/MilaTest.vcxproj.filters
+++ b/MilaTest/MilaTest.vcxproj.filters
@@ -29,6 +29,8 @@
     <ClCompile Include="Dataset\Loader.cpp">
       <Filter>Dataset</Filter>
     </ClCompile>
+    <ClCompile Include="Dataset\Generator.cpp"><Filter>Dataset</Filter></ClCompile>
+    <ClCompile Include="Dataset\Encoding.cpp"><Filter>Dataset</Filter></ClCompile>
   </ItemGroup>
   <ItemGroup>
     <None Include="packages.config" />
diff --git a/Samples/MilaPackage/MilaPackage.vcxproj b/Samples/MilaPackage/MilaPackage.vcxproj
index 9cbebcd..619dcfa 100644
--- a/Samples/MilaPackage/MilaPackage.vcxproj
+++ b/Samples/MilaPackage/MilaPackage.vcxproj
@@ -86,12 +86,12 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="..\..\packages\Achilles.Software.Mila.0.9.3-alpha\build\native\Achilles.Software.Mila.targets" Condition="Exists('..\..\packages\Achilles.Software.Mila.0.9.3-alpha\build\native\Achilles.Software.Mila.targets')" />
+    <Import Project="..\..\packages\Achilles.Mila.0.9.4-alpha\build\native\Achilles.Mila.targets" Condition="Exists('..\..\packages\Achilles.Mila.0.9.4-alpha\build\native\Achilles.Mila.targets')" />
   </ImportGroup>
   <Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
     <PropertyGroup>
       <ErrorText>This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them.  For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}.</ErrorText>
     </PropertyGroup>
-    <Error Condition="!Exists('..\..\packages\Achilles.Software.Mila.0.9.3-alpha\build\native\Achilles.Software.Mila.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\..\packages\Achilles.Software.Mila.0.9.3-alpha\build\native\Achilles.Software.Mila.targets'))" />
+    <Error Condition="!Exists('..\..\packages\Achilles.Mila.0.9.4-alpha\build\native\Achilles.Mila.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\..\packages\Achilles.Mila.0.9.4-alpha\build\native\Achilles.Mila.targets'))" />
   </Target>
 </Project>
\ No newline at end of file
diff --git a/Samples/MilaPackage/packages.config b/Samples/MilaPackage/packages.config
index 4197c8e..3104de1 100644
--- a/Samples/MilaPackage/packages.config
+++ b/Samples/MilaPackage/packages.config
@@ -1,4 +1,4 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
 <packages>
-  <package id="Achilles.Software.Mila" version="0.9.3-alpha" targetFramework="native" />
+  <package id="Achilles.Mila" version="0.9.4-alpha" targetFramework="native" />
 </packages>
\ No newline at end of file