From 8994d4714b188f996a9a6dfe61b53d43269e07fc Mon Sep 17 00:00:00 2001 From: Todd Thomson Date: Sat, 22 Jan 2022 20:06:18 -0800 Subject: [PATCH] Progress towards beta release #1 --- Mila/Mila.nuspec | 1 + Mila/Mila.vcxproj | 14 ++-- Mila/Mila.vcxproj.filters | 17 ++--- Mila/{Mila.Core.ixx => Source/Core/Core.ixx} | 2 +- .../Dnn/CuDNN/{MilaCuDnn.ixx => CuDnn.ixx} | 2 +- Mila/Source/Dnn/Data/Data.ixx | 27 ++++++++ ...oder.ixx => DataCategoryVectorEncoder.ixx} | 6 +- ...taset.ixx => DataCharDatasetGenerator.ixx} | 44 ++++++------ ...{DataDatasetLoader.ixx => DataDataset.ixx} | 68 ++++++++++--------- .../Dnn/Data/H5/DataH5DatasetWriter.ixx | 16 +---- Mila/Source/Dnn/Dnn.ixx | 8 +++ Mila/Source/Dnn/MilaDnn.ixx | 3 - Mila/Source/Mila.ixx | 6 +- MilaTest/Dataset/Encoding.cpp | 21 ++++++ MilaTest/Dataset/Generator.cpp | 26 +++++++ MilaTest/Dataset/Loader.cpp | 45 +++--------- MilaTest/MilaTest.vcxproj | 2 + MilaTest/MilaTest.vcxproj.filters | 2 + Samples/MilaPackage/MilaPackage.vcxproj | 4 +- Samples/MilaPackage/packages.config | 2 +- 20 files changed, 186 insertions(+), 130 deletions(-) rename Mila/{Mila.Core.ixx => Source/Core/Core.ixx} (95%) rename Mila/Source/Dnn/CuDNN/{MilaCuDnn.ixx => CuDnn.ixx} (96%) create mode 100644 Mila/Source/Dnn/Data/Data.ixx rename Mila/Source/Dnn/Data/{DataCategoryToVectorEncoder.ixx => DataCategoryVectorEncoder.ixx} (94%) rename Mila/Source/Dnn/Data/{DataTextToDataset.ixx => DataCharDatasetGenerator.ixx} (84%) rename Mila/Source/Dnn/Data/{DataDatasetLoader.ixx => DataDataset.ixx} (74%) create mode 100644 Mila/Source/Dnn/Dnn.ixx delete mode 100644 Mila/Source/Dnn/MilaDnn.ixx create mode 100644 MilaTest/Dataset/Encoding.cpp create mode 100644 MilaTest/Dataset/Generator.cpp diff --git a/Mila/Mila.nuspec b/Mila/Mila.nuspec index d41d5d6..be1b302 100644 --- a/Mila/Mila.nuspec +++ b/Mila/Mila.nuspec @@ -19,6 +19,7 @@ + diff --git a/Mila/Mila.vcxproj b/Mila/Mila.vcxproj index 3021f7c..00668c8 100644 --- a/Mila/Mila.vcxproj +++ b/Mila/Mila.vcxproj @@ -17,7 +17,7 @@ - + @@ -29,7 +29,7 @@ - + @@ -40,12 +40,14 @@ + + - + @@ -55,13 +57,12 @@ - - - + + @@ -129,6 +130,7 @@ _DEBUG;_LIB;%(PreprocessorDefinitions);WIN32;_WINDOWS $(IntDir) $(IntDir)%(FileName).ifc + false %(PreprocessorDefinitions);WIN32;_DEBUG;_WINDOWS;CMAKE_INTDIR=\"Debug\" diff --git a/Mila/Mila.vcxproj.filters b/Mila/Mila.vcxproj.filters index 96c648c..23e9f38 100644 --- a/Mila/Mila.vcxproj.filters +++ b/Mila/Mila.vcxproj.filters @@ -29,7 +29,7 @@ Dnn\CuDnn - + Dnn\CuDnn @@ -101,7 +101,7 @@ Dnn - + Dnn @@ -137,31 +137,32 @@ Dnn\Data\H5 - - Dnn\Data - Dnn\Data Dnn\Data - + Dnn\Data - + Dnn\Data Core - + Core Dnn\Data\H5 + + Dnn\Data + + diff --git a/Mila/Mila.Core.ixx b/Mila/Source/Core/Core.ixx similarity index 95% rename from Mila/Mila.Core.ixx rename to Mila/Source/Core/Core.ixx index 3141865..6240a6b 100644 --- a/Mila/Mila.Core.ixx +++ b/Mila/Source/Core/Core.ixx @@ -19,6 +19,6 @@ * DEALINGS IN THE SOFTWARE. */ -export module Mila.Core; +export module Core; export import Core.Version; \ No newline at end of file diff --git a/Mila/Source/Dnn/CuDNN/MilaCuDnn.ixx b/Mila/Source/Dnn/CuDNN/CuDnn.ixx similarity index 96% rename from Mila/Source/Dnn/CuDNN/MilaCuDnn.ixx rename to Mila/Source/Dnn/CuDNN/CuDnn.ixx index 390fe5d..f041e41 100644 --- a/Mila/Source/Dnn/CuDNN/MilaCuDnn.ixx +++ b/Mila/Source/Dnn/CuDNN/CuDnn.ixx @@ -19,7 +19,7 @@ * DEALINGS IN THE SOFTWARE. */ -export module Mila.Cudnn; +export module Cudnn; export import CuDnn.Context; export import CuDnn.Descriptor; diff --git a/Mila/Source/Dnn/Data/Data.ixx b/Mila/Source/Dnn/Data/Data.ixx new file mode 100644 index 0000000..0d66912 --- /dev/null +++ b/Mila/Source/Dnn/Data/Data.ixx @@ -0,0 +1,27 @@ +/* + * Copyright 2021 Todd Thomson, Achilles Software. All rights reserved. + * + * Please refer to the Mila end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +export module Data; + +export import Data.Dataset; +export import Data.DatasetType; +export import Data.CharDatasetGenerator; +export import Data.CategoryVectorEncoder; diff --git a/Mila/Source/Dnn/Data/DataCategoryToVectorEncoder.ixx b/Mila/Source/Dnn/Data/DataCategoryVectorEncoder.ixx similarity index 94% rename from Mila/Source/Dnn/Data/DataCategoryToVectorEncoder.ixx rename to Mila/Source/Dnn/Data/DataCategoryVectorEncoder.ixx index 1fcdf59..26edf21 100644 --- a/Mila/Source/Dnn/Data/DataCategoryToVectorEncoder.ixx +++ b/Mila/Source/Dnn/Data/DataCategoryVectorEncoder.ixx @@ -25,7 +25,7 @@ module; #include #include -export module Data.CategoryToVectorEncoder; +export module Data.CategoryVectorEncoder; namespace Mila::Dnn::Data { @@ -33,7 +33,7 @@ namespace Mila::Dnn::Data /// A category to vector (One-hot) encoder /// export template - class CategoryToVectorEncoder + class CategoryVectorEncoder { public: @@ -42,7 +42,7 @@ namespace Mila::Dnn::Data /// /// Size of vector /// Numeric value to use for marking category - CategoryToVectorEncoder( size_t k, TElement value ) + CategoryVectorEncoder( size_t k, TElement value ) : k_( k ), value_( value ) { k_vector_ = std::vector( k_, {} ); diff --git a/Mila/Source/Dnn/Data/DataTextToDataset.ixx b/Mila/Source/Dnn/Data/DataCharDatasetGenerator.ixx similarity index 84% rename from Mila/Source/Dnn/Data/DataTextToDataset.ixx rename to Mila/Source/Dnn/Data/DataCharDatasetGenerator.ixx index c036d61..8c1ed8d 100644 --- a/Mila/Source/Dnn/Data/DataTextToDataset.ixx +++ b/Mila/Source/Dnn/Data/DataCharDatasetGenerator.ixx @@ -49,8 +49,9 @@ module; #include #include -export module Data.TextToDataset; +export module Data.CharDatasetGenerator; +import Data.DatasetType; import Data.H5DatasetWriter; namespace fs = std::filesystem; @@ -61,18 +62,18 @@ namespace Mila::Dnn::Data /// /// A class to convert a text file to an H5 formatted dataset. /// - export class TextToDataset + export class CharDatasetGenerator { public: /// /// Creates a dataset from a text file with the specified test, validation - /// and training splits. + /// splits. /// /// /// /// - TextToDataset( const fs::path& file_path, float test_split = 0.1f, float validation_split = 0.1f ) + CharDatasetGenerator( const fs::path& file_path, float test_split = 0.1f, float validation_split = 0.1f ) : file_path_( file_path), test_split_( test_split ), validation_split_( validation_split ) { if ( test_split + validation_split >= 1.0f ) @@ -87,9 +88,9 @@ namespace Mila::Dnn::Data } /// - /// + /// Generates a dataset from the text file. /// - void CreateDataset() + void GenerateDataset() { std::ifstream text_file( file_path_ ); @@ -99,8 +100,8 @@ namespace Mila::Dnn::Data } // First go through the file once to determine its size and to - // build the vocabulary token map. - std::map tokens = {}; + // build the vocabulary map. + std::map vocabulary = {}; int total_chars = 0; int index = 1; std::string line; @@ -111,19 +112,19 @@ namespace Mila::Dnn::Data for ( char c : line ) { - auto it = tokens.find( c ); - if ( it == tokens.end() ) + auto it = vocabulary.find( c ); + if ( it == vocabulary.end() ) { - tokens.insert( {c, index++} ); + vocabulary.insert( {c, index++} ); } } } std::cout << "File size: " << std::to_string( total_chars ) << std::endl; - std::cout << "Vocab size: " << std::to_string( tokens.size() ) << std::endl; + std::cout << "Vocab size: " << std::to_string( vocabulary.size() ) << std::endl; - vocabulary_size = tokens.size(); - text_size = total_chars; + vocabulary_size_ = vocabulary.size(); + text_size_ = total_chars; // Now we can figure out the split sizes int validation_size = validation_split_ * total_chars; @@ -152,7 +153,7 @@ namespace Mila::Dnn::Data for ( char& c : line ) { - splits[ split_index ][ split_position++ ] = tokens[ c ]; + splits[ split_index ][ split_position++ ] = vocabulary[ c ]; if ( split_position >= splits[ split_index ].size() ) { @@ -172,15 +173,14 @@ namespace Mila::Dnn::Data h5Writer.WriteDataset( "validation_ds", splits[ VALIDATION_SET ] ); h5Writer.WriteDataset( "testing_ds", splits[ TESTING_SET ] ); - // Convert the m + // Write the vocabulary map to a linear vector std::vector vocabulary_vector; - - for ( const auto& [key, value ] : tokens ) { + for ( const auto& [key, value ] : vocabulary ) { vocabulary_vector.push_back( key ); vocabulary_vector.push_back( value ); } - h5Writer.WriteDataset( "vocabulary_ds", vocabulary_vector ); + h5Writer.WriteDataset( to_string( DatasetType::vocabulary ), vocabulary_vector ); } private: @@ -190,8 +190,10 @@ namespace Mila::Dnn::Data float validation_split_ = 0.1f; float test_split_ = 0.1f; - size_t vocabulary_size = 0; - size_t text_size = 0; + std::map vocabulary_ = {}; + + size_t vocabulary_size_ = 0; + size_t text_size_ = 0; const int TRAINING_SET = 0; const int VALIDATION_SET = 1; diff --git a/Mila/Source/Dnn/Data/DataDatasetLoader.ixx b/Mila/Source/Dnn/Data/DataDataset.ixx similarity index 74% rename from Mila/Source/Dnn/Data/DataDatasetLoader.ixx rename to Mila/Source/Dnn/Data/DataDataset.ixx index 8d49c79..38b6668 100644 --- a/Mila/Source/Dnn/Data/DataDatasetLoader.ixx +++ b/Mila/Source/Dnn/Data/DataDataset.ixx @@ -22,13 +22,14 @@ module; #include #include +#include #include #include -export module Data.DatasetLoader; +export module Data.Dataset; -import Data.H5DatasetReader; import Data.DatasetType; +import Data.H5DatasetReader; namespace fs = std::filesystem; @@ -39,34 +40,41 @@ namespace Mila::Dnn::Data /// export using XYPair = std::pair, std::vector>; - /// - /// Character dataset loader. - /// - export class DatasetLoader + export class Dataset { public: - /// - /// DatasetLoader constructor - /// - /// - /// - /// - /// - DatasetLoader( const DatasetType datasetType, fs::path datasetPath, int batchSize, int sequenceLength ) + Dataset( const fs::path& datasetPath, int batchSize, int sequenceLength ) + : path_( datasetPath ), batch_size_( batchSize ), sequence_length_( sequenceLength ) { - if ( !std::filesystem::exists( datasetPath ) ) + if ( !fs::exists( path_ ) ) { throw std::invalid_argument( "Dataset file does not exist." ); } - dataset_path_ = datasetPath; - batch_size_ = batchSize; - sequence_length_ = sequenceLength; + block_size_ = batch_size_ * sequence_length_; - block_size_ = batchSize * sequenceLength; + ReadVocabulary(); + } + + /// + /// Loads the specified dataset type. + /// + /// + void Load( const DatasetType datasetType ) + { + H5::H5DatasetReader ds_reader = H5::H5DatasetReader( path_.string() ); - ReadDataset( datasetType ); + ds_reader.ReadDataset( to_string( datasetType ), dataset_ ); + + int dataset_size = dataset_.size(); + + max_blocks_ = (dataset_size - 1) / block_size_; + } + + const std::map& GetVocabulary() + { + return vocabulary_; } /// @@ -111,23 +119,21 @@ namespace Mila::Dnn::Data private: - int ReadDataset( const DatasetType datasetType ) + void ReadVocabulary() { - H5::H5DatasetReader ds_reader = H5::H5DatasetReader( dataset_path_.string() ); - - //ds_reader.ReadDataset( "vocabulary_ds", dataset_); - ds_reader.ReadDataset( to_string( datasetType ), dataset_ ); + H5::H5DatasetReader ds_reader = H5::H5DatasetReader( path_.string() ); - int dataset_size = dataset_.size(); - - max_blocks_ = (dataset_size - 1) / block_size_; + std::vector vocabulary_vector; + ds_reader.ReadDataset( to_string( DatasetType::vocabulary ), vocabulary_vector ); - return 0; + for ( auto it = vocabulary_vector.begin(); it != vocabulary_vector.end(); it++ ) { + vocabulary_[ *it++ ] = *it; + } } private: - fs::path dataset_path_; + fs::path path_; int batch_size_; int sequence_length_; @@ -138,7 +144,7 @@ namespace Mila::Dnn::Data int max_blocks_ = 0; int block_size_ = 0; - std::map vocabulary_; + std::map vocabulary_; std::vector dataset_; }; } \ No newline at end of file diff --git a/Mila/Source/Dnn/Data/H5/DataH5DatasetWriter.ixx b/Mila/Source/Dnn/Data/H5/DataH5DatasetWriter.ixx index b8acbe2..18a04d2 100644 --- a/Mila/Source/Dnn/Data/H5/DataH5DatasetWriter.ixx +++ b/Mila/Source/Dnn/Data/H5/DataH5DatasetWriter.ixx @@ -36,18 +36,6 @@ using namespace H5; namespace Mila::Dnn::Data::H5 { - /*template - const PredType& get_data_type(); - - template<> - const PredType& get_data_type() { return PredType::NATIVE_FLOAT; } - - template<> - const PredType& get_data_type() { return PredType::NATIVE_INT; } - - template<> - const PredType& get_data_type() { return PredType::NATIVE_CHAR; }*/ - export class H5DatasetWriter { public: @@ -67,7 +55,7 @@ namespace Mila::Dnn::Data::H5 DataSpace dataspace( 1, splits_dimsf ); //PredType predType = get_data_type(); - IntType datatype( get_data_type() ); // PredType::NATIVE_UCHAR ); + IntType datatype( get_data_type() ); datatype.setOrder( H5T_ORDER_LE ); /* @@ -81,7 +69,7 @@ namespace Mila::Dnn::Data::H5 * Write the data to the dataset using default memory space, file * space, and transfer properties. */ - dataset.write( data.data(), get_data_type() ); // PredType::NATIVE_UCHAR ); + dataset.write( data.data(), get_data_type() ); } // end of try block diff --git a/Mila/Source/Dnn/Dnn.ixx b/Mila/Source/Dnn/Dnn.ixx new file mode 100644 index 0000000..803ff65 --- /dev/null +++ b/Mila/Source/Dnn/Dnn.ixx @@ -0,0 +1,8 @@ +export module Dnn; + +export import Dnn.Model; +export import Dnn.ModelBuilder; +export import Dnn.ModelOptions; + +export import Dnn.RnnOpDescriptor; +export import Dnn.DropoutDescriptor; \ No newline at end of file diff --git a/Mila/Source/Dnn/MilaDnn.ixx b/Mila/Source/Dnn/MilaDnn.ixx deleted file mode 100644 index f24335a..0000000 --- a/Mila/Source/Dnn/MilaDnn.ixx +++ /dev/null @@ -1,3 +0,0 @@ -export module Mila.Dnn; - -export void MyFunc(); \ No newline at end of file diff --git a/Mila/Source/Mila.ixx b/Mila/Source/Mila.ixx index f797da9..708ef00 100644 --- a/Mila/Source/Mila.ixx +++ b/Mila/Source/Mila.ixx @@ -21,8 +21,8 @@ export module Mila; -export import Mila.Core; -export import Mila.Dnn; +export import Core; +export import Dnn; //export import Mila.Cudnn; @@ -32,7 +32,7 @@ export namespace Mila { /// Gets the Mila API version. /// export Core::Version GetAPIVersion() { - return Core::Version{0, 9, 3}; + return Core::Version{0, 9, 4}; } } diff --git a/MilaTest/Dataset/Encoding.cpp b/MilaTest/Dataset/Encoding.cpp new file mode 100644 index 0000000..9310f08 --- /dev/null +++ b/MilaTest/Dataset/Encoding.cpp @@ -0,0 +1,21 @@ +#include "gtest/gtest.h" +#include +#include + + +import Data.CategoryVectorEncoder; + +using namespace Mila::Dnn::Data; + +namespace Mila::Test::Data +{ + TEST( Dataset, CategoryVector ) + { + std::vector input = { 2, 3, 1 }; + auto transformer = CategoryVectorEncoder( 3, 1.0 ); + + auto output = transformer.Convert( input ); + + EXPECT_TRUE( output.size() == 9); + }; +} \ No newline at end of file diff --git a/MilaTest/Dataset/Generator.cpp b/MilaTest/Dataset/Generator.cpp new file mode 100644 index 0000000..7343f35 --- /dev/null +++ b/MilaTest/Dataset/Generator.cpp @@ -0,0 +1,26 @@ +#include "gtest/gtest.h" +#include +#include + +using std::filesystem::current_path; + +import Data.CharDatasetGenerator; +import Data.Dataset; +import Data.DatasetType; + +using namespace Mila::Dnn::Data; + +namespace Mila::Test::Data +{ + TEST( Dataset, GenerateDataset ) + { + std::cout << "Current working directory: " << current_path() << std::endl; + + auto filepath = current_path(); + filepath += "\\Data\\tiny-shakespeare.txt"; + + CharDatasetGenerator dataset_generator = CharDatasetGenerator( filepath ); + + dataset_generator.GenerateDataset(); + }; +} \ No newline at end of file diff --git a/MilaTest/Dataset/Loader.cpp b/MilaTest/Dataset/Loader.cpp index 94f8cf5..0d127ed 100644 --- a/MilaTest/Dataset/Loader.cpp +++ b/MilaTest/Dataset/Loader.cpp @@ -4,29 +4,14 @@ using std::filesystem::current_path; -import Data.TextToDataset; -import Data.DatasetLoader; -import Data.CategoryToVectorEncoder; +import Data.Dataset; +import Data.DatasetType; using namespace Mila::Dnn::Data; namespace Mila::Test::Data { - TEST( Dataset, TextToDataset ) - { - std::cout << "Current working directory: " << current_path() << std::endl; - - auto filepath = current_path(); - filepath += "\\Data\\tiny-shakespeare.txt"; - - TextToDataset h5Converter = TextToDataset( filepath ); - - h5Converter.CreateDataset(); - - //EXPECT_TRUE( numErrors != 0 ); - }; - - TEST( Dataset, SequenceLoader ) + TEST( Dataset, Dataset_ReadsAllTrainingBlocks ) { std::cout << "Current working directory: " << current_path() << std::endl; @@ -36,30 +21,18 @@ namespace Mila::Test::Data int batch_size = 10; int sequence_length = 10; - DatasetLoader loader = DatasetLoader( - DatasetType::training, - filepath, - batch_size, - sequence_length ); + Dataset dataset = Dataset( filepath, batch_size, sequence_length ); + + dataset.Load( DatasetType::training ); int blocks_read = 0; - while (!loader.EndOfDataset()) + while (!dataset.EndOfDataset()) { - XYPair samples = loader.NextBlock(); + XYPair samples = dataset.NextBlock(); blocks_read++; } - EXPECT_EQ( blocks_read, loader.BlockCount() ); - }; - - TEST( Dataset, CategoryVector ) - { - std::vector input = { 2, 3, 1 }; - auto transformer = CategoryToVectorEncoder( 3, 1.0 ); - - auto output = transformer.Convert( input ); - - EXPECT_TRUE( output.size() == 9); + EXPECT_EQ( blocks_read, dataset.BlockCount() ); }; } \ No newline at end of file diff --git a/MilaTest/MilaTest.vcxproj b/MilaTest/MilaTest.vcxproj index f5c1510..9e81991 100644 --- a/MilaTest/MilaTest.vcxproj +++ b/MilaTest/MilaTest.vcxproj @@ -37,6 +37,8 @@ + + false diff --git a/MilaTest/MilaTest.vcxproj.filters b/MilaTest/MilaTest.vcxproj.filters index 00e747d..890af0d 100644 --- a/MilaTest/MilaTest.vcxproj.filters +++ b/MilaTest/MilaTest.vcxproj.filters @@ -29,6 +29,8 @@ Dataset + + diff --git a/Samples/MilaPackage/MilaPackage.vcxproj b/Samples/MilaPackage/MilaPackage.vcxproj index 9cbebcd..619dcfa 100644 --- a/Samples/MilaPackage/MilaPackage.vcxproj +++ b/Samples/MilaPackage/MilaPackage.vcxproj @@ -86,12 +86,12 @@ - + This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. - + \ No newline at end of file diff --git a/Samples/MilaPackage/packages.config b/Samples/MilaPackage/packages.config index 4197c8e..3104de1 100644 --- a/Samples/MilaPackage/packages.config +++ b/Samples/MilaPackage/packages.config @@ -1,4 +1,4 @@  - + \ No newline at end of file