diff --git a/Project.toml b/Project.toml
index 9f3b89b..381c0e8 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,16 +4,19 @@ authors = ["Bernard Brenyah", "Andrey Oskin"]
 version = "0.1.0"
 
 [deps]
+Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
+MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 
 [compat]
-StatsBase = "0.32"
-julia = "1.3"
+StatsBase = "0.32, 0.33"
+julia = "1.3, 1.4"
 
 [extras]
+MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Suppressor = "fd094767-a336-5f1f-9728-57cf17d0bbfb"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "Random", "Suppressor"]
+test = ["Test", "Random", "Suppressor", "MLJBase"]
diff --git a/README.md b/README.md
index 98132f8..3d9f522 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ ________________________________________________________________________________
 _________________________________________________________________________________________________________
 
 ## Table Of Contents
+
 1. [Documentation](#Documentation)
 2. [Installation](#Installation)
 3. [Features](#Features)
@@ -18,6 +19,7 @@ ________________________________________________________________________________
 _________________________________________________________________________________________________________
 
 ### Documentation
+
 - Stable Documentation: [![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://PyDataBlog.github.io/ParallelKMeans.jl/stable)
 - Experimental Documentation: [![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://PyDataBlog.github.io/ParallelKMeans.jl/dev)
 
@@ -25,6 +27,7 @@ ________________________________________________________________________________
 _________________________________________________________________________________________________________
 
 ### Installation
+
 You can grab the latest stable version of this package by simply running the following in Julia.
 
 Don't forget to enter Julia's package manager with `]`
 
@@ -39,9 +42,11 @@ pkg> dev git@github.com:PyDataBlog/ParallelKMeans.jl.git
 ```
 
 Don't forget to check out the experimental branch, and you are good to go with bleeding-edge features and breaks!
+
 ```bash
 git checkout experimental
 ```
+
 _________________________________________________________________________________________________________
 
 ### Features
diff --git a/docs/Manifest.toml b/docs/Manifest.toml
index e379175..94d70fb 100644
--- a/docs/Manifest.toml
+++ b/docs/Manifest.toml
@@ -19,9 +19,9 @@ version = "0.8.1"
 
 [[Documenter]]
 deps = ["Base64", "Dates", "DocStringExtensions", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"]
-git-tree-sha1 = "d497bcc45bb98a1fbe19445a774cfafeabc6c6df"
+git-tree-sha1 = "646ebc3db49889ffeb4c36f89e5d82c6a26295ff"
 uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
-version = "0.24.5"
+version = "0.24.7"
 
 [[InteractiveUtils]]
 deps = ["Markdown"]
@@ -51,9 +51,9 @@ uuid = "a63ad114-7e13-5084-954f-fe012c677804"
 
 [[Parsers]]
 deps = ["Dates", "Test"]
-git-tree-sha1 = "d112c19ccca00924d5d3a38b11ae2b4b268dda39"
+git-tree-sha1 = "0c16b3179190d3046c073440d94172cfc3bb0553"
 uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
-version = "0.3.11"
+version = "0.3.12"
 
 [[Pkg]]
 deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"]
diff --git a/docs/src/index.md b/docs/src/index.md
index a81abe1..d858e92 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -5,8 +5,9 @@ Depth = 4
 ```
 
 ## Motivation
+
 It's actually a funny story that led to the development of this package.
-What started off as a personal toy project trying to re-construct the K-Means algorithm in native Julia blew up after a heated discussion on the Julia Discourse forum when I asked for Julia optimizaition tips. Long story short, Julia community is an amazing one! Andrey offered his help and together, we decided to push the speed limits of Julia with a parallel implementation of the most famous clustering algorithm. The initial results were mind blowing so we have decided to tidy up the implementation and share with the world as a maintained Julia pacakge. 
+What started off as a personal toy project trying to reconstruct the K-Means algorithm in native Julia blew up after a heated discussion on the Julia Discourse forum when I asked for Julia optimization tips. Long story short, the Julia community is an amazing one! Andrey offered his help, and together we decided to push the speed limits of Julia with a parallel implementation of the most famous clustering algorithm. The initial results were mind-blowing, so we decided to tidy up the implementation and share it with the world as a maintained Julia package.
 
 Say hello to `ParallelKMeans`!
 
@@ -15,16 +16,18 @@ This package aims to utilize the speed of Julia and parallelization (both CPU &
 In short, we hope this package will eventually mature as the "one stop" shop for everything KMeans on both CPUs and GPUs.
 
 ## K-Means Algorithm Implementation Notes
+
 Since Julia is a column-major language, the input (design matrix) is expected by the package in the following format:
 
 - Design matrix X of size n×m, where the i-th column of X `(X[:, i])` is a single data point in n-dimensional space.
 - Thus, the rows of the design matrix represent the feature space, with the columns representing all the training examples in this feature space.
 
-One of the pitfalls of K-Means algorithm is that it can fall into a local minima. 
+One of the pitfalls of the K-Means algorithm is that it can fall into a local minimum.
 This implementation inherits this problem like every implementation does.
 As a result, it is useful in practice to restart it several times to get the correct results.
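+
+To make the expected layout concrete, here is a minimal sketch (the matrix sizes and variable name are illustrative assumptions, not part of the package API):
+
+```julia
+# hypothetical design matrix: 4 features (rows) × 150 training examples (columns)
+X = rand(4, 150)
+
+X[:, 1]     # the first data point, a vector in 4-dimensional space
+size(X, 1)  # n, the number of features
+size(X, 2)  # m, the number of training examples
+```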
 ## Installation
+
 You can grab the latest stable version of this package from the Julia registry by simply running the following;
 
 *NB:* Don't forget to enter Julia's package manager with `]`
 
@@ -40,19 +43,21 @@ dev git@github.com:PyDataBlog/ParallelKMeans.jl.git
 ```
 
 Don't forget to check out the experimental branch, and you are good to go with bleeding-edge features and breaks!
+
 ```bash
 git checkout experimental
 ```
 
 ## Features
+
 - Lightning-fast implementation of the K-Means clustering algorithm even on a single thread in native Julia.
 - Support for a multi-threaded implementation of the K-Means clustering algorithm.
 - 'K-means++' initialization for faster and better convergence.
 - Modified version of Elkan's triangle inequality to speed up the K-Means algorithm.
 
-
 ## Pending Features
-- [X] Implementation of [Hamerly implementation](https://www.researchgate.net/publication/220906984_Making_k-means_Even_Faster). 
+
+- [X] Implementation of the [Hamerly algorithm](https://www.researchgate.net/publication/220906984_Making_k-means_Even_Faster).
 - [ ] Full implementation of the triangle inequality based on [Elkan - 2003 "Using the Triangle Inequality to Accelerate K-Means"](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf).
 - [ ] Implementation of [Geometric methods to accelerate k-means algorithm](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf).
 - [ ] Support for DataFrame inputs.
@@ -63,9 +68,9 @@ git checkout experimental
 - [ ] Improved Documentation
 - [ ] More benchmark tests
 
-
 ## How To Use
-Taking advantage of Julia's brilliant multiple dispatch system, the package exposes users to a very easy to use API. 
+
+Taking advantage of Julia's brilliant multiple dispatch system, the package exposes a very easy-to-use API.
 
 ```julia
 using ParallelKMeans
@@ -83,7 +88,7 @@ The main design goal is to offer all available variations of the KMeans algorith
 some_results = kmeans([algo], input_matrix, k; kwargs)
 
 # example
-r = kmeans(Lloyd(), X, 3) # same result as the default 
+r = kmeans(Lloyd(), X, 3) # same result as the default
 ```
 
 ```julia
@@ -95,30 +100,31 @@ r.centers # cluster centers
 r.iterations # number of elapsed iterations
 r.converged # whether the procedure converged
 ```
 
-### Supported KMeans algorithm variations. 
-- [Lloyd()](https://cs.nyu.edu/~roweis/csc2515-2006/readings/lloyd57.pdf) 
-- [Hamerly()](https://www.researchgate.net/publication/220906984_Making_k-means_Even_Faster) 
+### Supported KMeans algorithm variations
+
+- [Lloyd()](https://cs.nyu.edu/~roweis/csc2515-2006/readings/lloyd57.pdf)
+- [Hamerly()](https://www.researchgate.net/publication/220906984_Making_k-means_Even_Faster)
 - [Geometric()](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf) - (Coming soon)
-- [Elkan()](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf) - (Coming soon) 
+- [Elkan()](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf) - (Coming soon)
 - [MiniBatch()](https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf) - (Coming soon)
 
-
 ### Practical Usage Examples
+
 Some of the common usage examples of this package are as follows:
 
 #### Clustering With A Desired Number Of Groups
 
-```julia 
+```julia
 using ParallelKMeans, RDatasets, Plots
 
 # load the data
-iris = dataset("datasets", "iris"); 
+iris = dataset("datasets", "iris");
 
 # features to use for clustering
-features = collect(Matrix(iris[:, 1:4])'); 
+features = collect(Matrix(iris[:, 1:4])');
 
 # various artifacts can be accessed from the result, e.g. assigned labels, cost value, etc.
-result = kmeans(features, 3); 
+result = kmeans(features, 3);
 
 # plot with the point color mapped to the assigned cluster index
 scatter(iris.PetalLength, iris.PetalWidth, marker_z=result.assignments,
@@ -129,6 +135,7 @@ scatter(iris.PetalLength, iris.PetalWidth, marker_z=result.assignments,
 ![Image description](iris_example.jpg)
 
 #### Elbow Method For The Selection Of The Optimal Number Of Clusters
+
 ```julia
 using ParallelKMeans
@@ -140,21 +147,18 @@ c = [ParallelKMeans.kmeans(X, i; tol=1e-6, max_iters=300, verbose=false).totalcost for i = 2:10]
 ```
 
 
-
 ## Benchmarks
-Currently, this package is benchmarked against similar implementation in both Python and Julia. All reproducible benchmarks can be found in [ParallelKMeans/extras](https://github.com/PyDataBlog/ParallelKMeans.jl/tree/master/extras) directory. More tests in various languages are planned beyond the initial release version (`0.1.0`). 
-
-*Note*: All benchmark tests are made on the same computer to help eliminate any bias. 
+Currently, this package is benchmarked against similar implementations in both Python and Julia. All reproducible benchmarks can be found in the [ParallelKMeans/extras](https://github.com/PyDataBlog/ParallelKMeans.jl/tree/master/extras) directory. More tests in various languages are planned beyond the initial release version (`0.1.0`).
 
-Currently, the benchmark speed tests are based on the search for optimal number of clusters using the [Elbow Method](https://en.wikipedia.org/wiki/Elbow_method_(clustering)) since this is a practical use case for most practioners employing the K-Means algorithm. 
+*Note*: All benchmark tests are made on the same computer to help eliminate any bias.
 
+Currently, the benchmark speed tests are based on the search for the optimal number of clusters using the [Elbow Method](https://en.wikipedia.org/wiki/Elbow_method_(clustering)) since this is a practical use case for most practitioners employing the K-Means algorithm.
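+
+As a rough illustration, such an elbow-method speed test could be timed as in the sketch below (`BenchmarkTools` and the synthetic `X` are assumptions for illustration; the exact harness lives in the `extras/` directory):
+
+```julia
+using ParallelKMeans, BenchmarkTools
+
+# synthetic design matrix: 10 features × 100_000 examples
+X = rand(10, 100_000)
+
+# time the elbow-method sweep over k = 2:10
+@btime [ParallelKMeans.kmeans($X, i; tol=1e-6, max_iters=300, verbose=false).totalcost for i = 2:10]
+```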
### Benchmark Results
 
 ![benchmark_image.png](benchmark_image.png)
 
-
 _________________________________________________________________________________________________________
 
 | 1 million (ms) | 100k (ms) | 10k (ms) | 1k (ms) | package | language |
@@ -168,12 +172,12 @@
 _________________________________________________________________________________________________________
 
+## Release History
 
-## Release History
 - 0.1.0 Initial release
 
-
 ## Contributing
+
 Ultimately, we see this package as potentially the one-stop shop for everything related to the KMeans algorithm and its speed-up variants. We are open to new implementations and ideas from anyone interested in this project.
 
 Detailed contribution guidelines will be added in upcoming releases.
diff --git a/src/ParallelKMeans.jl b/src/ParallelKMeans.jl
index 2f3be1a..eda9924 100644
--- a/src/ParallelKMeans.jl
+++ b/src/ParallelKMeans.jl
@@ -1,13 +1,16 @@
 module ParallelKMeans
 
 using StatsBase
+using MLJModelInterface
 import Base.Threads: @spawn
+import Distances
 
 include("seeding.jl")
 include("kmeans.jl")
 include("lloyd.jl")
 include("light_elkan.jl")
 include("hamerly.jl")
+include("mlj_interface.jl")
 
 export kmeans
 export Lloyd, LightElkan, Hamerly
diff --git a/src/kmeans.jl b/src/kmeans.jl
index 1a615c8..ff22d47 100644
--- a/src/kmeans.jl
+++ b/src/kmeans.jl
@@ -173,7 +173,7 @@ end
 
 
 """
-    Kmeans!(alg::AbstractKMeansAlg, containers, design_matrix, k; n_threads = nthreads(), k_init="k-means++", max_iters=300, tol=1e-6, verbose=true)
+    Kmeans!(alg::AbstractKMeansAlg, containers, design_matrix, k; n_threads = nthreads(), k_init="k-means++", max_iters=300, tol=1e-6, verbose=false)
 
 Mutable version of `kmeans` function. Definition of arguments and results can be found in `kmeans`.
diff --git a/src/mlj_interface.jl b/src/mlj_interface.jl
new file mode 100644
index 0000000..0d49fcd
--- /dev/null
+++ b/src/mlj_interface.jl
@@ -0,0 +1,126 @@
+# TODO 1: a `using MLJModelInterface` or `import MLJModelInterface` statement
+
+####
+#### MODEL DEFINITION
+####
+# TODO 2: MLJ-compatible model types and constructors
+@mlj_model mutable struct KMeans <: MLJModelInterface.Unsupervised
+    # Hyperparameters of the model
+    algo::Symbol = :Lloyd::(_ in (:Lloyd, :Hamerly, :LightElkan))
+    k_init::String = "k-means++"::(_ == "k-means++")  # allow user seeding?
+    k::Int = 3::(_ > 0)
+    tol::Float64 = 1e-6::(0 < _ < 1)
+    max_iters::Int = 300::(_ > 0)
+    copy::Bool = true
+    threads::Int = Threads.nthreads()::(_ > 0)
+    verbosity::Int = 0::(_ in (0, 1))  # Temp fix. Do we need to follow MLJ's verbosity style?
+    init = nothing
+end
+
+
+# Expose all instances of user-specified structs and package artifacts.
+const ParallelKMeans_Desc = "Parallel & lightning fast implementation of all variants of the KMeans clustering algorithm in native Julia."
+
+# available variants for reference
+const MLJDICT = Dict(:Lloyd => Lloyd(),
+                     :Hamerly => Hamerly(),
+                     :LightElkan => LightElkan())
+
+# TODO 3: implementation of fit, predict, and fitted_params of the model
+####
+#### FIT FUNCTION
+####
+"""
+    TODO 3.1: Docs
+
+    See also the [package documentation](https://pydatablog.github.io/ParallelKMeans.jl/stable).
+"""
+function MLJModelInterface.fit(m::KMeans, X)
+    # fit the specified struct as a ParaKMeans model
+
+    # Convert tabular input data into the matrix the model expects.
+    # Columns are assumed to be features, so the input data is permuted.
+    if !m.copy
+        # transpose input table without copying and pass to model
+        DMatrix = convert(Array{Float64, 2}, X)'
+    else
+        # transposes input table into a column-major matrix after making a copy of the data
+        DMatrix = MLJModelInterface.matrix(X; transpose=true)
+    end
+
+    # look up the requested algorithm from the available variants
+    algo = MLJDICT[m.algo]
+
+    # fit model and get results
+    verbose = m.verbosity != 0
+    fitresult = ParallelKMeans.kmeans(algo, DMatrix, m.k;
+                                      n_threads = m.threads, k_init = m.k_init,
+                                      max_iters = m.max_iters, tol = m.tol, init = m.init,
+                                      verbose = verbose)
+    cache = nothing
+    report = (cluster_centers = fitresult.centers, iterations = fitresult.iterations,
+              converged = fitresult.converged, totalcost = fitresult.totalcost,
+              labels = fitresult.assignments)
+
+    return (fitresult, cache, report)
+end
+
+
+"""
+    TODO 3.2: Docs
+"""
+function MLJModelInterface.fitted_params(model::KMeans, fitresult)
+    # extract what's relevant from `fitresult`
+    results, _, _ = fitresult  # unpack fitresult
+    centers = results.centers
+    converged = results.converged
+    iters = results.iterations
+    totalcost = results.totalcost
+
+    # then return as a NamedTuple
+    return (cluster_centers = centers, totalcost = totalcost,
+            iterations = iters, converged = converged)
+end
+
+
+####
+#### PREDICT FUNCTION
+####
+"""
+    TODO 3.3: Docs
+"""
+function MLJModelInterface.transform(m::KMeans, fitresult, Xnew)
+    # make predictions/assignments using the learned centroids
+    results = fitresult[1]
+    DMatrix = MLJModelInterface.matrix(Xnew, transpose=true)
+
+    # TODO 3.3.1: Warn users if fitresult is from a `non-converged` fit.
+    # use centroid matrix to assign clusters for new data
+    centroids = results.centers
+    distances = Distances.pairwise(Distances.SqEuclidean(), DMatrix, centroids; dims=2)
+    preds = argmin.(eachrow(distances))
+    return MLJModelInterface.table(reshape(preds, :, 1), prototype=Xnew)
+end
+
+
+####
+#### METADATA
+####
+
+# TODO 4: metadata for the package and for each of the model interfaces
+metadata_pkg.(KMeans,
+    name = "ParallelKMeans",
+    uuid = "42b8e9d4-006b-409a-8472-7f34b3fb58af",  # see your Project.toml
+    url = "https://github.com/PyDataBlog/ParallelKMeans.jl",  # URL to your package repo
+    julia = true,        # is it written entirely in Julia?
+    license = "MIT",     # your package license
+    is_wrapper = false,  # does it wrap around some other package?
+)
+
+
+# Metadata for the ParaKMeans model interface
+metadata_model(KMeans,
+    input = MLJModelInterface.Table(MLJModelInterface.Continuous),
+    output = MLJModelInterface.Table(MLJModelInterface.Count),
+    weights = false,
+    descr = ParallelKMeans_Desc,
+    path = "ParallelKMeans.src.mlj_interface.KMeans")
diff --git a/test/test04_elkan.jl b/test/test04_elkan.jl
index 44c1d98..bf31651 100644
--- a/test/test04_elkan.jl
+++ b/test/test04_elkan.jl
@@ -1,7 +1,7 @@
 module TestElkan
 
 using ParallelKMeans
-using ParallelKMeans: LightElkan, update_containers!
+using ParallelKMeans: update_containers!
using Test using Random diff --git a/test/test05_verbose.jl b/test/test06_verbose.jl similarity index 98% rename from test/test05_verbose.jl rename to test/test06_verbose.jl index 1992bcb..55b4730 100644 --- a/test/test05_verbose.jl +++ b/test/test06_verbose.jl @@ -13,7 +13,6 @@ using Suppressor # Capture output and compare r = @capture_out kmeans(Lloyd(), X, 3; n_threads=1, max_iters=1, verbose=true) @test r == "Iteration 1: Jclust = 46.534795844478815\n" - end end # module diff --git a/test/test07_mlj_interface.jl b/test/test07_mlj_interface.jl new file mode 100644 index 0000000..ffdfa6c --- /dev/null +++ b/test/test07_mlj_interface.jl @@ -0,0 +1,112 @@ +module TestMLJInterface + +using ParallelKMeans +using Random +using Test +using Suppressor +using MLJBase + + +@testset "Test struct construction" begin + model = KMeans() + + @test model.algo == :Lloyd + @test model.init == nothing + @test model.k == 3 + @test model.k_init == "k-means++" + @test model.max_iters == 300 + @test model.copy == true + @test model.threads == Threads.nthreads() + @test model.tol == 1.0e-6 + @test model.verbosity == 0 +end + + +@testset "Test model fitting verbosity" begin + Random.seed!(2020) + X = table([1 2; 1 4; 1 0; 10 2; 10 4; 10 0]) + model = KMeans(k=2, max_iters=1, verbosity=1) + results = @capture_out fit(model, X) + + @test results == "Iteration 1: Jclust = 28.0\n" +end + + +@testset "Test Lloyd model fitting" begin + Random.seed!(2020) + X = table([1 2; 1 4; 1 0; 10 2; 10 4; 10 0]) + model = KMeans(k=2) + results = fit(model, X) + + @test results[2] == nothing + @test results[end].converged == true + @test results[end].totalcost == 16 +end + + +@testset "Test Hamerly model fitting" begin + Random.seed!(2020) + X = table([1 2; 1 4; 1 0; 10 2; 10 4; 10 0]) + model = KMeans(algo=:Hamerly, k=2) + results = fit(model, X) + + @test results[2] == nothing + @test results[end].converged == true + @test results[end].totalcost == 16 +end + + +@testset "Test Lloyd fitted params" begin + Random.seed!(2020) + X = table([1 2; 1 4; 1 0; 10 2; 10 4; 10 0]) + model = KMeans(k=2) + results = fit(model, X) + + params = fitted_params(model, results) + @test params.converged == true + @test params.totalcost == 16 +end + + +@testset "Test Hamerly fitted params" begin + Random.seed!(2020) + X = table([1 2; 1 4; 1 0; 10 2; 10 4; 10 0]) + model = KMeans(algo=:Hamerly, k=2) + results = fit(model, X) + + params = fitted_params(model, results) + @test params.converged == true + @test params.totalcost == 16 +end + + +@testset "Test Lloyd transform" begin + Random.seed!(2020) + X = table([1 2; 1 4; 1 0; 10 2; 10 4; 10 0]) + X_test = table([10 1]) + + # Train model using training data X + model = KMeans(k=2) + results = fit(model, X) + + # Use trained model to cluster new data X_test + preds = transform(model, results, X_test) + @test preds[:x1][1] == 2 +end + + +@testset "Test Hamerly transform" begin + Random.seed!(2020) + X = table([1 2; 1 4; 1 0; 10 2; 10 4; 10 0]) + X_test = table([10 1]) + + # Train model using training data X + model = KMeans(algo=:Hamerly, k=2) + results = fit(model, X) + + # Use trained model to cluster new data X_test + preds = transform(model, results, X_test) + @test preds[:x1][1] == 2 +end + +end # module
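
For context, the tests above exercise the new `src/mlj_interface.jl` end-to-end. A minimal usage sketch mirroring that test setup (the toy matrices are illustrative; `x1` is the default column name given to the single-column output table):

```julia
using ParallelKMeans, MLJBase

# training data: 6 examples with 2 features each
X = table([1 2; 1 4; 1 0; 10 2; 10 4; 10 0])

# fit a 2-cluster model with the Hamerly variant
model = KMeans(algo=:Hamerly, k=2)
results = fit(model, X)

# assign a cluster to unseen data using the learned centroids
X_test = table([9 1])
preds = transform(model, results, X_test)
preds[:x1][1]  # cluster index assigned to the new point
```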