Permalink
Browse files

first implementation

  • Loading branch information...
1 parent 9367203 commit 4f1ab113d4c76fcc9a47a8abc74f0955323ed8d0 @Philmod committed Oct 22, 2012
Showing with 318 additions and 3 deletions.
  1. +2 −0 .gitignore
  2. +57 −3 README.md
  3. +1 −0 index.js
  4. +211 −0 lib/kmeans.js
  5. +14 −0 package.json
  6. +33 −0 test/index.js
View
@@ -0,0 +1,2 @@
+node_modules
+temp.js
View
@@ -1,4 +1,58 @@
-node-kmeans
-===========
+# node-kmeans
-Node.js implementation of the clustering algorithm k-means.
+ Node.js asynchronous implementation of the clustering algorithm k-means
+
+## Installation
+
+ $ npm install node-kmeans
+
+## Example
+
+```js
+// Data source: LinkedIn
+var data = [
+ {'company': 'Microsoft' , 'size': 91259, 'revenue': 60420},
+ {'company': 'IBM' , 'size': 400000, 'revenue': 98787},
+ {'company': 'Skype' , 'size': 700, 'revenue': 716},
+ {'company': 'SAP' , 'size': 48000, 'revenue': 11567},
+ {'company': 'Yahoo!' , 'size': 14000 , 'revenue': 6426 },
+ {'company': 'eBay' , 'size': 15000, 'revenue': 8700},
+];
+
+// Create the data 2D-array (vectors) describing the data
+var vectors = new Array ;
+for (var i = 0 ; i < data.length ; i++) {
+ vectors[i] = [ data[i]['size'] , data[i]['revenue']] ;
+}
+
+var kmeans = require('kmeans');
+new kmeans.clusterize(vectors, {k: 4}, function(err,res) {
+ if (err) console.error(err);
+ else console.log('%o',res);
+});
+```
+
+## License
+
+(The MIT License)
+
+Copyright (c) 2012 Philmod &lt;philippe.modard@gmail.com&gt;
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+'Software'), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
View
@@ -0,0 +1 @@
+module.exports = require('./lib/kmeans');
View
@@ -0,0 +1,211 @@
+/*!
+ * node-kmeans
+ * Copyright(c) 2012 Philmod <philippe.modard@gmail.com>
+ * MIT Licensed
+ */
+
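+/*
+Asynchronous implementation of the k-means clustering algorithm.
+
+The clusterize function takes as input a list of N input vectors and an options object holding the number k of clusters, and it passes to its callback an Array of k objects (one per cluster), each with three attributes:
+ - centroid: the vector located at the centre of the cluster
+ - cluster: an Array containing the input vectors assigned to the cluster
+ - clusterInd: an Array containing, for each of those vectors, its index in the input list
+
+The callback receives an error if:
+ - k is missing or smaller than 1
+ - the input is not an Array
+ - N < k
+
+Note: the number of different input vectors is not checked against k; with fewer than k different vectors, some clusters may share a centroid or end up empty.
+*/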
+
+
+ /**
+ * Module dependencies.
+ */
+
+var _ = require('underscore');
+
+
+ /**
+ * Library version.
+ */
+
+exports.version = '0.0.1';
+
+
+/**
+ * Expose `clusterize`.
+ */
+
+exports.clusterize = clusterize;
+
+
+/**
+ * Initialize a new `clusterize` with the given `vector` and number `k` of clusters.
+ *
+ * @param {Array} vector
+ * @param {Number} k
+ * @api public
+ */
+
+function clusterize(vector, options, callback) {
+ if (typeof callback != 'function') throw new Error("Provide a callback function");
+ if (!options || !options.k || options.k<1) return callback(new Error("Provide a correct number k of clusters"));
+ if (!_.isArray(vector)) return callback(new Error("Provide an array of data"));
+ this.options = options;
+ this.v = vector;
+ this.k = this.options.k;
+ if (this.v.length < this.k) return callback(new Error('The number of points must be greater than the number k of clusters'));
+
+ this.initialize(); // initialize the group arrays
+
+ var self = this
+ , moved = -1; // how many centroids have moved on the last iteration
+
+ (function iterate() {
+ if (moved === 0) return callback(null,self.output()); // converged if 0 centroid has moved
+ moved = 0;
+ for (var i=0; i<self.groups.length; ++i) {
+ self.groups[i].defineCentroid(self); // define the new centroids
+ self.groups[i].distanceObjects(self); // calculate the distances from centroids to every items
+ }
+ self.clustering(); // clustering by choosing the centroid the closest of each item
+ for (var i=0; i<self.groups.length; ++i)
+ if (self.groups[i].centroidMoved) moved++; // check how many centroids have moved in this iteration
+ process.nextTick(iterate);
+ })();
+};
+
+
+/**
+ * Initialize the groups arrays
+ */
+clusterize.prototype.initialize = function() {
+ this.groups = new Array();
+ for (var i=0; i<this.k; ++i) {
+ this.groups[i] = new Group(this);
+ }
+ return this;
+};
+
+
+/**
+ * assign each object based on the minimum distance
+ */
+
+clusterize.prototype.clustering = function() {
+ for (var j=0; j<this.groups.length; ++j) this.groups[j].initCluster();
+ for (var i=0; i<this.v.length; ++i) {
+ var min = this.groups[0].distances[i];
+ var indexGroup = 0;
+ for (var j=1; j<this.groups.length; ++j) {
+ if (this.groups[j].distances[i] < min) {
+ min = this.groups[j].distances[i]
+ indexGroup = j;
+ }
+ }
+ this.groups[indexGroup].cluster.push(this.v[i]);
+ this.groups[indexGroup].clusterInd.push(i);
+ }
+ return this;
+};
+
+
+/**
+ * output structure
+ */
+
+clusterize.prototype.output = function() {
+ var out = new Array();
+ for (var j=0; j<this.groups.length; ++j) {
+ out[j] = _.pick(this.groups[j],'centroid','cluster','clusterInd');
+ }
+ return out;
+};
+
+
+/**
+ * Compute the Euclidean distance
+ *
+ * @param {Array} a
+ * @param {Array} b
+ * @api private
+ */
+
+function distance(a,b){
+ if (a.length != b.length) return (new Error('The vectors must have the same length'));
+ var d = 0.0;
+ for (var i=0; i<a.length; ++i) d += Math.pow((a[i]-b[i]),2);
+ return Math.sqrt(d);
+};
+
+
+/**
+ * Group
+ */
+
+function Group() {
+ this.centroidMoved = true;
+}
+
+
+/**
+ * Group
+ */
+
+Group.prototype.initCluster = function() {
+ this.cluster = new Array(); // dimensions
+ this.clusterInd = new Array(); // index
+}
+
+
+/**
+ * Define Centroid
+ * - if they exist, calculate the new position
+ * - otherwise, randomly choose one existing item
+ */
+
+Group.prototype.defineCentroid = function(self){
+ this.centroidOld = (this.centroid) ? this.centroid : new Array();
+ if (this.centroid && this.cluster.length>0) {
+ this.calculateCentroid();
+ }
+ else { // random selection
+ this.centroidIndex = Math.floor(Math.random() * self.v.length);
+ this.centroid = new Array();
+ for (var i=0; i<self.v[this.centroidIndex].length; ++i)
+ this.centroid[i] = self.v[this.centroidIndex][i];
+ }
+ this.centroidMoved = (_.isEqual(this.centroid,this.centroidOld)) ? false : true;
+ if (this.centroid.length == 0) console.log('1. was passiert hier??');
+ return this;
+};
+
+
+/**
+ * calculate Centroid
+ */
+
+Group.prototype.calculateCentroid = function() {
+ this.centroid = new Array();
+ for (var i=0; i<this.cluster.length; ++i) { // loop through the cluster elements
+ for (var j=0; j<this.cluster[i].length; ++j) // loop through the dimensions
+ this.centroid[j] = (this.centroid[j]) ? this.centroid[j]+this.cluster[i][j] : this.cluster[i][j];
+ }
+ for (var i=0; i<this.centroid.length; ++i)
+ this.centroid[i] = this.centroid[i]/this.cluster.length; // average
+ return this
+};
+
+
+/**
+ * calculate the distance between cluster centroid to each object
+ */
+
+Group.prototype.distanceObjects = function(self) {
+ if (!this.distances) this.distances = new Array();
+ for (var i=0; i<self.v.length; ++i) {
+ this.distances[i] = distance(this.centroid, self.v[i]);
+ }
+ return this;
+};
+
+
View
@@ -0,0 +1,14 @@
+{
+ "name": "node-kmeans"
+ , "version": "0.0.1"
+ , "description": "Node.js implementation of the clustering algorithm k-means"
+ , "keywords": ["k-means", "clustering"]
+ , "author": "Philmod <philippe.modard@gmail.com>"
+ , "dependencies": {
+ "underscore": "1.4.2"
+ }
+ , "devDependencies": {
+ "should": "*"
+ }
+ , "main": "index"
+}
View
@@ -0,0 +1,33 @@
+/**
+ * Module dependencies.
+ */
+
+var kmeans = require('../')
+ , should = require('should')
+ ;
+
+/**
+ * Data source: LinkedIn.
+ */
+var data = [
+ {'company': 'Microsoft' , 'size': 91259, 'revenue': 60420},
+ {'company': 'IBM' , 'size': 400000, 'revenue': 98787},
+ {'company': 'Skype' , 'size': 700, 'revenue': 716},
+ {'company': 'SAP' , 'size': 48000, 'revenue': 11567},
+ {'company': 'Yahoo!' , 'size': 14000 , 'revenue': 6426 },
+ {'company': 'eBay' , 'size': 15000, 'revenue': 8700},
+];
+
+// Create the labels and the vectors describing the data
+var labels = new Array();
+var vectors = new Array();
+for (var i = 0 ; i < data.length ; i++) {
+ labels[i] = data[i]['company'] ;
+ vectors[i] = [ data[i]['size'] , data[i]['revenue']] ;
+};
+
+
+new kmeans.clusterize(vectors, {k: 4}, function(err,res) {
+ if (err) console.error(err);
+ else console.log('%o',res);
+});

0 comments on commit 4f1ab11

Please sign in to comment.