initial import

ThinkBigAnalytics · Apr 5, 2016 · a9fc30f · a9fc30f
1 parent ec64618
commit a9fc30f
Show file tree

Hide file tree

Showing 12 changed files with 1,116 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,11 @@
+# Compiled python modules.
+*.pyc
+
+# Setuptools distribution folder.
+/dist/
+
+# Python egg metadata, regenerated from source files by setuptools.
+/*.egg-info
+
+.DS_Store
+.ipynb_checkpoints/
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,19 @@
+Copyright (C) <year> by <copyright holders> and individual contributors.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,4 @@
+include README.rst
+include LICENSE
+include *.txt
+include docs/*.ipynb
diff --git a/README.md b/README.md
diff --git a/README.rst b/README.rst
@@ -0,0 +1,23 @@
+# pyspark-distributed-kmodes
+
+## Installing
+
+```
+$ pip install .
+```
+
+TODO: update README
+
+## Distributed K-modes for pySpark
+
+The example IPython notebook in docs/ (PySpark-Distributed-KModes-example.ipynb) shows how to run the K-modes calculation.
+
+This calculation depends on pyspark_kmodes.py being in the Python import path.
+
+I have tested this with Kmodes.py in the working directory. It needs to be in a location from which it can be exported to the worker nodes.
+
+The two PDF files are the articles on which this approach is based - both the original and the distributed versions. I've left them here for reference purposes; they are not needed for functionality.
+
+TODO: add links to papers instead of PDFs?
+
+
diff --git a/docs/PySpark-Distributed-KModes-example.ipynb b/docs/PySpark-Distributed-KModes-example.ipynb
@@ -0,0 +1,174 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Distributed KModes demonstration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from pyspark_kmodes import *"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "## Arguments / Variables\n",
+    "n_modes = 2\n",
+    "set_partitions = 32\n",
+    "max_iter = 10\n",
+    "\n",
+    "\n",
+    "# Create the data set: 50000 rows drawn from a/b/c plus 50000 rows drawn from e/f/g\n",
+    "import numpy as np\n",
+    "data = np.random.choice([\"a\", \"b\", \"c\"], (50000, 10))\n",
+    "data2 = np.random.choice([\"e\", \"f\", \"g\"], (50000, 10))\n",
+    "data = list(data) + list(data2)\n",
+    "\n",
+    "from random import shuffle\n",
+    "shuffle(data)\n",
+    "\n",
+    "# Create the rdd with exactly set_partitions partitions. coalesce() without\n",
+    "# a shuffle can only lower the partition count, so request it up front.\n",
+    "rdd = sc.parallelize(data, set_partitions)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "method = EnsembleKModes(n_modes, max_iter)  # use the settings defined above instead of repeating 2 and 10"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Fit the model using PySpark:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Iteration  0\n",
+      "Iteration  1\n",
+      "Iteration  2\n"
+     ]
+    },
+    {
+     "ename": "ImportError",
+     "evalue": "No module named 'pyspark_kmodes.KModes'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-4-e6d11bcc954e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrdd\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;32m/usr/local/lib/python3.5/site-packages/pyspark_kmodes/pyspark_kmodes.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, rdd)\u001b[0m\n\u001b[1;32m    432\u001b[0m             \u001b[0;31m# Calculate the modes locally for the set of all modes\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    433\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 434\u001b[0;31m             \u001b[0mlocal_clusters\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrun_local_kmodes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclusters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mn_clusters\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    435\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mverbosity\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    436\u001b[0m                 \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Avg cost/partition:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlocal_clusters\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclusters\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/usr/local/lib/python3.5/site-packages/pyspark_kmodes/pyspark_kmodes.py\u001b[0m in \u001b[0;36mrun_local_kmodes\u001b[0;34m(clusters, n_clusters, init, n_init, verbose)\u001b[0m\n\u001b[1;32m    286\u001b[0m             \u001b[0;34m-\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0mverbose\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0moptional\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0mverbosity\u001b[0m \u001b[0mof\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mdefault\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    287\u001b[0m     \"\"\"\n\u001b[0;32m--> 288\u001b[0;31m     \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mKModes\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mKmodes\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    289\u001b[0m     \u001b[0;31m# Now do k-modes on the main machine\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    290\u001b[0m     \u001b[0mkm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mKModes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn_clusters\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mn_clusters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minit\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_init\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mn_init\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverbose\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mImportError\u001b[0m: No module named 'pyspark_kmodes.KModes'"
+     ]
+    }
+   ],
+   "source": [
+    "model = method.fit(rdd)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Show the fitted clusters and the mean cost reported by the method\n",
+    "print(model.clusters)\n",
+    "print(method.mean_cost)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Pair each indexed data point with the prediction stored on the method\n",
+    "predictions = method.predictions\n",
+    "datapoints = method.indexed_rdd\n",
+    "combined = datapoints.zip(predictions)\n",
+    "print(combined.take(10))\n",
+    "model.predict(rdd).take(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "model.predict(sc.parallelize(['e', 'e', 'f', 'e', 'e', 'f', 'g', 'e', 'f', 'e'])).collect()  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "model.predict(rdd).take(5)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "PySpark (Spark 1.6.1)",
+   "language": "python",
+   "name": "pyspark"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}