Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 3725dfa
Showing
8 changed files
with
1,666 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
============================================== | ||
gensim -- Python Framework for Topic Modelling | ||
============================================== | ||
|
||
|
||
|
||
Gensim is a Python library for *Vector Space Modelling* with very large corpora. | ||
Target audience is the *Natural Language Processing* (NLP) community. | ||
|
||
|
||
Features | ||
--------- | ||
|
||
* All algorithms are **memory-independent** w.r.t. the corpus size (can process input larger than RAM), | ||
* **Intuitive interfaces** | ||
|
||
* easy to plug in your own input corpus/datastream (trivial streaming API) | ||
* easy to extend with other Vector Space algorithms (trivial transformation API) | ||
|
||
* Efficient implementations of popular algorithms, such as online **Latent Semantic Analysis**, | ||
**Latent Dirichlet Allocation** or **Random Projections** | ||
* **Distributed computing**: can run *Latent Semantic Analysis* and *Latent Dirichlet Allocation* on a cluster of computers. | ||
* Extensive `HTML documentation and tutorials <http://radimrehurek.com/gensim/>`_. | ||
|
||
|
||
If this feature list left you scratching your head, you can first read more about the `Vector | ||
Space Model <http://en.wikipedia.org/wiki/Vector_space_model>`_ and `unsupervised | ||
document analysis <http://en.wikipedia.org/wiki/Latent_semantic_indexing>`_ on Wikipedia. | ||
|
||
Installation | ||
------------ | ||
|
||
This software depends on `NumPy and Scipy <http://www.scipy.org/Download>`_, two Python packages for scientific computing. | ||
You must have them installed prior to installing `gensim`. | ||
|
||
The simple way to install `gensim` is:: | ||
|
||
sudo easy_install gensim | ||
|
||
Or, if you have instead downloaded and unzipped the `source tar.gz <http://pypi.python.org/pypi/gensim>`_ package, | ||
you'll need to run:: | ||
|
||
python setup.py test | ||
sudo python setup.py install | ||
|
||
|
||
For alternative modes of installation (without root privileges, development | ||
installation, optional install features), see the `documentation <http://radimrehurek.com/gensim/install.html>`_. | ||
|
||
This version has been tested under Python 2.5 and 2.6, but should run on any 2.5 <= Python < 3.0. | ||
|
||
Documentation | ||
------------- | ||
|
||
Manual for the gensim package is available in `HTML <http://radimrehurek.com/gensim/>`_. It | ||
contains a walk-through of all its features and a complete reference section. | ||
It is also included in the source distribution package. | ||
|
||
---------------- | ||
|
||
Gensim is open source software, and has been released under the | ||
`GNU LGPL license <http://www.gnu.org/licenses/lgpl.html>`_. | ||
Copyright (c) 2011 Radim Rehurek |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,284 @@ | ||
#!python
"""Bootstrap setuptools installation

If you want to use setuptools in your package's setup.py, just include this
file in the same directory with it, and add this to the top of your setup.py::

    from ez_setup import use_setuptools
    use_setuptools()

If you want to require a specific version of setuptools, set a download
mirror, or use an alternate download directory, you can do so by supplying
the appropriate options to ``use_setuptools()``.

This file can also be run as a script to install or upgrade setuptools.
"""
import sys

# Version of setuptools that is fetched when the caller does not ask for one.
DEFAULT_VERSION = "0.6c11"

# Base download URL; the path component is the running interpreter's
# "major.minor" tag.  It is built from sys.version_info instead of slicing
# sys.version[:3], because the slice truncates two-digit minor versions
# (e.g. "3.10" would become "3.1").
DEFAULT_URL = "http://pypi.python.org/packages/%d.%d/s/setuptools/" % sys.version_info[:2]
|
||
# Registry of known MD5 hex digests, keyed by setuptools egg file name.
# _validate_md5() compares downloaded data against these values, and
# update_md5() rewrites this literal in place by searching the source text
# for "md5_data = {" followed by a newline, so the opening line and the
# closing brace must keep this layout (and no "}" may appear inside).
md5_data = {
    'setuptools-0.6b1-py2.3.egg': '8822caf901250d848b996b7f25c6e6ca',
    'setuptools-0.6b1-py2.4.egg': 'b79a8a403e4502fbb85ee3f1941735cb',
    'setuptools-0.6b2-py2.3.egg': '5657759d8a6d8fc44070a9d07272d99b',
    'setuptools-0.6b2-py2.4.egg': '4996a8d169d2be661fa32a6e52e4f82a',
    'setuptools-0.6b3-py2.3.egg': 'bb31c0fc7399a63579975cad9f5a0618',
    'setuptools-0.6b3-py2.4.egg': '38a8c6b3d6ecd22247f179f7da669fac',
    'setuptools-0.6b4-py2.3.egg': '62045a24ed4e1ebc77fe039aa4e6f7e5',
    'setuptools-0.6b4-py2.4.egg': '4cb2a185d228dacffb2d17f103b3b1c4',
    'setuptools-0.6c1-py2.3.egg': 'b3f2b5539d65cb7f74ad79127f1a908c',
    'setuptools-0.6c1-py2.4.egg': 'b45adeda0667d2d2ffe14009364f2a4b',
    'setuptools-0.6c10-py2.3.egg': 'ce1e2ab5d3a0256456d9fc13800a7090',
    'setuptools-0.6c10-py2.4.egg': '57d6d9d6e9b80772c59a53a8433a5dd4',
    'setuptools-0.6c10-py2.5.egg': 'de46ac8b1c97c895572e5e8596aeb8c7',
    'setuptools-0.6c10-py2.6.egg': '58ea40aef06da02ce641495523a0b7f5',
    'setuptools-0.6c11-py2.3.egg': '2baeac6e13d414a9d28e7ba5b5a596de',
    'setuptools-0.6c11-py2.4.egg': 'bd639f9b0eac4c42497034dec2ec0c2b',
    'setuptools-0.6c11-py2.5.egg': '64c94f3bf7a72a13ec83e0b24f2749b2',
    'setuptools-0.6c11-py2.6.egg': 'bfa92100bd772d5a213eedd356d64086',
    'setuptools-0.6c2-py2.3.egg': 'f0064bf6aa2b7d0f3ba0b43f20817c27',
    'setuptools-0.6c2-py2.4.egg': '616192eec35f47e8ea16cd6a122b7277',
    'setuptools-0.6c3-py2.3.egg': 'f181fa125dfe85a259c9cd6f1d7b78fa',
    'setuptools-0.6c3-py2.4.egg': 'e0ed74682c998bfb73bf803a50e7b71e',
    'setuptools-0.6c3-py2.5.egg': 'abef16fdd61955514841c7c6bd98965e',
    'setuptools-0.6c4-py2.3.egg': 'b0b9131acab32022bfac7f44c5d7971f',
    'setuptools-0.6c4-py2.4.egg': '2a1f9656d4fbf3c97bf946c0a124e6e2',
    'setuptools-0.6c4-py2.5.egg': '8f5a052e32cdb9c72bcf4b5526f28afc',
    'setuptools-0.6c5-py2.3.egg': 'ee9fd80965da04f2f3e6b3576e9d8167',
    'setuptools-0.6c5-py2.4.egg': 'afe2adf1c01701ee841761f5bcd8aa64',
    'setuptools-0.6c5-py2.5.egg': 'a8d3f61494ccaa8714dfed37bccd3d5d',
    'setuptools-0.6c6-py2.3.egg': '35686b78116a668847237b69d549ec20',
    'setuptools-0.6c6-py2.4.egg': '3c56af57be3225019260a644430065ab',
    'setuptools-0.6c6-py2.5.egg': 'b2f8a7520709a5b34f80946de5f02f53',
    'setuptools-0.6c7-py2.3.egg': '209fdf9adc3a615e5115b725658e13e2',
    'setuptools-0.6c7-py2.4.egg': '5a8f954807d46a0fb67cf1f26c55a82e',
    'setuptools-0.6c7-py2.5.egg': '45d2ad28f9750e7434111fde831e8372',
    'setuptools-0.6c8-py2.3.egg': '50759d29b349db8cfd807ba8303f1902',
    'setuptools-0.6c8-py2.4.egg': 'cba38d74f7d483c06e9daa6070cce6de',
    'setuptools-0.6c8-py2.5.egg': '1721747ee329dc150590a58b3e1ac95b',
    'setuptools-0.6c9-py2.3.egg': 'a83c4020414807b496e4cfbe08507c03',
    'setuptools-0.6c9-py2.4.egg': '260a2be2e5388d66bdaee06abec6342a',
    'setuptools-0.6c9-py2.5.egg': 'fe67c3e5a17b12c0e7c541b7ea43a8e6',
    'setuptools-0.6c9-py2.6.egg': 'ca37b1ff16fa2ede6e19383e7b59245a',
}
|
||
import sys, os | ||
try: from hashlib import md5 | ||
except ImportError: from md5 import md5 | ||
|
||
def _validate_md5(egg_name, data):
    """Check downloaded egg `data` against the built-in md5 registry.

    Eggs whose name is not listed in ``md5_data`` are passed through
    unchecked.  For a listed egg, a digest mismatch prints a message to
    ``sys.stderr`` and exits the process with status 2.

    Returns `data` unchanged when it is accepted.
    """
    # Unknown egg: nothing to compare against, accept as-is.
    if egg_name not in md5_data:
        return data

    if md5(data).hexdigest() == md5_data[egg_name]:
        return data

    sys.stderr.write(
        "md5 validation of %s failed! (Possible download problem?)" % egg_name
        + "\n"
    )
    sys.exit(2)
|
||
def use_setuptools(
    version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir,
    download_delay=15
):
    """Make a sufficiently recent setuptools importable, downloading if needed.

    `version` is the minimum setuptools version required; it must be
    available as an egg under the `download_base` URL (which should end with
    a '/').  `to_dir` is the directory the egg is downloaded into when
    setuptools is not already available.  `download_delay` is the number of
    seconds to pause before a download starts, should one be required.

    If an older setuptools has already been imported by the running process,
    a message is written to ``sys.stderr`` and SystemExit (status 2) is
    raised, because the loaded copy cannot be replaced in place.
    """
    # Sample this before the import attempt below, which would itself put
    # 'pkg_resources' into sys.modules.
    already_loaded = 'pkg_resources' in sys.modules or 'setuptools' in sys.modules

    def fetch_and_activate():
        # Download the egg, put it first on sys.path and tell setuptools
        # which egg it was bootstrapped from.
        egg_path = download_setuptools(version, download_base, to_dir, download_delay)
        sys.path.insert(0, egg_path)
        import setuptools
        setuptools.bootstrap_install_from = egg_path

    try:
        import pkg_resources
    except ImportError:
        return fetch_and_activate()

    try:
        pkg_resources.require("setuptools>=" + version)
    except pkg_resources.VersionConflict:
        conflict = sys.exc_info()[1]
        if not already_loaded:
            # Nothing else holds a reference yet, so it is safe to drop the
            # stale module and load the downloaded one instead.
            del pkg_resources, sys.modules['pkg_resources']
            return fetch_and_activate()
        message = (
            "The required version of setuptools (>=%s) is not available, and\n"
            "can't be installed while this script is running. Please install\n"
            " a more recent version first, using 'easy_install -U setuptools'."
            "\n\n(Currently using %r)"
        ) % (version, conflict.args[0])
        sys.stderr.write(message + "\n")
        sys.exit(2)
    except pkg_resources.DistributionNotFound:
        return fetch_and_activate()
|
||
def download_setuptools(
    version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir,
    delay=15
):
    """Download setuptools from a specified location and return its filename

    `version` should be a valid setuptools version number that is available
    as an egg for download under the `download_base` URL (which should end
    with a '/').  `to_dir` is the directory where the egg will be downloaded.
    `delay` is the number of seconds to pause before an actual download
    attempt; a false value skips both the notice and the pause.

    If the egg file already exists in `to_dir`, nothing is downloaded.
    Returns the real (symlink-resolved) path of the egg file.  The process
    exits with status 2 if the downloaded data fails md5 validation.
    """
    import urllib2

    # Build the "major.minor" tag from sys.version_info: slicing
    # sys.version[:3] would truncate a two-digit minor version.
    py_tag = "%d.%d" % sys.version_info[:2]
    egg_name = "setuptools-%s-py%s.egg" % (version, py_tag)
    url = download_base + egg_name
    saveto = os.path.join(to_dir, egg_name)
    src = dst = None
    if not os.path.exists(saveto):  # Avoid repeated downloads
        try:
            from distutils import log
            if delay:
                log.warn("""
---------------------------------------------------------------------------
This script requires setuptools version %s to run (even to display
help). I will attempt to download it for you (from
%s), but
you may need to enable firewall access for this script first.
I will start the download in %d seconds.
(Note: if this machine does not have network access, please obtain the file
%s
and place it in this directory before rerunning this script.)
---------------------------------------------------------------------------""",
                    version, download_base, delay, url
                )
                from time import sleep
                sleep(delay)
            log.warn("Downloading %s", url)
            src = urllib2.urlopen(url)
            # Read/write all in one block, so we don't create a corrupt file
            # if the download is interrupted.
            data = _validate_md5(egg_name, src.read())
            dst = open(saveto, "wb")
            dst.write(data)
        finally:
            # Close whichever handles were actually opened, even on failure.
            if src:
                src.close()
            if dst:
                dst.close()
    return os.path.realpath(saveto)
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
def main(argv, version=DEFAULT_VERSION):
    """Install or upgrade setuptools and EasyInstall

    `argv` is the list of command-line arguments forwarded to easy_install.
    `version` is the minimum setuptools version to install or upgrade to.

    If setuptools cannot be imported, the egg for `version` is downloaded,
    installed through its own easy_install, and the temporary egg file is
    removed afterwards.  If an installed setuptools is older than `version`,
    the egg for `version` is downloaded and installed, then the process
    exits with status 0.  The process exits with status 2 when the obsolete
    '0.0.1' setuptools is found.
    """
    try:
        import setuptools
    except ImportError:
        egg = None
        try:
            egg = download_setuptools(version, delay=0)
            sys.path.insert(0, egg)
            # Aliased so the import does not shadow this function's own name.
            from setuptools.command.easy_install import main as easy_install_main
            return easy_install_main(list(argv) + [egg])  # we're done here
        finally:
            # The downloaded egg is only a bootstrap vehicle; clean it up.
            if egg and os.path.exists(egg):
                os.unlink(egg)
    else:
        if setuptools.__version__ == '0.0.1':
            sys.stderr.write(
                "You have an obsolete version of setuptools installed. Please\n"
                "remove it from your system entirely before rerunning this script."
                "\n"
            )
            sys.exit(2)

    req = "setuptools>=" + version
    import pkg_resources
    try:
        pkg_resources.require(req)
    except pkg_resources.VersionConflict:
        try:
            from setuptools.command.easy_install import main as easy_install_main
        except ImportError:
            from easy_install import main as easy_install_main
        # Download the version the caller asked for (previously this path
        # ignored `version` and always fetched the default one).
        easy_install_main(list(argv) + [download_setuptools(version, delay=0)])
        sys.exit(0)  # try to force an exit
    else:
        if argv:
            from setuptools.command.easy_install import main as easy_install_main
            easy_install_main(argv)
        else:
            sys.stdout.write(
                "Setuptools version %s or greater has been installed.\n" % (version,)
            )
            sys.stdout.write(
                '(Run "ez_setup.py -U setuptools" to reinstall or upgrade.)\n'
            )
|
||
def update_md5(filenames):
    """Update our built-in md5 registry

    For every path in `filenames`, the file's md5 hex digest is stored in
    ``md5_data`` under the file's base name.  The ``md5_data = {...}``
    literal in this module's own source file is then rewritten with the
    sorted, updated entries.

    Exits the process with status 2 if the literal cannot be located in the
    source text.
    """
    import inspect
    import re

    for name in filenames:
        base = os.path.basename(name)
        f = open(name, 'rb')
        try:
            md5_data[base] = md5(f.read()).hexdigest()
        finally:
            # Always release the handle, even if reading or hashing fails.
            f.close()

    data = [" %r: %r,\n" % it for it in md5_data.items()]
    data.sort()
    repl = "".join(data)

    srcfile = inspect.getsourcefile(sys.modules[__name__])
    f = open(srcfile, 'rb')
    try:
        src = f.read()
    finally:
        f.close()

    # Group 1 is the body of the dict literal, i.e. everything between the
    # opening "md5_data = {" line and the first closing brace.
    match = re.search("\nmd5_data = {\n([^}]+)}", src)
    if not match:
        sys.stderr.write("Internal error!\n")
        sys.exit(2)

    src = src[:match.start(1)] + repl + src[match.end(1):]
    f = open(srcfile, 'w')
    try:
        f.write(src)
    finally:
        f.close()
|
||
|
||
# Script entry point: "--md5update FILE..." refreshes the md5 registry,
# anything else is handed to main() as easy_install arguments.
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    if len(cli_args) > 1 and cli_args[0] == '--md5update':
        update_md5(cli_args[1:])
    else:
        main(cli_args)
|
||
|
||
|
||
|
||
|
||
|
Oops, something went wrong.