Skip to content

Commit

Permalink
Refactor all python code by PEP8. Partially fix #1521 (#1550)
Browse files Browse the repository at this point in the history
* gensim dir PEP8 fixes

* corpora dir PEP8 fixes

* example dir PEP8 fixes

* model/wrapper dir PEP8 fixes

* models dir PEP8 fixes

* parsing dir PEP8 fixes

* scripts dir PEP8 fixes

* similarities dir PEP8 fixes

* summarization and topic_coherence dir PEP8 fixes

* test dir PEP8 fixes

* PEP8 E722 error fixes

* PEP8 fixes

* list slice whitespace PEP8 fixes

* disassemble import *

* Fix symlink

* fix symlink

* fix make_wiki_lemma file

* Replace relative import to absolute

* fix typo

* fix E203 error
  • Loading branch information
zsef123 authored and menshikh-iv committed Sep 5, 2017
1 parent 13578d4 commit 6d6f5dc
Show file tree
Hide file tree
Showing 107 changed files with 799 additions and 799 deletions.
31 changes: 27 additions & 4 deletions ez_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,19 +32,25 @@
DEFAULT_VERSION = "1.3.2"
DEFAULT_URL = "https://pypi.python.org/packages/source/s/setuptools/"


def _python_cmd(*args):
args = (sys.executable,) + args
return subprocess.call(args) == 0


def _check_call_py24(cmd, *args, **kwargs):
res = subprocess.call(cmd, *args, **kwargs)

class CalledProcessError(Exception):
pass
if not res == 0:
msg = "Command '%s' return non-zero exit status %d" % (cmd, res)
raise CalledProcessError(msg)


vars(subprocess).setdefault('check_call', _check_call_py24)


def _install(tarball, install_args=()):
# extracting the tarball
tmpdir = tempfile.mkdtemp()
Expand Down Expand Up @@ -151,6 +157,7 @@ def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL,
return _do_download(version, download_base, to_dir,
download_delay)


def _clean_check(cmd, target):
"""
Run the command to download target. If the command fails, clean up before
Expand All @@ -163,6 +170,7 @@ def _clean_check(cmd, target):
os.unlink(target)
raise


def download_file_powershell(url, target):
"""
Download the file at url to target using Powershell (which will validate
Expand All @@ -176,6 +184,7 @@ def download_file_powershell(url, target):
]
_clean_check(cmd, target)


def has_powershell():
if platform.system() != 'Windows':
return False
Expand All @@ -184,50 +193,58 @@ def has_powershell():
try:
try:
subprocess.check_call(cmd, stdout=devnull, stderr=devnull)
except:
except Exception:
return False
finally:
devnull.close()
return True


download_file_powershell.viable = has_powershell


def download_file_curl(url, target):
cmd = ['curl', url, '--silent', '--output', target]
_clean_check(cmd, target)


def has_curl():
cmd = ['curl', '--version']
devnull = open(os.path.devnull, 'wb')
try:
try:
subprocess.check_call(cmd, stdout=devnull, stderr=devnull)
except:
except Exception:
return False
finally:
devnull.close()
return True


download_file_curl.viable = has_curl


def download_file_wget(url, target):
cmd = ['wget', url, '--quiet', '--output-document', target]
_clean_check(cmd, target)


def has_wget():
cmd = ['wget', '--version']
devnull = open(os.path.devnull, 'wb')
try:
try:
subprocess.check_call(cmd, stdout=devnull, stderr=devnull)
except:
except Exception:
return False
finally:
devnull.close()
return True


download_file_wget.viable = has_wget


def download_file_insecure(url, target):
"""
Use Python to download the file, even though it cannot authenticate the
Expand All @@ -251,8 +268,10 @@ def download_file_insecure(url, target):
if dst:
dst.close()


download_file_insecure.viable = lambda: True


def get_best_downloader():
downloaders = [
download_file_powershell,
Expand All @@ -265,6 +284,7 @@ def get_best_downloader():
if dl.viable():
return dl


def download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL,
to_dir=os.curdir, delay=15,
downloader_factory=get_best_downloader):
Expand Down Expand Up @@ -317,7 +337,7 @@ def _extractall(self, path=".", members=None):
# Reverse sort directories.
if sys.version_info < (2, 4):
def sorter(dir1, dir2):
return cmp(dir1.name, dir2.name)
return cmp(dir1.name, dir2.name) # noqa:F821
directories.sort(sorter)
directories.reverse()
else:
Expand Down Expand Up @@ -350,6 +370,7 @@ def _build_install_args(options):
install_args.append('--user')
return install_args


def _parse_args():
"""
Parse the command line for options
Expand All @@ -371,12 +392,14 @@ def _parse_args():
# positional arguments are ignored
return options


def main(version=DEFAULT_VERSION):
"""Install or upgrade setuptools and EasyInstall"""
options = _parse_args()
tarball = download_setuptools(download_base=options.download_base,
downloader_factory=options.downloader_factory)
return _install(tarball, _build_install_args(options))


if __name__ == '__main__':
sys.exit(main())
7 changes: 5 additions & 2 deletions gensim/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,19 @@
similarities within a corpus of documents.
"""

from gensim import parsing, matutils, interfaces, corpora, models, similarities, summarization
from gensim import parsing, matutils, interfaces, corpora, models, similarities, summarization # noqa:F401
import logging

__version__ = '2.3.0'


class NullHandler(logging.Handler):
"""For python versions <= 2.6; same as `logging.NullHandler` in 2.7."""

def emit(self, record):
pass


logger = logging.getLogger('gensim')
if len(logger.handlers) == 0: # To ensure reload() doesn't add another one
if len(logger.handlers) == 0: # To ensure reload() doesn't add another one
logger.addHandler(NullHandler())
22 changes: 11 additions & 11 deletions gensim/corpora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
"""

# bring corpus classes directly into package namespace, to save some typing
from .indexedcorpus import IndexedCorpus # must appear before the other classes
from .indexedcorpus import IndexedCorpus # noqa:F401 must appear before the other classes

from .mmcorpus import MmCorpus
from .bleicorpus import BleiCorpus
from .svmlightcorpus import SvmLightCorpus
from .lowcorpus import LowCorpus
from .dictionary import Dictionary
from .hashdictionary import HashDictionary
from .wikicorpus import WikiCorpus
from .textcorpus import TextCorpus
from .ucicorpus import UciCorpus
from .malletcorpus import MalletCorpus
from .mmcorpus import MmCorpus # noqa:F401
from .bleicorpus import BleiCorpus # noqa:F401
from .svmlightcorpus import SvmLightCorpus # noqa:F401
from .lowcorpus import LowCorpus # noqa:F401
from .dictionary import Dictionary # noqa:F401
from .hashdictionary import HashDictionary # noqa:F401
from .wikicorpus import WikiCorpus # noqa:F401
from .textcorpus import TextCorpus # noqa:F401
from .ucicorpus import UciCorpus # noqa:F401
from .malletcorpus import MalletCorpus # noqa:F401
2 changes: 1 addition & 1 deletion gensim/corpora/bleicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from os import path
import logging

from gensim import interfaces, utils
from gensim import utils
from gensim.corpora import IndexedCorpus
from six.moves import xrange

Expand Down
4 changes: 2 additions & 2 deletions gensim/corpora/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,14 +333,14 @@ def merge_with(self, other):
old2new[other_id] = new_id
try:
self.dfs[new_id] += other.dfs[other_id]
except:
except Exception:
# `other` isn't a Dictionary (probably just a dict) => ignore dfs, keep going
pass
try:
self.num_docs += other.num_docs
self.num_nnz += other.num_nnz
self.num_pos += other.num_pos
except:
except Exception:
pass

import gensim.models
Expand Down
2 changes: 1 addition & 1 deletion gensim/corpora/hashdictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def add_documents(self, documents):
for docno, document in enumerate(documents):
if docno % 10000 == 0:
logger.info("adding document #%i to %s" % (docno, self))
_ = self.doc2bow(document, allow_update=True) # ignore the result, here we only care about updating token ids
self.doc2bow(document, allow_update=True) # ignore the result, here we only care about updating token ids
logger.info(
"built %s from %i documents (total %i corpus positions)",
self, self.num_docs, self.num_pos)
Expand Down
3 changes: 1 addition & 2 deletions gensim/corpora/indexedcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def __init__(self, fname, index_fname=None):
# change self.index into a numpy.ndarray to support fancy indexing
self.index = numpy.asarray(self.index)
logger.info("loaded corpus index from %s" % index_fname)
except:
except Exception:
self.index = None
self.length = None

Expand Down Expand Up @@ -130,5 +130,4 @@ def __getitem__(self, docno):
raise ValueError('Unrecognised value for docno, use either a single integer, a slice or a numpy.ndarray')



# endclass IndexedCorpus
14 changes: 7 additions & 7 deletions gensim/corpora/lowcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,24 +65,24 @@ def __init__(self, fname, id2word=None, line2words=split_on_space):
IndexedCorpus.__init__(self, fname)
logger.info("loading corpus from %s" % fname)

self.fname = fname # input file, see class doc for format
self.line2words = line2words # how to translate lines into words (simply split on space by default)
self.fname = fname # input file, see class doc for format
self.line2words = line2words # how to translate lines into words (simply split on space by default)
self.num_docs = self._calculate_num_docs()

if not id2word:
# build a list of all word types in the corpus (distinct words)
logger.info("extracting vocabulary from the corpus")
all_terms = set()
self.use_wordids = False # return documents as (word, wordCount) 2-tuples
self.use_wordids = False # return documents as (word, wordCount) 2-tuples
for doc in self:
all_terms.update(word for word, wordCnt in doc)
all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id
self.id2word = dict(izip(xrange(len(all_terms)), all_terms)) # build a mapping of word id(int) -> word (string)
all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id
self.id2word = dict(izip(xrange(len(all_terms)), all_terms)) # build a mapping of word id(int) -> word (string)
else:
logger.info("using provided word mapping (%i ids)" % len(id2word))
self.id2word = id2word
self.num_terms = len(self.word2id)
self.use_wordids = True # return documents as (wordIndex, wordCount) 2-tuples
self.use_wordids = True # return documents as (wordIndex, wordCount) 2-tuples

logger.info("loaded corpus with %i documents and %i terms from %s" %
(self.num_docs, self.num_terms, fname))
Expand Down Expand Up @@ -135,7 +135,7 @@ def __iter__(self):
"""
with utils.smart_open(self.fname) as fin:
for lineno, line in enumerate(fin):
if lineno > 0: # ignore the first line = number of documents
if lineno > 0: # ignore the first line = number of documents
yield self.line2doc(line)

@staticmethod
Expand Down
4 changes: 2 additions & 2 deletions gensim/corpora/malletcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ def __iter__(self):
yield self.line2doc(line)

def line2doc(self, line):
l = [word for word in utils.to_unicode(line).strip().split(' ') if word]
docid, doclang, words = l[0], l[1], l[2:]
splited_line = [word for word in utils.to_unicode(line).strip().split(' ') if word]
docid, doclang, words = splited_line[0], splited_line[1], splited_line[2:]

doc = super(MalletCorpus, self).line2doc(' '.join(words))

Expand Down
3 changes: 2 additions & 1 deletion gensim/corpora/mmcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

import logging

from gensim import interfaces, matutils
from gensim import matutils
from gensim.corpora import IndexedCorpus


Expand All @@ -23,6 +23,7 @@ class MmCorpus(matutils.MmReader, IndexedCorpus):
"""
Corpus in the Matrix Market format.
"""

def __init__(self, fname):
# avoid calling super(), too confusing
IndexedCorpus.__init__(self, fname)
Expand Down
Loading

0 comments on commit 6d6f5dc

Please sign in to comment.