## Arguments
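- db (`-d`, default `pmc`)
- collection (`-c`, default `articlesubset`)
- skip_field (`-s`, default none)
- extractors_to_use (`-x`, repeatable; no default — at least one must be given)
- num_workers (`-w`, default 4)
- mongo_url (`-u`, default `localhost`)
- limit (`-l`, default none: process every article matching the query)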

In [1]:
import os
# Move the working directory up one level. Presumably this makes the
# `pubcrawler` package importable from the notebook's folder -- TODO confirm.
# NOTE: the path is relative, so re-running this cell moves up one more
# directory each time; run it exactly once per kernel.
os.chdir("..")

In [2]:
    import multiprocessing as mp
    import time
    import sys
    import pymongo
    # from annotator.keyword_annotator import KeywordAnnotator
    # from annotator.geoname_annotator import GeonameAnnotator
    import pubcrawler.extractors as ex

In [8]:
    def chunk_slices(length, by):
        """Cut the index range ``0..length`` into consecutive slices of ``by``.

        Returns a list of ``slice`` objects.  When ``length`` is not a multiple
        of ``by`` the final slice is shorter and ends at ``length``; a
        ``length`` of 0 gives an empty list.
        """
        boundaries = list(range(0, length + 1, by))
        if length % by:
            # Ragged tail: close the last, shorter chunk at ``length``.
            boundaries.append(length)
        return [slice(start, stop) for start, stop in zip(boundaries, boundaries[1:])]

    def worker(url, db, collection, to_extract, query, index_queue):
        """Annotate the articles whose positions are pulled from ``index_queue``.

        Used as the target of a ``multiprocessing.Process``; every worker opens
        its own MongoDB connection.

        Parameters
        ----------
        url : host name or MongoDB URI handed to ``pymongo.MongoClient``.
        db, collection : names of the database and collection to work on.
        to_extract : extractor callables, passed unchanged to
            ``ex.combine_extracted_info``.
        query : MongoDB filter selecting the candidate articles.
        index_queue : queue of integer positions into ``find(query)``; the
            worker returns as soon as it reads the sentinel ``'STOP'``.

        NOTE(review): each position is resolved against a fresh, unsorted
        ``find(query)``.  If the ``$set`` below makes an article stop matching
        ``query`` (e.g. the query is ``{field: {'$exists': False}}`` and the
        extractor writes ``field``), later positions point at different
        documents, so some articles may be skipped or a position may fall
        past the end of the cursor -- verify against the extractors in use.
        """
        # Connect to the server the caller asked for.  The previous call,
        # MongoClient() with no argument, ignored `url` and always used the
        # driver's default host.
        articles = pymongo.MongoClient(url)[db][collection]
        for position in iter(index_queue.get, 'STOP'):
            print(position)
            article = articles.find(query)[position]
            to_write = ex.combine_extracted_info(article, to_extract)
            articles.update_one({'_id': article['_id']}, {'$set': to_write})

    import argparse

    # Script-style options.  They are parsed from the hard-coded list at the
    # bottom so the notebook can be run top to bottom without a command line.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-u", "--mongo_url", default="localhost", dest="u",
        help="MongoDB host or URI to connect to",
    )
    parser.add_argument(
        "-d", "--mongo_db", default="pmc", dest="d",
        help="name of the database",
    )
    parser.add_argument(
        "-c", "--mongo_collection", default="articlesubset", dest="c",
        help="name of the collection holding the articles",
    )
    parser.add_argument(
        "-x", "-extract", action="append", default=None, dest="x",
        help="name of an extractor function; may be given several times",
    )
    parser.add_argument(
        "-s", "-skip_field", default=None, dest="s",
        help="only process articles that do not have this field yet",
    )
    # type=int: without it a value given on the command line stays a string.
    parser.add_argument(
        "-w", "-workers", default=4, type=int, dest="w",
        help="number of worker processes",
    )
    # type=int: the limit is later used as the argument of range().
    parser.add_argument(
        "-l", "-limit", default=None, type=int, dest="l",
        help="maximum number of articles to process",
    )
    args = parser.parse_args(["-x", "extract_disease_ontology_keywords", "-s", "keywords", "-w", "8", "-c", "articlesubset"])

    if args.x is not None:
        # Resolve each requested extractor as an attribute of the extractors
        # module.  This replaces eval() on the option text, so an option value
        # can only ever name something in `ex`, never run arbitrary code.
        extractor_funs = [getattr(ex, name) for name in args.x]
    else:
        print("Please specify at least one extractor function", file=sys.stderr)
        sys.exit(1)

    if args.s is not None:
        # Restrict the run to articles that do not have the skip field yet.
        query = {args.s: {'$exists': False}}
    else:
        # No skip field given: every article in the collection is a candidate.
        query = {}

    print("Making connection.")
    articles = pymongo.MongoClient(args.u)[args.d][args.c]

    print("About to count.")
    # NOTE(review): Collection.count() is deprecated in later PyMongo releases
    # in favour of count_documents(); confirm against the installed version.
    total_for_query = articles.count(query)
    if args.l is not None:
        # The limit can arrive as text from argparse, and range() needs an int.
        # Cap it at the number of matches so no worker is handed a position
        # past the end of the cursor.
        num_to_annotate = min(int(args.l), total_for_query)
    else:
        num_to_annotate = total_for_query
    num_workers = int(args.w)
    print("Total for query is {}.".format(total_for_query))

    # Work queue shared by all workers: one entry per article position,
    # followed by one 'STOP' sentinel per worker so each of them terminates.
    queue = mp.Queue()
    for position in range(num_to_annotate):
        queue.put(position)
    for _ in range(num_workers):
        queue.put('STOP')

    # # Chunking, which we don't do any more.
    # queue = mp.Queue()
    # for i in chunk_slices(num_to_annotate, by = 100):
    #     queue.put(i)
    # for w in range(num_workers):
    #     queue.put('STOP')

    # Positional arguments for worker(), in signature order:
    # (url, db, collection, to_extract, query, index_queue).
    worker_args = (args.u, args.d, args.c, extractor_funs, query, queue)

    print("About to start.")

Making connection.
About to count.
Total for query is 7753.
About to start.


In [None]:
    # Keep the Process handles so the workers can be inspected, joined or
    # terminated from a later cell.  They are not joined here: start() returns
    # immediately, so the following cells can run while the workers are busy.
    worker_processes = [
        mp.Process(target=worker, args=worker_args) for _ in range(num_workers)
    ]
    for process in worker_processes:
        process.start()

In [None]:
articles.count(query)

In [None]:
query

In [None]:
import pubcrawler.extractors as extractors

In [None]:
extractors.extract_meta

In [None]:
# Resolve an extractor from its name.  A bare eval('extract_meta') only works
# when that name has been imported directly into the notebook namespace, which
# no visible cell does; look it up on the imported module instead.
getattr(extractors, 'extract_meta')

In [None]:
time.time()

In [6]:
# Timestamp for the progress message, formatted from the current local time.
now = time.strftime("%Y-%m-%d %H:%M:%S")

In [10]:
str(query)

"{'keywords': {'$exists': False}}"

In [7]:
# Re-count the articles that still match the query and report it together
# with the timestamp.  The format string has three placeholders, so all three
# values must be supplied (passing only `now` raises IndexError).
total_for_query = articles.count(query)
"At {}, query {} finds {} articles".format(now, query, total_for_query)

'foo2016-09-24 17:53:38'

In [11]:
# Re-run the count to see how many articles currently match `query`.
# Presumably this serves as a progress check while the workers are running
# (with a skip field set, annotated articles stop matching) -- TODO confirm.
total_for_query = articles.count(query)

In [12]:
total_for_query

7702