# concurrent tokenization
based on https://explosion.ai/blog/multithreading-with-cython and https://github.com/explosion/spaCy/issues/172 perform concurrent tokenization 

## a minimal example

In [12]:
import itertools
# import en_core_web_md
import os
import spacy
from bs4 import BeautifulSoup
import pprint
import codecs

# nlp = en_core_web_md.load()
nlp = spacy.load('en')
#nlp = spacy.load('en_core_web_md') # this one did not have proper stopwords

In [13]:
texts = [u'One document.', u'...', u'Lots of documents']
# .pipe streams input, and produces streaming output
iter_texts = (texts[i % 3] for i in range(100000000))

In [14]:
for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)):
    assert doc.is_parsed
    if i == 300000:
        break
    #print(i)
    #print(doc)

In [15]:
def gen_items():
    print("Yield 0")
    yield (0, 'Text 0')
    print("Yield 1")
    yield (1, 'text 1')
    print("Yield 2")
    yield (2, 'Text 2')

gen1, gen2 = itertools.tee(gen_items())
ids = (id_ for (id_, text) in gen1)
texts = (text for (id_, text) in gen2)
docs = nlp.pipe(texts, batch_size=50, n_threads=4)
d = {}
for id_, doc in zip(ids, docs):
    print('id ' + str(id_))
    for token in doc:
       # print('token ' + str(token) + ' orth ' + token.orth_)
        if token.is_alpha and not token.is_stop and len(token.orth_) > 1:
            strtok = token.orth_.strip()
            if strtok not in d.keys():
                d[strtok] = {id_}
            elif strtok in d.keys():
                d[strtok].add(id_)

d

Yield 0
Yield 1
Yield 2
id 0
id 1
id 2


{'Text': {0, 2}, 'text': {1}}

## application to our context
now apply this to our minimal example when iterating with a nice generator function

> TODO we have around 3 different types of documents. Understand which ones are required an when to use what type of document.

In [16]:
numer_of_files = 1
for root, dirs, files in os.walk("data/TREC8all/Adhoc"):
        for file in files:
            if not file.startswith('.'):
                numer_of_files +=1  
numer_of_files

2295

In [17]:
def iter_all_files(p):
    for root, dirs, files in os.walk(p):
        for file in files:
            if not file.startswith('.'):
                print('using: ' + str(os.path.join(root, file)))
                yield os.path.join(root, file)
    
g = iter_all_files("data/TREC8all/Adhoc")
[next(g) for _ in range(3)]

using: data/TREC8all/Adhoc/fr94/06/fr9406210
using: data/TREC8all/Adhoc/fr94/06/fr9406220
using: data/TREC8all/Adhoc/fr94/06/fr9406100


['data/TREC8all/Adhoc/fr94/06/fr9406210',
 'data/TREC8all/Adhoc/fr94/06/fr9406220',
 'data/TREC8all/Adhoc/fr94/06/fr9406100']

now we need to parse the path and document ID into a second generator which later on can be zipped with the tokenized documents.

some additional stopwords are added below.

In [18]:
nlp.vocab["-PRON-"].is_stop = True

In [19]:
def iter_and_parse_all_files(p):
    for root, dirs, files in os.walk(p):
        for file in files:
            if not file.startswith('.'):
                print('using: ' + str(os.path.join(root, file)))
                path = os.path.join(root, file)
                text_file = codecs.open(path, 'r', "iso-8859-1").read()
                soup = BeautifulSoup(text_file,'lxml')
                for doc in soup.find_all("doc"):
                    strdoc = doc.docno.string.strip()
                    text_only = str(doc.find_all("text")[0])
                    # print("Yield id: "+ strdoc)
                    yield (strdoc, text_only)

In [20]:
def gen_items(path):
    path = next(path)
    # print(path)
    text_file = codecs.open(path, 'r', "iso-8859-1").read()
    soup = BeautifulSoup(text_file,'lxml')
    for doc in soup.find_all("doc"):
        strdoc = doc.docno.string.strip()
        text_only = str(doc.find_all("text")[0])
        # print("Yield id: "+ strdoc)
        yield (strdoc, text_only)

In [21]:
gen1, gen2 = itertools.tee(gen_items(g))
ids = (id_ for (id_, text) in gen1)
texts = (text for (id_, text) in gen2)
docs = nlp.pipe(texts, batch_size=50, n_threads=4)

In [22]:
(id, doc_token) = next(zip(ids, docs))

using: data/TREC8all/Adhoc/fr94/06/fr9406031


## handling of stopwords
basically standard stop words should be handled automatically. To add new ones follow https://github.com/explosion/spaCy/issues/226
Let's check this.

We will only use the text tag as a starter. Certainly this could be expanded in the future to use the header (h3, summary) as well.

spaCy already filters a lot of standard english and german stopwords. To add additional stopwords we will handle it as outlined below. Generally, we will filter for only alphanumeric tokens with a length > 1 to be indexed. But additionally, some stopwords were added manually.

Currently, a lot of non alphanumeric tokens show up as well.

- http://stackoverflow.com/questions/41170726/add-remove-stop-words-with-spacy
- https://nicschrading.com/project/Intro-to-NLP-with-spaCy/
    - from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
```
# A custom stoplist
STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS))
# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", "'v
```
- `nlp.vocab["definitelynotastopword"].is_stop = True`

In [24]:
# spacy.en.STOP_WORDS

In [25]:
d = {}
print('document id: ' + id)
for token in doc_token:
    if token.is_alpha:
        if not token.is_stop:
            if len(token.orth_) > 1:
                #print("original:", token.orth_, token.orth)
                #print("x:", token.)
                #print("lemma:", token.lemma_, token.lemma)
                #print("------------------------")
            
                strtok = token.lemma_.strip()
                if strtok not in d.keys():
                    d[strtok] = {id}
                elif strtok in d.keys():
                    d[strtok].add(id)

document id: FR940603-1-00001


In [26]:
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(d)

{   'federal': {'FR940603-1-00001'},
    'friday': {'FR940603-1-00001'},
    'frnewline': {'FR940603-1-00001'},
    'ftag': {'FR940603-1-00001'},
    'itag': {'FR940603-1-00001'},
    'june': {'FR940603-1-00001'},
    'pjg': {'FR940603-1-00001'},
    'register': {'FR940603-1-00001'},
    'rule': {'FR940603-1-00001'},
    'stag': {'FR940603-1-00001'},
    'text': {'FR940603-1-00001'},
    'vol': {'FR940603-1-00001'}}


> TODO
    - remove -PRON-
    - use concurrency, currently it still is just using a single core

In [27]:
file_counter = 0
d = {}

gen1, gen2 = itertools.tee(iter_and_parse_all_files("data/TREC8all/Adhoc"))
ids = (id_ for (id_, text) in gen1)
texts = (text for (id_, text) in gen2)
docs = nlp.pipe(texts, batch_size=100, n_threads=6)

# take only first k items to test it now
# for id_, doc in itertools.islice(zip(ids, docs), 2):
for id_, doc in zip(ids, docs):
    file_counter += 1
    # print('document id: ' + id_)
    for token in doc:
        if token.is_alpha and not token.is_stop and len(token.orth_) > 1:
            strtok = token.lemma_.strip()
            if strtok not in d.keys():
                d[strtok] = {id_: 1}
            elif strtok in d.keys():
                if id_ in d[strtok].keys():
                    d[strtok][id_] = d[strtok][id_] + 1
                else:
                    d[strtok][id_] = 1
    if file_counter == 100:
        break
                        
file_counter

using: data/TREC8all/Adhoc/fr94/06/fr9406210
using: data/TREC8all/Adhoc/fr94/06/fr9406220


100

In [11]:
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(d)

{   'abandon': {'FR940621-0-00011': 2, 'FR940621-0-00031': 5},
    'abandonment': {'FR940621-0-00011': 2, 'FR940621-0-00031': 1},
    'abatement': {   'FR940621-0-00014': 4,
                     'FR940621-0-00015': 5,
                     'FR940621-0-00016': 3,
                     'FR940621-0-00017': 3,
                     'FR940621-0-00018': 6,
                     'FR940621-0-00019': 12},
    'abdominal': {'FR940621-0-00031': 4},
    'aberration': {'FR940622-0-00053': 1},
    'ability': {   'FR940621-0-00005': 1,
                   'FR940621-0-00034': 1,
                   'FR940622-0-00039': 2,
                   'FR940622-0-00048': 1,
                   'FR940622-0-00055': 1},
    'able': {   'FR940621-0-00031': 2,
                'FR940621-0-00036': 1,
                'FR940622-0-00010': 1,
                'FR940622-0-00021': 1,
                'FR940622-0-00035': 1,
                'FR940622-0-00036': 1,
                'FR940622-0-00048': 1},
    'aboard': {'FR940621-0-00031':

persistence of created index

In [8]:
import pickle
pickle.dump( d, open( "savedic.p", "wb" ) )
d2 = pickle.load( open( "savedic.p", "rb" ) )

skiplist https://github.com/ZhukovAlexander/py-skiplist