In [1]:
"""
Basic indexing and searching example adapted from http://lucene.apache.org/core/7_2_0/core/index.html
"""
INDEX_DIR = "IndexFiles.index"

import sys, os, lucene, time, threading, unicodedata, re, codecs
from zipfile import ZipFile
from datetime import datetime

from java.nio.file import Paths
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig, IndexOptions
from org.apache.lucene.store import SimpleFSDirectory

class Ticker(object):
    """Console progress indicator: prints a dot per second until switched off."""

    def __init__(self):
        # Callers set this to False to make run() return.
        self.tick = True

    def run(self):
        """Write '.' to stdout, flush and sleep one second, for as long as ``tick`` is truthy."""
        while True:
            if not self.tick:
                return
            sys.stdout.write('.')
            sys.stdout.flush()
            time.sleep(1.0)

            
class IndexFiles(object):
    """Build a Lucene index from a zip archive of UTF-8 text files.

    Each line of each archive member is expected to look like
    ``<title> <number> <text>``; one Lucene document is added per such line.
    Constructing an instance performs the whole indexing run.
    """

    def __init__(self, root, storeDir, analyzer):
        """Index the archive ``root`` into ``storeDir`` and commit the result.

        Args:
            root: Path of the zip archive, passed on to ``indexDocs``.
            storeDir: Directory that holds the index; created if missing.
            analyzer: Lucene analyzer; it is wrapped in a
                ``LimitTokenCountAnalyzer`` with a limit of 1048576 tokens.

        Raises:
            Any exception from indexing or committing is re-raised after the
            writer has been rolled back and the progress ticker stopped.
        """
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = SimpleFSDirectory(Paths.get(storeDir))

        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(analyzer)
        # CREATE mode: an existing index in storeDir is overwritten.
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        ticker = Ticker()
        try:
            self.indexDocs(root, writer)
            print("commit index")
            # Print progress dots from a side thread while the commit runs.
            threading.Thread(target=ticker.run).start()
            writer.commit()
            writer.close()
        except Exception:
            # Release the writer (and its write lock) without committing a
            # half-built index, then let the caller see the failure.
            writer.rollback()
            raise
        finally:
            # Always stop the ticker; otherwise its non-daemon thread would
            # keep the process alive after a failed commit.
            ticker.tick = False
        print("done")

    @staticmethod
    def _parse_line(line):
        """Split one decoded line into ``(docname, name, contents)``.

        The line is NFD-normalized and split on the first two spaces.
        Returns ``None`` when the line has fewer than three fields or when
        the second field is not all digits.
        """
        parts = unicodedata.normalize('NFD', line).split(' ', 2)
        if len(parts) < 3 or not parts[1].isdigit():
            return None
        title, number, contents = parts
        docname = title + ' ' + number
        # Searchable form of the title: every non-alphanumeric character
        # becomes a space.
        name = re.sub(r'[^a-zA-Z0-9]', ' ', title)
        return docname, name, contents

    def indexDocs(self, root, writer):
        """Add one document per well-formed line of every member of ``root``.

        Args:
            root: Path of the zip archive to read.
            writer: Lucene ``IndexWriter`` that receives the documents.

        Lines rejected by ``_parse_line`` are skipped. A progress line is
        printed after each archive member.
        """
        # All three fields are stored and tokenized, indexed with term
        # frequencies (no positions).
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        with ZipFile(root, 'r') as wikiFile:
            # NOTE(review): the first archive entry is skipped, presumably
            # because it is the directory entry -- confirm against the archive.
            for i, entry in enumerate(wikiFile.namelist()[1:], 1):
                with wikiFile.open(entry, 'r') as wiki:
                    # Single pass over the member: every line, including the
                    # first, is decoded and considered.
                    for line in codecs.iterdecode(wiki, 'utf8'):
                        parsed = self._parse_line(line)
                        if parsed is None:
                            continue
                        docname, name, contents = parsed
                        doc = Document()
                        doc.add(Field('docname', docname, t1))
                        doc.add(Field('name', name, t1))
                        doc.add(Field('contents', contents, t1))
                        writer.addDocument(doc)
                print('File %d done indexing' % i, entry)
            
if __name__ == '__main__':
    # Start the JVM unless one is already attached. This is an explicit `if`
    # rather than an `assert` because asserts are stripped under `python -O`,
    # which would silently skip initVM.
    if not lucene.getVMEnv():
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print('lucene', lucene.VERSION)
    start = datetime.now()
    try:
        # The index is written to INDEX_DIR under the current working directory.
        base_dir = os.getcwd()
        IndexFiles('wiki-pages-text.zip', os.path.join(base_dir, INDEX_DIR), StandardAnalyzer())
        end = datetime.now()
        # Elapsed wall-clock time of the whole indexing run.
        print(end - start)
    except Exception as e:
        # Top-level boundary: report the failure instead of a traceback.
        print("Failed: ", str(e))

lucene 7.7.1
File 1 done indexing wiki-pages-text/wiki-009.txt
File 2 done indexing wiki-pages-text/wiki-021.txt
File 3 done indexing wiki-pages-text/wiki-035.txt
File 4 done indexing wiki-pages-text/wiki-034.txt
File 5 done indexing wiki-pages-text/wiki-020.txt
File 6 done indexing wiki-pages-text/wiki-008.txt
File 7 done indexing wiki-pages-text/wiki-036.txt
File 8 done indexing wiki-pages-text/wiki-022.txt
File 9 done indexing wiki-pages-text/wiki-023.txt
File 10 done indexing wiki-pages-text/wiki-037.txt
File 11 done indexing wiki-pages-text/wiki-033.txt
File 12 done indexing wiki-pages-text/wiki-027.txt
File 13 done indexing wiki-pages-text/wiki-026.txt
File 14 done indexing wiki-pages-text/wiki-032.txt
File 15 done indexing wiki-pages-text/wiki-024.txt
File 16 done indexing wiki-pages-text/wiki-030.txt
File 17 done indexing wiki-pages-text/wiki-018.txt
File 18 done indexing wiki-pages-text/wiki-019.txt
File 19 done indexing wiki-pages-text/wiki-031.txt
File 20 done indexing wiki-