In [2]:
import sys;sys.path.append('..')
from ppanlp.corpus import *
from ppanlp.cleanup import *

NameError: name 'PATH_PPA_CORPUS' is not defined

In [7]:
def iter_json(fn):
    """Yield the top-level items of the JSON document stored at ``fn``.

    A filename ending in ``.gz`` is read through ``gzip``; anything else is
    opened as plain text. Both are decoded as UTF-8. The whole document is
    parsed with ``json.load`` before the first item is yielded, so for a JSON
    array this yields its elements and for a JSON object it yields its keys.

    A path that does not exist yields nothing instead of raising.
    """
    if not os.path.exists(fn):
        return
    # Same text-mode read either way; only the opener differs.
    opener = gzip.open if fn.endswith('.gz') else open
    with opener(fn, 'rt', encoding='UTF-8') as handle:
        yield from json.load(handle)

def write_json(obj, fn):
    """Serialize ``obj`` as JSON to ``fn``, creating parent directories first.

    A filename ending in ``.gz`` is written through ``gzip``; anything else is
    written as plain text. Both are encoded as UTF-8.

    Args:
        obj: Any value ``json.dump`` can serialize.
        fn: Destination path. May be a bare filename, in which case the file
            is written to the current working directory.
    """
    parent = os.path.dirname(fn)
    # dirname() is '' for a bare filename, and os.makedirs('') raises
    # FileNotFoundError, so only create directories when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    opener = gzip.open if fn.endswith('.gz') else open
    with opener(fn, 'wt', encoding='UTF-8') as out:
        json.dump(obj, out)

In [2]:
@cache
def PPA(srcdir=PATH_PPA_CORPUS):
    """Return the PPACorpus rooted at ``srcdir``, memoized per argument.

    Because of ``@cache``, repeated calls with the same ``srcdir`` return the
    same PPACorpus object.

    Args:
        srcdir: Corpus root directory; defaults to ``PATH_PPA_CORPUS``.
    """
    # Pass the argument through so a non-default directory is actually opened.
    return PPACorpus(srcdir)


class PPACorpus:
    """A PPA corpus on disk: one metadata JSON file plus directories of page files.

    Iterating over a corpus yields one PPAText per row of the metadata table.
    """

    # Metadata column that becomes the index of `meta`; its values are the
    # work ids accepted by get_text() / iter_texts().
    WORK_ID_FIELD = 'id'

    def __init__(self, path:str, texts_dir='texts', metadata_fn='metadata.json', texts_preproc_dir='texts_preproc'):
        """Record where the corpus lives; nothing is read from disk here.

        Args:
            path: Corpus root. ``~`` is expanded and the result made absolute.
            texts_dir: Directory of original page files.
            metadata_fn: Metadata JSON file.
            texts_preproc_dir: Directory for preprocessed page files.

        The last three are taken as-is when absolute, otherwise resolved
        relative to ``path``.
        """
        self.path = os.path.abspath(os.path.expanduser(path))
        self.path_texts = self._resolve(texts_dir)
        self.path_texts_preproc = self._resolve(texts_preproc_dir)
        self.path_metadata = self._resolve(metadata_fn)

    def _resolve(self, name):
        """Return ``name`` unchanged if absolute, else joined onto the corpus root."""
        return name if os.path.isabs(name) else os.path.join(self.path, name)

    def __iter__(self): yield from self.iter_texts()

    @cached_property
    def meta(self):
        """Metadata table: the JSON file as a DataFrame, missing values replaced
        by ``''``, indexed by ``WORK_ID_FIELD``. Read once per instance."""
        return pd.read_json(self.path_metadata).fillna('').set_index(self.WORK_ID_FIELD)

    def get_text(self, work_id):
        """Return the PPAText for ``work_id``; the same object on repeated calls.

        Texts are memoized on this instance rather than with ``functools.cache``:
        a module-level cache keyed on ``self`` would keep every corpus (and all
        of its texts) alive for the life of the process.
        """
        texts = self.__dict__.setdefault('_text_cache', {})
        if work_id not in texts:
            texts[work_id] = PPAText(work_id, corpus=self)
        return texts[work_id]

    def iter_texts(self, work_ids=None):
        """Yield a PPAText per work id, showing a tqdm progress bar.

        Args:
            work_ids: Iterable of ids to visit; defaults to every id in the
                index of ``meta``.
        """
        if work_ids is None: work_ids=self.meta.index
        work_ids = tqdm(work_ids, position=1, desc='Iterating over texts in PPA')
        for work_id in work_ids:
            yield self.get_text(work_id)


class PPAText:
    """One work in a PPA corpus, exposed as a sequence of page records.

    Iterating over a text yields its pages: the preprocessed ones when
    ``clean`` is true, otherwise the originals.
    """

    # Metadata field whose value is the base filename of this work's page files.
    FILE_ID_KEY='work_id'

    def __init__(self, id, clean=True, corpus=None):
        """
        Args:
            id: Row label of this work in ``corpus.meta``.
            clean: Default for ``iter_pages``: read preprocessed pages when
                true, original pages when false.
            corpus: Owning corpus; when None, the default corpus from
                ``PPA()`` is used.
        """
        self.id=id
        self.corpus=corpus if corpus is not None else PPA()
        self.clean=clean

    def __iter__(self): yield from self.iter_pages()

    @cached_property
    def meta(self):
        """This work's row of the corpus metadata table, as a plain dict."""
        return dict(self.corpus.meta.loc[self.id])

    @cached_property
    def path(self):
        """Path of the original page file: ``<texts dir>/<work_id>.json``."""
        return os.path.join(self.corpus.path_texts, self.meta[self.FILE_ID_KEY]+'.json')

    @cached_property
    def path_preproc(self):
        """Path of the preprocessed page file: ``<preproc dir>/<work_id>.json.gz``."""
        return os.path.join(self.corpus.path_texts_preproc, self.meta[self.FILE_ID_KEY]+'.json.gz')

    @cached_property
    def pages_df(self):
        """Pages (as ``iter_pages`` yields them) in a DataFrame indexed by ``page_id``."""
        return pd.DataFrame(self.iter_pages()).set_index('page_id')

    @cached_property
    def pages(self):
        """List of pages as ``iter_pages`` yields them; computed once."""
        return list(self.iter_pages())

    @cached_property
    def pages_orig(self):
        """List of the original (unprocessed) pages; computed once."""
        return list(self.iter_pages_orig())

    def clean_pages(self,remove_headers=True,force=False):
        """Build the preprocessed page file if it is missing, or always when forced.

        Args:
            remove_headers: Passed through to ``cleanup_pages``.
            force: Rebuild even when the preprocessed file already exists.
        """
        if force or not os.path.exists(self.path_preproc):
            new_pages = cleanup_pages(self.pages_orig, remove_headers=remove_headers)
            write_json(new_pages, self.path_preproc)
            # The file on disk just changed; drop memoized views that may have
            # been built from the previous contents so they are recomputed.
            for stale in ('pages', 'pages_df'):
                self.__dict__.pop(stale, None)

    def iter_pages_orig(self):
        """Yield page records from the original page file (nothing if it is missing)."""
        yield from iter_json(self.path)

    def iter_pages_preproc(self):
        """Yield page records from the preprocessed file, building it first if needed."""
        self.clean_pages()
        yield from iter_json(self.path_preproc)

    def iter_pages(self, clean=None):
        """Yield preprocessed pages when ``clean`` is true, originals otherwise.

        Args:
            clean: Overrides the instance default; None means use ``self.clean``.
        """
        # Identity test: only a literal None falls back to the instance default.
        if clean is None:
            clean = self.clean
        yield from self.iter_pages_preproc() if clean else self.iter_pages_orig()
        

In [3]:
# c = PPA()
# c.meta

In [4]:
# Open one work by its metadata id. No corpus is passed, so PPAText falls back
# to the default corpus from PPA(). Then display its pages as a DataFrame
# indexed by page_id.
t = PPAText('yale.39002032008188')
t.pages_df

Unnamed: 0_level_0,page_num,page_num_orig,page_corrections,page_text,page_text_orig,page_tokens
page_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
yale.39002032008188_1,1,1,"{'headers': [], 'linebreaks': [], 'long_s': []...",,,[]
yale.39002032008188_2,2,2,"{'headers': [], 'linebreaks': [], 'long_s': []...",YALE\nDIVINITY SCHOOL\nLIBRARY\n\nGift of\n\n,YALE\nDIVINITY SCHOOL\nLIBRARY\n\nGift of\n\n,"[yale, divinity, school, library, gift, of]"
yale.39002032008188_3,3,3,"{'headers': [], 'linebreaks': [], 'long_s': []...",,,[]
yale.39002032008188_4,4,4,"{'headers': [], 'linebreaks': [], 'long_s': []...",,,[]
yale.39002032008188_5,5,5,"{'headers': [], 'linebreaks': [], 'long_s': []...",,,[]
...,...,...,...,...,...,...
yale.39002032008188_322,322,322,"{'headers': [], 'linebreaks': [], 'long_s': []...",,,[]
yale.39002032008188_323,323,323,"{'headers': [], 'linebreaks': [], 'long_s': []...",,,[]
yale.39002032008188_324,324,324,"{'headers': [], 'linebreaks': [], 'long_s': []...",,,[]
yale.39002032008188_325,325,325,"{'headers': [], 'linebreaks': [], 'long_s': []...",3 9002 03200 8188\n\n,3 9002 03200 8188\n\n,[]


In [5]:
# Build the preprocessed page file for this work; does nothing if it already exists.
t.clean_pages()