In [1]:
# !pip install -U botok -q
# !pip install -U openpecha -q

In [2]:
import csv
from pathlib import Path
from tqdm import tqdm

from botok import WordTokenizer, sentence_tokenizer
from openpecha.corpus.download import download_corpus

In [3]:
def _mkdir(path: Path) -> Path:
  path.mkdir(exist_ok=True, parents=True)
  return path

# Root folder for everything this notebook stores, located in the user's home directory.
BASE_PATH = Path.home() / ".models"
# Data folder under BASE_PATH; `_mkdir` creates it (and BASE_PATH) as soon as this cell runs.
DATA_PATH = _mkdir(BASE_PATH / "data")

In [4]:
# Show which openpecha release the notebook was executed with.
import openpecha

openpecha.__version__

'0.8.8'

## Download Corpus

In [5]:
# Folder that holds the "literary_bo" corpus; created on the spot if it is missing.
corpus_path = _mkdir(DATA_PATH / "literary_bo")

In [6]:
# download_corpus("literary_bo", output_path=DATA_PATH)

## Tokenize corpus sentences

In [7]:
# Single shared botok tokenizer; `sent_tokenize` reads this module-level name.
wt = WordTokenizer()

Loading Trie... (2s.)


In [8]:
def sent_tokenize(text):
    """Split `text` into sentences, each given as a list of token strings.

    The text is tokenized with the module-level tokenizer `wt`, the tokens are
    grouped by `sentence_tokenizer`, and for every resulting sentence the
    `.text` of each entry in its "tokens" list is collected.
    """
    grouped = sentence_tokenizer(wt.tokenize(text))
    result = []
    for sentence in grouped:
        result.append([token.text for token in sentence["tokens"]])
    return result

In [9]:
def tokenize(text: str):
  """Render `text` as one sentence per line with space-separated tokens.

  Sentences come from `sent_tokenize`. Spaces inside a token are replaced by
  underscores, so a space in the output always marks a token boundary. Every
  sentence line, including the last one, ends with a newline.
  """
  lines = []
  for words in sent_tokenize(text):
    joined = " ".join(word.replace(" ", "_") for word in words)
    lines.append(joined + "\n")
  return "".join(lines)

In [10]:
def get_text_from_tsv_file(fn):
    """Rebuild running text from a token-per-row TSV file.

    The first row is treated as a header and skipped. For each remaining row
    the first column is the token; rows without any further column are
    ignored, and so are blank lines (which previously raised ``ValueError``
    during unpacking). Tokens are concatenated without a separator, except:

    * a token that is exactly '།' gets a trailing space,
    * an empty token becomes a single space.

    Finally the sequences ' ། ' and '  །' are both collapsed to ' །'.

    Args:
        fn: ``pathlib.Path`` of the TSV file.
            NOTE(review): the file is read as UTF-8 — confirm the corpus
            files are stored in that encoding.

    Returns:
        str: the reconstructed text; "" when the file has no usable rows.
    """
    def handle_punct(token):
        # An empty first column stands for whitespace in the original text.
        if not token:
            return ' '
        # A lone shad is followed by a space; every other token is kept as is.
        return token + ' ' if token == '།' else token

    # Explicit encoding: the platform default is not UTF-8 everywhere.
    # newline="" is what the csv module documentation asks for.
    with fn.open(encoding="utf-8", newline="") as file:
        tsv_reader = csv.reader(file, delimiter="\t")
        next(tsv_reader, None)  # skip header
        # len(row) > 1 keeps the original "token plus at least one more
        # column" filter and also drops the [] that csv yields for blank lines.
        tokens = [handle_punct(row[0]) for row in tsv_reader if len(row) > 1]
    text = "".join(tokens)
    text = text.replace(' ། ', ' །')
    text = text.replace('  །', ' །')
    return text

In [11]:
# test_fn = corpus_path / "PC7518B4E" / "Magnetizing (Wangdü) Prayer.tsv"
# assert test_fn.is_file()
# text = get_text_from_tsv_file(test_fn)
# print(text)
# " ".join([t.text.strip() for t in wt.tokenize(text)])

In [12]:
def tokenize_corpus(path, replace=False):
  """Write a sentence-tokenized `.txt` next to every source file of the corpus.

  `path` is expected to contain one directory per pecha. For each file in a
  pecha directory the text is rebuilt with `get_text_from_tsv_file`, tokenized
  with `tokenize`, and written to `<stem>.txt` in the same directory.

  Skipped entries:
    * anything at the corpus root that is not a directory,
    * anything inside a pecha directory that is not a regular file,
    * files named "README.md",
    * files that are themselves a tokenized output (their target path is the
      file itself), so a `replace=True` run never feeds outputs back in,
    * files whose target already exists, unless `replace` is true,
    * files that yield no text.

  Args:
    path: `pathlib.Path` of the corpus root.
    replace: when true, existing `.txt` outputs are regenerated.
  """
  # Only directories can be iterated; stray files at the root used to raise.
  pecha_dirs = [entry for entry in path.iterdir() if entry.is_dir()]
  for pecha_path in tqdm(pecha_dirs):
    for fn in pecha_path.iterdir():
      if fn.name == "README.md" or not fn.is_file():
        continue
      fn_tokenized = fn.parent / f"{fn.stem}.txt"
      if fn_tokenized == fn:
        continue  # this is an output of an earlier run, not a source file
      if fn_tokenized.is_file() and not replace:
        continue
      text = get_text_from_tsv_file(fn)
      if not text:
        continue
      sents_str = tokenize(text)
      # Explicit encoding so the output does not depend on the platform default.
      fn_tokenized.write_text(sents_str, encoding="utf-8")

In [13]:
tokenize_corpus(corpus_path)

100%|██████████| 729/729 [00:00<00:00, 734.69it/s]


## Remove OCR'd Pechas

### OCR'd pecha list

In [14]:
# Pecha IDs to remove from the corpus, one ID per line.
# NOTE(review): presumably these are the pechas whose text was produced by OCR
# (per the section heading) — confirm the source of this list.
# The literal starts and ends with a newline, so splitting it by lines yields an
# empty first entry that consumers have to drop.
ocred_pechas = """
P01F21F5A
P026D4391
P0697AF99
P06BF0616
P0788F8B8
P08383808
P0A443473
P0AC134F0
P0AEFD209
P0B226B66
P0DAA54A8
P0E1010CD
P0E807A82
P0F30A079
P1091119C
P10B76D16
P110BD211
P113161E2
P113F7BC8
P144985F2
P14720CB9
P155F5649
P16118F66
P17CC7922
P1864259A
P188A585C
P198795E5
P19E24C98
P1A2DE57C
P1A43637D
P1AF6985A
P1C88EEF8
P1C9C94AB
P1D9555BD
P2067CEA6
P20BBBE6B
P21925A26
P2239AE39
P22666E63
P24075444
P2491BBDB
P2527D757
P254DAAC1
P26D4E88A
P26EDC783
P27AB7411
P282804D9
P283BC233
P287F972E
P29496699
P294BAA75
P29636E09
P29D39AC1
P29EA4909
P2C2E1541
P2C9BB63E
P2CE84FFA
P2ED1D856
P2F90DF28
P2FFAE40A
P30202CAE
P30DB05F5
P31D764AE
P3278EEC7
P334F43A6
P337793B3
P337BB393
P338E7914
P33961491
P33CEF55B
P33F4F171
P348CCA3B
P34B44838
P36962C88
P36FB8EC6
P381A84C8
P3833DD30
P3AA51EB3
P3BBF3EBB
P3F8E116E
P3FAE1A3D
P40878D88
P412EE9B8
P41BF55A7
P424ADC12
P42927191
P433F86DD
P43CDC54B
P44C1CCD4
P44DB1160
P466F2DB5
P46BA774E
P48030325
P4838E6DF
P49444617
P49532218
P49AB4705
P4A4543DF
P4A79CA96
P4AB9700A
P4B747B14
P4BEAE2B4
P4D49B116
P4D50F522
P4DD742D4
P4EE4BFC4
P4FEBBEB1
P500A6D15
P51551359
P51877FA9
P53F474AA
P544359D8
P5588A6DD
P56543757
P566E6516
P5714553A
P57E1008F
P585E9C48
P5B67A599
P5BB99BFC
P5EDEE959
P5FBF9264
P60A476AF
P616F5649
P62393212
P65C7DBB1
P65D46325
P66AA1AAF
P66BFBFB3
P684E8530
P68C596F3
P68FAC246
P6924A626
P6A466669
P6B054081
P6D4F3D7B
P6D506273
P6DD7575A
P7033654F
P7088A075
P70B160A2
P715EB229
P75047777
P77923723
P7823530D
P785CC1E0
P7894C918
P78C27968
P78F45885
P798A42C8
P798ACACA
P79FB9079
P7A77DD3B
P7CB3433E
P7DE04668
P7FD26C6D
P812D0142
P81D5D3BB
P8393A728
P84000EE5
P8430AE09
P85B07A43
P85E42A23
P878B3099
P87E8045D
P881819EC
P88BB8A6D
P88DBDE85
P8A42786E
P8A7F6B8F
P8ADA4C9F
P8B8F3BDF
P8BA93480
P8DF96DDE
P8E2E19D5
P8E30FF75
P8E9577F2
P8F3E3605
P8F7E6716
P8F852FC2
P8F8AE619
P8FAF494E
P8FFF77EA
P9042F799
P92C58293
P944EFA0F
P94796829
P94D449DC
P9529D216
P96979E13
P96AEFF26
P97C23560
P9859AC89
P986A9269
P9908D0FF
P994E04E2
P99519F5A
P99A856E3
P9A170187
P9A9B499B
P9B95BAE7
P9B9FB0B9
P9BBC3EB7
P9C37C008
P9C50163C
P9CE47C63
P9D2789E2
P9E509599
P9EA16235
P9EF2A200
P9F0995F3
P9F413048
P9FA5AF1A
P9FE1569B
PA131370F
PA25DC576
PA3AAA755
PA3B7B725
PA3EEECBC
PA54901B4
PA7FFF0BF
PA801C7E4
PA80E0B8F
PA813872C
PA8E792F4
PAA0B044F
PAA58A835
PAA66A219
PAABDA95B
PAB01114E
PAB14B1EE
PAC99EE65
PACEB2227
PAD4A3EA6
PADB8EB88
PB1A14D69
PB224FC0E
PB4DF81EE
PB5BBB374
PB6381FFB
PB66544E1
PB6D621AB
PB70EC6A0
PB76BB967
PB7F878B9
PB7FA228A
PB844DFBF
PB9296EBB
PBAA77B0B
PBB0761B2
PBB66BD34
PBB71AB42
PBBBD484B
PBC7AA818
PBD55AA4B
PBD821407
PBECA77DE
PBECD3E0A
PC0081177
PC0394549
PC05A4040
PC2657272
PC3A0EF41
PC40A0DED
PC4882A7C
PC5347426
PC5854F18
PC5ED7728
PC66ADC16
PC6BC8C44
PC727A454
PC747646F
PC7B2786D
PC81DFCF1
PC91C4D33
PC91E14C8
PC92B45CA
PCA3C26E9
PCB220247
PCC0110FC
PCCF40755
PCD11A403
PCEAB6469
PCFA9A603
PD00453B4
PD04CFDAB
PD2B30082
PD36AE42F
PD38CBA5A
PD5E7310D
PD614F962
PD72465D7
PD724AD23
PD8794FAF
PD8A69996
PD999B32F
PDAB83DAA
PDACE9744
PDB80E67B
PDC1F94E1
PDC9BC1CC
PDD434042
PDE8203EC
PDE8627A7
PDF16DDFD
PE157C772
PE2747439
PE3120E6D
PE40C4CEB
PE4319A40
PE4839839
PE49C0B92
PE4A7B419
PE4E9EEAE
PE5A4B266
PE5AD226B
PE5B105BF
PE7AD7D99
PE8E41E91
PEC666E34
PECB486E0
PED0BFF13
PED108D67
PEDBD20DB
PF1C85D83
PF2477F09
PF299AD42
PF405D0C0
PF4A7554B
PF59BA2A6
PF73D47F7
PF920EFD9
PF9F5BF88
PFA753BB7
PFB2E808F
PFB73DBBF
PFC6C66C1
PFC8448CB
PFE664748
PFE9F59EF
PFEE9B370
PFF17FF4A
PFF2C68BF
PFF5CF90C
"""

### Remove from local

In [17]:
# Split the ID block into lines and drop the empty ones (the literal starts
# with a newline, so the first line is blank).
ocred_pecha_list = list(filter(None, ocred_pechas.splitlines()))
len(ocred_pecha_list)

344

In [18]:
import shutil

In [21]:
# Delete the local directory tree of every corpus entry whose name is one of
# the listed pecha IDs; all other entries are left alone.
for pecha_path in corpus_path.iterdir():
    if pecha_path.name not in ocred_pecha_list:
        continue
    shutil.rmtree(str(pecha_path))

### Remove from corpus_catalog

In [26]:
from openpecha.corpus.download import get_corpus_catalog, get_corpus_items_count, get_request_session
# Import the progress-bar callable, not the module: a bare `import tqdm` would
# rebind the name `tqdm` to the module and make every later `tqdm(...)` call
# fail with "TypeError: 'module' object is not callable".
from tqdm import tqdm

In [None]:
session = get_request_session()
for row in tqdm(cle