# BPE Tokenization

## BPE Tokenization (Content, Sent-per-line)

In [5]:
import os
import sys
import glob
import pickle
import iso8601
import pandas  as pd
from tqdm import tqdm
import multiprocessing
import pyarrow.feather as feather
from pytorch_pretrained_bert import BertTokenizer, BasicTokenizer
from babel.dates import format_date, format_datetime, format_time
multi_process_num = multiprocessing.cpu_count()

# Collect the filtered-data shards and the matching sentence/entity-index shards.
# NOTE: sorted() orders the paths lexicographically, which agrees with numeric shard
# order only while the numeric file-name prefixes are single digits; the check below
# fails loudly if that stops being true.
ranked_data_files_paths = sorted(glob.glob("./0_Corpus/0_NYT_Data_Extraction/1_NYT_FilteredData_Files/*.feather"))
ranked_entAndsent_files_paths = sorted(glob.glob("0_Corpus/0_NYT_Data_Extraction/2_EntInfor_AND_SentIdxInfor_Files/*.feather"))

# zip() would silently drop unmatched shards, so compare the counts first.
if len(ranked_data_files_paths) != len(ranked_entAndsent_files_paths):
    raise ValueError(
        f"Shard count mismatch: {len(ranked_data_files_paths)} data files vs "
        f"{len(ranked_entAndsent_files_paths)} sentence-info files"
    )

# Every pair must carry the same leading index, equal to its position in the sorted list.
for file_idx, (fpath1, fpath2) in enumerate(zip(ranked_data_files_paths, ranked_entAndsent_files_paths)):
    f_idx1 = os.path.basename(fpath1).split("_")[0]
    f_idx2 = os.path.basename(fpath2).split("_")[0]
    if not (f_idx1 == f_idx2 == str(file_idx)):
        raise ValueError(
            f"Shard index mismatch at position {file_idx}: {fpath1!r} vs {fpath2!r}"
        )
print(len(ranked_data_files_paths),len(ranked_entAndsent_files_paths))

10 10


In [None]:
# Reads the first path in ranked_data_files_paths into a module-level DataFrame.
# NOTE(review): no later visible cell reads this global (get_chunks loads its own
# copy under the same local name) — looks like an exploratory leftover; confirm
# before removing.
filtered_pd = feather.read_feather(ranked_data_files_paths[0])

In [2]:
def tokenize(cased_lines, tokenizer, basic_tokenizer, worker_id, batch_offset):
    """Split each input line into sub-word tokens.

    Every line is first split by ``basic_tokenizer.tokenize`` and each resulting
    token is then split again by ``tokenizer.tokenize``; the pieces are
    concatenated in order.

    Args:
        cased_lines: iterable of text lines.
        tokenizer: object whose ``tokenize(token)`` returns the sub-tokens of one token.
        basic_tokenizer: object whose ``tokenize(line)`` returns the tokens of one line.
        worker_id: opaque value handed back unchanged.
        batch_offset: opaque value handed back unchanged.

    Returns:
        ``(worker_id, sentences, batch_offset)`` where ``sentences`` holds one
        token list per kept line. A line that produces no tokens is dropped,
        unless the line is exactly ``""``, in which case an empty list is kept.
    """
    tokenized_lines = []
    for raw_line in cased_lines:
        pieces = [
            piece
            for word in basic_tokenizer.tokenize(raw_line)
            for piece in tokenizer.tokenize(word)
        ]
        # Keep real content, and keep the empty-string marker as an empty sentence.
        if pieces or raw_line == "":
            tokenized_lines.append(pieces)
    return worker_id, tokenized_lines, batch_offset

def get_chunks(fpaths_list, chunk_size):
    """Yield batches of sentence strings built from one pair of feather files.

    The data file and the sentence-info file are loaded and inner-joined on
    ``'ID'``. For every document, each entry of its ``'sent_infor'`` value is
    parsed as ``"<begin>_X_<end>"`` and used to slice ``'maintext'``; the
    stripped, non-empty slices are appended to the current batch, followed by
    an empty string that marks the end of the document.

    Args:
        fpaths_list: ``(data_file_path, sentinfor_file_path)``.
        chunk_size: number of documents per yielded batch.

    Yields:
        Lists of strings covering ``chunk_size`` documents each; the last list
        may cover fewer. An empty trailing batch is never yielded.

    Raises:
        AssertionError: if the join drops or duplicates documents.
    """
    data_file_path, sentinfor_file_path = fpaths_list
    filtered_pd = feather.read_feather(data_file_path)
    filtered_pd = filtered_pd.rename(columns={'file_id': 'ID', 'pub': 'date_publish', 'body_text': 'maintext'})
    sentinfor_pd = feather.read_feather(sentinfor_file_path)
    combined_pd = pd.merge(filtered_pd, sentinfor_pd, on='ID')
    # The inner join must neither drop nor duplicate documents.
    assert len(filtered_pd)==len(sentinfor_pd)==len(combined_pd)

    def _sentences(text, span_list):
        # Each span is encoded as "<begin>_X_<end>" offsets into text.
        for span in span_list:
            span_beg, span_end = map(int, span.split("_X_"))
            yield text[span_beg:span_end]

    chunk = []
    doc_chunk_num = 0
    # Iterate the two needed columns directly; iterrows() builds a Series per row.
    for main_text, sent_infor_list in zip(combined_pd["maintext"], combined_pd["sent_infor"]):
        for sent_text in _sentences(main_text, sent_infor_list):
            line_text = sent_text.strip()
            if line_text:
                chunk.append(line_text)
        # Empty string marks the document boundary.
        chunk.append("")
        doc_chunk_num += 1
        if doc_chunk_num == chunk_size:
            yield chunk
            chunk = []
            doc_chunk_num = 0
    # Skip the empty remainder left when the document count is a multiple of chunk_size.
    if chunk:
        yield chunk
    
def process(fpaths_list, chunk_method, output_file, bert_model_type='bert-base-cased', total=100000000, chunk_size=10000, workers=12):
    """Tokenize every batch produced by ``chunk_method`` and write it to ``output_file``.

    Each batch is cut into up to ``workers`` contiguous slices that are
    tokenized in parallel by ``tokenize``; the results are written in slice
    order, one sentence per line with tokens joined by single spaces. An empty
    token list is written as a blank line.

    Args:
        fpaths_list: passed unchanged to ``chunk_method``.
        chunk_method: callable ``(fpaths_list, chunk_size)`` yielding lists of
            line strings. Earlier versions ignored this argument and always
            called the module-level ``get_chunks``.
        output_file: path opened in text mode ``'w'``.
        bert_model_type: name passed to ``BertTokenizer.from_pretrained``.
        total: only used for the progress bar, whose total is ``total // chunk_size``.
        chunk_size: passed to ``chunk_method``.
        workers: number of slices per batch and size of the process pool.

    Raises:
        Any exception raised inside a worker is re-raised here instead of
        being silently dropped.
    """
    tokenizer = BertTokenizer.from_pretrained(bert_model_type)
    basic_tokenizer = BasicTokenizer(do_lower_case=False)
    # One pool for the whole run; both the pool and the output file are released
    # even if a batch fails.
    with open(output_file, 'w') as fout, multiprocessing.Pool(processes=workers) as pool:
        for cased_lines in tqdm(chunk_method(fpaths_list, chunk_size), total=total//chunk_size):
            # Ceiling division so that `workers` slices always cover the batch.
            size = (len(cased_lines) + workers - 1) // workers
            pending = []
            for i in range(workers):
                start = i * size
                if start >= len(cased_lines):
                    # Remaining slices would be empty and contribute no output.
                    break
                pending.append(
                    pool.apply_async(tokenize, args = (cased_lines[start:start+size], tokenizer, basic_tokenizer, i, start))
                )
            # Collect in submission order; get() propagates worker errors.
            for job in pending:
                _, lines, _ = job.get()
                for line in lines:
                    fout.write(' '.join(line) + '\n')

In [3]:
# Tokenize every shard pair into its own corpus file, skipping shards already done.
for file_idx, fpaths in enumerate(zip(ranked_data_files_paths,ranked_entAndsent_files_paths)):
    fdata_file_path,fentAndsent_path = fpaths
    assert(fdata_file_path.split("/")[-1].split("_")[0]==fentAndsent_path.split("/")[-1].split("_")[0]==str(file_idx))
    save_corpus_file = f"0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/0_corpus_content-only_files/{file_idx}_corpus.txt"
    if os.path.exists(save_corpus_file):
        print(f"Have finished {file_idx}th processing.")
        print("-----------------------------------------------------------------")
        continue
    print(f"Start processing {file_idx}th data.")
    # Write under a temporary name and rename on success, so that an interrupted
    # run does not leave a partial file that the existence check above would
    # later treat as finished.
    partial_corpus_file = save_corpus_file + ".tmp"
    process(fpaths, get_chunks, partial_corpus_file, workers = multi_process_num)
    os.replace(partial_corpus_file, save_corpus_file)
    print("-----------------------------------------------------------------")

The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


Start processing 0th data.


  0%|                                     | 17/10000 [08:15<80:45:13, 29.12s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 1th data.


  0%|                                     | 17/10000 [08:42<85:18:07, 30.76s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 2th data.


  0%|                                     | 17/10000 [08:47<86:02:18, 31.03s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 3th data.


  0%|                                     | 17/10000 [08:52<86:55:10, 31.34s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 4th data.


  0%|                                     | 17/10000 [08:44<85:33:25, 30.85s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 5th data.


  0%|                                     | 17/10000 [08:26<82:42:06, 29.82s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 6th data.


  0%|                                     | 17/10000 [08:28<82:53:48, 29.89s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 7th data.


  0%|                                     | 17/10000 [08:27<82:45:30, 29.84s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 8th data.


  0%|                                     | 17/10000 [08:36<84:10:21, 30.35s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 9th data.


  0%|                                     | 17/10000 [08:44<85:36:31, 30.87s/it]

-----------------------------------------------------------------





## BPE Tokenization (Title&Content, Sent-per-line)

In [1]:
import os
import sys
import glob
import pickle
import iso8601
import pandas  as pd
from tqdm import tqdm
import multiprocessing
import pyarrow.feather as feather
from pytorch_pretrained_bert import BertTokenizer, BasicTokenizer
from babel.dates import format_date, format_datetime, format_time
multi_process_num = multiprocessing.cpu_count()

# Collect the filtered-data, sentence/entity-index and title-index shards.
# NOTE: sorted() orders the paths lexicographically, which agrees with numeric shard
# order only while the numeric file-name prefixes are single digits; the check below
# fails loudly if that stops being true.
ranked_data_files_paths = sorted(glob.glob("./0_Corpus/0_NYT_Data_Extraction/1_NYT_FilteredData_Files/*.feather"))
ranked_entAndsent_files_paths = sorted(glob.glob("0_Corpus/0_NYT_Data_Extraction/2_EntInfor_AND_SentIdxInfor_Files/*.feather"))
ranked_titleinfor_files_paths = sorted(glob.glob("./0_Corpus/0_NYT_Data_Extraction/3_EntInfor_AND_SentIdxInfor-Titles_Files/*.feather"))

# zip() would silently drop unmatched shards, so compare the counts first.
shard_counts = (
    len(ranked_data_files_paths),
    len(ranked_entAndsent_files_paths),
    len(ranked_titleinfor_files_paths),
)
if len(set(shard_counts)) != 1:
    raise ValueError(f"Shard count mismatch (data, sentence-info, title-info): {shard_counts}")

# Every triple must carry the same leading index, equal to its position in the sorted list.
for file_idx, fpaths in enumerate(zip(ranked_data_files_paths,ranked_entAndsent_files_paths,ranked_titleinfor_files_paths)):
    shard_prefixes = [os.path.basename(fpath).split("_")[0] for fpath in fpaths]
    if any(prefix != str(file_idx) for prefix in shard_prefixes):
        raise ValueError(f"Shard index mismatch at position {file_idx}: {fpaths!r}")
print(len(ranked_data_files_paths),len(ranked_entAndsent_files_paths),len(ranked_titleinfor_files_paths))

10 10 10


In [2]:
def tokenize(cased_lines, tokenizer, basic_tokenizer, worker_id, batch_offset):
    """Turn text lines into lists of sub-word tokens.

    A line is split with ``basic_tokenizer.tokenize``; every token from that
    split is split further with ``tokenizer.tokenize`` and the pieces are
    collected in order.

    Args:
        cased_lines: iterable of text lines.
        tokenizer: provides ``tokenize(token)`` returning sub-tokens.
        basic_tokenizer: provides ``tokenize(line)`` returning tokens.
        worker_id: returned as-is.
        batch_offset: returned as-is.

    Returns:
        Tuple ``(worker_id, sentences, batch_offset)``. Lines that yield no
        tokens are omitted from ``sentences``, except the exact empty string,
        which is represented by an empty list.
    """

    def _wordpieces(text):
        # Flatten the sub-tokens of every basic token of one line.
        collected = []
        for word in basic_tokenizer.tokenize(text):
            collected.extend(tokenizer.tokenize(word))
        return collected

    kept = []
    for text in cased_lines:
        pieces = _wordpieces(text)
        if not pieces and text != "":
            # Non-empty line that tokenizes to nothing: drop it.
            continue
        kept.append(pieces)
    return worker_id, kept, batch_offset

def get_chunks(fpaths_list, chunk_size):
    """Yield batches of title and body sentence strings from one shard triple.

    The data, sentence-info and title-info files are loaded and inner-joined
    on ``'ID'``. For every document the title sentences come first (slices of
    ``'title_text'`` given by ``'title_sent_infor'``), then the body sentences
    (slices of ``'maintext'`` given by ``'sent_infor'``). Each span is parsed
    as ``"<begin>_X_<end>"``. Stripped, non-empty sentences are appended to the
    current batch, followed by an empty string that marks the end of the
    document.

    Args:
        fpaths_list: ``(data_file_path, sentinfor_file_path, titleinfor_file_path)``.
        chunk_size: number of documents per yielded batch.

    Yields:
        Lists of strings covering ``chunk_size`` documents each; the last list
        may cover fewer. An empty trailing batch is never yielded.

    Raises:
        AssertionError: if the joins drop or duplicate documents.
    """
    data_file_path, sentinfor_file_path, titleinfor_file_path = fpaths_list
    filtered_pd = feather.read_feather(data_file_path)
    filtered_pd = filtered_pd.rename(columns={'file_id': 'ID', 'pub': 'date_publish', 'body_text': 'maintext'})
    sentinfor_pd = feather.read_feather(sentinfor_file_path)
    title_pd = feather.read_feather(titleinfor_file_path)
    # Prefix the title columns so they do not collide with the body columns after the join.
    title_pd = title_pd.rename(columns={'ent_infor': 'title_ent_infor', 'sent_infor': 'title_sent_infor', 'token_num': 'title_token_num'})

    combined_pd = pd.merge(filtered_pd, sentinfor_pd, on='ID')
    combined_pd = pd.merge(combined_pd, title_pd, on='ID')
    # The inner joins must neither drop nor duplicate documents.
    assert len(filtered_pd)==len(sentinfor_pd)==len(combined_pd)

    def _sentences(text, span_list):
        # Each span is encoded as "<begin>_X_<end>" offsets into text.
        for span in span_list:
            span_beg, span_end = map(int, span.split("_X_"))
            yield text[span_beg:span_end]

    chunk = []
    doc_chunk_num = 0
    # Iterate the needed columns directly; iterrows() builds a Series per row.
    doc_columns = zip(
        combined_pd["title_text"],
        combined_pd["title_sent_infor"],
        combined_pd["maintext"],
        combined_pd["sent_infor"],
    )
    for title_text, title_sent_infor, main_text, sent_infor_list in doc_columns:
        # Title sentences first, then body sentences.
        for text, span_list in ((title_text, title_sent_infor), (main_text, sent_infor_list)):
            for sent_text in _sentences(text, span_list):
                line_text = sent_text.strip()
                if line_text:
                    chunk.append(line_text)
        # Empty string marks the document boundary.
        chunk.append("")
        doc_chunk_num += 1
        if doc_chunk_num == chunk_size:
            yield chunk
            chunk = []
            doc_chunk_num = 0
    # Skip the empty remainder left when the document count is a multiple of chunk_size.
    if chunk:
        yield chunk
    
def process(fpaths_list, chunk_method, output_file, bert_model_type='bert-base-cased', total=100000000, chunk_size=10000, workers=12):
    """Tokenize every batch produced by ``chunk_method`` and write it to ``output_file``.

    Each batch is cut into up to ``workers`` contiguous slices that are
    tokenized in parallel by ``tokenize``; the results are written in slice
    order, one sentence per line with tokens joined by single spaces. An empty
    token list is written as a blank line.

    Args:
        fpaths_list: passed unchanged to ``chunk_method``.
        chunk_method: callable ``(fpaths_list, chunk_size)`` yielding lists of
            line strings. Earlier versions ignored this argument and always
            called the module-level ``get_chunks``.
        output_file: path opened in text mode ``'w'``.
        bert_model_type: name passed to ``BertTokenizer.from_pretrained``.
        total: only used for the progress bar, whose total is ``total // chunk_size``.
        chunk_size: passed to ``chunk_method``.
        workers: number of slices per batch and size of the process pool.

    Raises:
        Any exception raised inside a worker is re-raised here instead of
        being silently dropped.
    """
    tokenizer = BertTokenizer.from_pretrained(bert_model_type)
    basic_tokenizer = BasicTokenizer(do_lower_case=False)
    # One pool for the whole run; both the pool and the output file are released
    # even if a batch fails.
    with open(output_file, 'w') as fout, multiprocessing.Pool(processes=workers) as pool:
        for cased_lines in tqdm(chunk_method(fpaths_list, chunk_size), total=total//chunk_size):
            # Ceiling division so that `workers` slices always cover the batch.
            size = (len(cased_lines) + workers - 1) // workers
            pending = []
            for i in range(workers):
                start = i * size
                if start >= len(cased_lines):
                    # Remaining slices would be empty and contribute no output.
                    break
                pending.append(
                    pool.apply_async(tokenize, args = (cased_lines[start:start+size], tokenizer, basic_tokenizer, i, start))
                )
            # Collect in submission order; get() propagates worker errors.
            for job in pending:
                _, lines, _ = job.get()
                for line in lines:
                    fout.write(' '.join(line) + '\n')
        

In [3]:
# Tokenize every shard triple into its own corpus file, skipping shards already done.
for file_idx, fpaths in enumerate(zip(ranked_data_files_paths,ranked_entAndsent_files_paths,ranked_titleinfor_files_paths)):
    fdata_file_path,fentAndsent_path,ftitleinfor_path = fpaths
    assert(fdata_file_path.split("/")[-1].split("_")[0]==fentAndsent_path.split("/")[-1].split("_")[0]==ftitleinfor_path.split("/")[-1].split("_")[0]==str(file_idx))
    save_corpus_file = f"0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/1_corpus_titlecontent_files/{file_idx}_corpus.txt"
    if os.path.exists(save_corpus_file):
        print(f"Have finished {file_idx}th processing.")
        print("-----------------------------------------------------------------")
        continue
    print(f"Start processing {file_idx}th data.")
    # Write under a temporary name and rename on success, so that an interrupted
    # run does not leave a partial file that the existence check above would
    # later treat as finished.
    partial_corpus_file = save_corpus_file + ".tmp"
    process(fpaths, get_chunks, partial_corpus_file, workers = multi_process_num)
    os.replace(partial_corpus_file, save_corpus_file)
    print("-----------------------------------------------------------------")

The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


Start processing 0th data.


  0%|                                     | 17/10000 [08:45<85:42:24, 30.91s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 1th data.


  0%|                                     | 17/10000 [08:57<87:40:37, 31.62s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 2th data.


  0%|                                     | 17/10000 [08:44<85:30:36, 30.84s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 3th data.


  0%|                                     | 17/10000 [08:44<85:37:03, 30.87s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 4th data.


  0%|                                     | 17/10000 [08:37<84:21:18, 30.42s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 5th data.


  0%|                                     | 17/10000 [08:39<84:42:48, 30.55s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 6th data.


  0%|                                     | 17/10000 [08:41<85:01:15, 30.66s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 7th data.


  0%|                                     | 17/10000 [08:39<84:41:30, 30.54s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 8th data.


  0%|                                     | 17/10000 [08:43<85:23:48, 30.80s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 9th data.


  0%|                                     | 17/10000 [08:52<86:51:02, 31.32s/it]

-----------------------------------------------------------------





## BPE Tokenization (Title&Content&ID, Sent-per-line, ID is appended as last sentence)

In [1]:
import os
import sys
import glob
import pickle
import iso8601
import pandas  as pd
from tqdm import tqdm
import multiprocessing
import pyarrow.feather as feather
from pytorch_pretrained_bert import BertTokenizer, BasicTokenizer
from babel.dates import format_date, format_datetime, format_time
multi_process_num = multiprocessing.cpu_count()

# Collect the filtered-data, sentence/entity-index and title-index shards.
# NOTE: sorted() orders the paths lexicographically, which agrees with numeric shard
# order only while the numeric file-name prefixes are single digits; the check below
# fails loudly if that stops being true.
ranked_data_files_paths = sorted(glob.glob("./0_Corpus/0_NYT_Data_Extraction/1_NYT_FilteredData_Files/*.feather"))
ranked_entAndsent_files_paths = sorted(glob.glob("0_Corpus/0_NYT_Data_Extraction/2_EntInfor_AND_SentIdxInfor_Files/*.feather"))
ranked_titleinfor_files_paths = sorted(glob.glob("./0_Corpus/0_NYT_Data_Extraction/3_EntInfor_AND_SentIdxInfor-Titles_Files/*.feather"))

# zip() would silently drop unmatched shards, so compare the counts first.
shard_counts = (
    len(ranked_data_files_paths),
    len(ranked_entAndsent_files_paths),
    len(ranked_titleinfor_files_paths),
)
if len(set(shard_counts)) != 1:
    raise ValueError(f"Shard count mismatch (data, sentence-info, title-info): {shard_counts}")

# Every triple must carry the same leading index, equal to its position in the sorted list.
for file_idx, fpaths in enumerate(zip(ranked_data_files_paths,ranked_entAndsent_files_paths,ranked_titleinfor_files_paths)):
    shard_prefixes = [os.path.basename(fpath).split("_")[0] for fpath in fpaths]
    if any(prefix != str(file_idx) for prefix in shard_prefixes):
        raise ValueError(f"Shard index mismatch at position {file_idx}: {fpaths!r}")
print(len(ranked_data_files_paths),len(ranked_entAndsent_files_paths),len(ranked_titleinfor_files_paths))

10 10 10


In [2]:
def tokenize(cased_lines, tokenizer, basic_tokenizer, worker_id, batch_offset):
    """Produce one sub-word token list per input line.

    ``basic_tokenizer.tokenize`` splits the line into tokens and
    ``tokenizer.tokenize`` splits each of those into sub-tokens, which are
    concatenated in order.

    Args:
        cased_lines: iterable of text lines.
        tokenizer: object exposing ``tokenize(token)``.
        basic_tokenizer: object exposing ``tokenize(line)``.
        worker_id: echoed back in the result.
        batch_offset: echoed back in the result.

    Returns:
        ``(worker_id, sentences, batch_offset)``. A line with no resulting
        tokens is left out, except for the exact empty string, which
        contributes an empty list.
    """
    sentences = []
    for source_line in cased_lines:
        wordpieces = []
        for basic_token in basic_tokenizer.tokenize(source_line):
            wordpieces.extend(tokenizer.tokenize(basic_token))
        is_blank_marker = source_line == ""
        if wordpieces or is_blank_marker:
            sentences.append(wordpieces)
    return worker_id, sentences, batch_offset

def get_chunks(fpaths_list, chunk_size):
    """Yield batches of title, body and document-ID lines from one shard triple.

    The data, sentence-info and title-info files are loaded and inner-joined
    on ``'ID'``. For every document the title sentences come first (slices of
    ``'title_text'`` given by ``'title_sent_infor'``), then the body sentences
    (slices of ``'maintext'`` given by ``'sent_infor'``), then the document's
    ``'ID'`` value as a final line. Each span is parsed as
    ``"<begin>_X_<end>"``. Stripped, non-empty lines are appended to the
    current batch, followed by an empty string that marks the end of the
    document.

    Args:
        fpaths_list: ``(data_file_path, sentinfor_file_path, titleinfor_file_path)``.
        chunk_size: number of documents per yielded batch.

    Yields:
        Lists of strings covering ``chunk_size`` documents each; the last list
        may cover fewer. An empty trailing batch is never yielded.

    Raises:
        AssertionError: if the joins drop or duplicate documents.
    """
    data_file_path, sentinfor_file_path, titleinfor_file_path = fpaths_list
    filtered_pd = feather.read_feather(data_file_path)
    filtered_pd = filtered_pd.rename(columns={'file_id': 'ID', 'pub': 'date_publish', 'body_text': 'maintext'})
    sentinfor_pd = feather.read_feather(sentinfor_file_path)
    title_pd = feather.read_feather(titleinfor_file_path)
    # Prefix the title columns so they do not collide with the body columns after the join.
    title_pd = title_pd.rename(columns={'ent_infor': 'title_ent_infor', 'sent_infor': 'title_sent_infor', 'token_num': 'title_token_num'})

    combined_pd = pd.merge(filtered_pd, sentinfor_pd, on='ID')
    combined_pd = pd.merge(combined_pd, title_pd, on='ID')
    # The inner joins must neither drop nor duplicate documents.
    assert len(filtered_pd)==len(sentinfor_pd)==len(combined_pd)

    def _sentences(text, span_list):
        # Each span is encoded as "<begin>_X_<end>" offsets into text.
        for span in span_list:
            span_beg, span_end = map(int, span.split("_X_"))
            yield text[span_beg:span_end]

    chunk = []
    doc_chunk_num = 0
    # Iterate the needed columns directly; iterrows() builds a Series per row.
    doc_columns = zip(
        combined_pd["ID"],
        combined_pd["title_text"],
        combined_pd["title_sent_infor"],
        combined_pd["maintext"],
        combined_pd["sent_infor"],
    )
    for docid, title_text, title_sent_infor, main_text, sent_infor_list in doc_columns:
        # Title sentences first, then body sentences.
        sent_text_list = list(_sentences(title_text, title_sent_infor))
        sent_text_list.extend(_sentences(main_text, sent_infor_list))
        # The document ID goes last, as its own line.
        sent_text_list.append(docid)
        for sent_text in sent_text_list:
            line_text = sent_text.strip()
            if line_text:
                chunk.append(line_text)
        # Empty string marks the document boundary.
        chunk.append("")
        doc_chunk_num += 1
        if doc_chunk_num == chunk_size:
            yield chunk
            chunk = []
            doc_chunk_num = 0
    # Skip the empty remainder left when the document count is a multiple of chunk_size.
    if chunk:
        yield chunk
    
def process(fpaths_list, chunk_method, output_file, bert_model_type='bert-base-cased', total=100000000, chunk_size=10000, workers=12):
    """Tokenize every batch produced by ``chunk_method`` and write it to ``output_file``.

    Each batch is cut into up to ``workers`` contiguous slices that are
    tokenized in parallel by ``tokenize``; the results are written in slice
    order, one sentence per line with tokens joined by single spaces. An empty
    token list is written as a blank line.

    Args:
        fpaths_list: passed unchanged to ``chunk_method``.
        chunk_method: callable ``(fpaths_list, chunk_size)`` yielding lists of
            line strings. Earlier versions ignored this argument and always
            called the module-level ``get_chunks``.
        output_file: path opened in text mode ``'w'``.
        bert_model_type: name passed to ``BertTokenizer.from_pretrained``.
        total: only used for the progress bar, whose total is ``total // chunk_size``.
        chunk_size: passed to ``chunk_method``.
        workers: number of slices per batch and size of the process pool.

    Raises:
        Any exception raised inside a worker is re-raised here instead of
        being silently dropped.
    """
    tokenizer = BertTokenizer.from_pretrained(bert_model_type)
    basic_tokenizer = BasicTokenizer(do_lower_case=False)
    # One pool for the whole run; both the pool and the output file are released
    # even if a batch fails.
    with open(output_file, 'w') as fout, multiprocessing.Pool(processes=workers) as pool:
        for cased_lines in tqdm(chunk_method(fpaths_list, chunk_size), total=total//chunk_size):
            # Ceiling division so that `workers` slices always cover the batch.
            size = (len(cased_lines) + workers - 1) // workers
            pending = []
            for i in range(workers):
                start = i * size
                if start >= len(cased_lines):
                    # Remaining slices would be empty and contribute no output.
                    break
                pending.append(
                    pool.apply_async(tokenize, args = (cased_lines[start:start+size], tokenizer, basic_tokenizer, i, start))
                )
            # Collect in submission order; get() propagates worker errors.
            for job in pending:
                _, lines, _ = job.get()
                for line in lines:
                    fout.write(' '.join(line) + '\n')

In [3]:
# Tokenize every shard triple into its own corpus file, skipping shards already done.
for file_idx, fpaths in enumerate(zip(ranked_data_files_paths,ranked_entAndsent_files_paths,ranked_titleinfor_files_paths)):
    fdata_file_path,fentAndsent_path,ftitleinfor_path = fpaths
    assert(fdata_file_path.split("/")[-1].split("_")[0]==fentAndsent_path.split("/")[-1].split("_")[0]==ftitleinfor_path.split("/")[-1].split("_")[0]==str(file_idx))
    save_corpus_file = f"0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/2_corpus_titlecontent_withdocid_files/{file_idx}_corpus.txt"
    if os.path.exists(save_corpus_file):
        print(f"Have finished {file_idx}th processing.")
        print("-----------------------------------------------------------------")
        continue
    print(f"Start processing {file_idx}th data.")
    # Write under a temporary name and rename on success, so that an interrupted
    # run does not leave a partial file that the existence check above would
    # later treat as finished.
    partial_corpus_file = save_corpus_file + ".tmp"
    process(fpaths, get_chunks, partial_corpus_file, workers = multi_process_num)
    os.replace(partial_corpus_file, save_corpus_file)
    print("-----------------------------------------------------------------")

The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


Start processing 0th data.


  0%|                                     | 17/10000 [08:21<81:46:14, 29.49s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 1th data.


  0%|                                     | 17/10000 [08:49<86:26:49, 31.17s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 2th data.


  0%|                                     | 17/10000 [09:06<89:11:09, 32.16s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 3th data.


  0%|                                     | 17/10000 [08:53<86:57:15, 31.36s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 4th data.


  0%|                                     | 17/10000 [08:48<86:11:56, 31.08s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 5th data.


  0%|                                     | 17/10000 [08:47<86:00:22, 31.01s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 6th data.


  0%|                                     | 17/10000 [08:47<86:07:14, 31.06s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 7th data.


  0%|                                     | 17/10000 [08:47<86:03:39, 31.03s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 8th data.


  0%|                                     | 17/10000 [08:52<86:50:04, 31.31s/it]
The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


-----------------------------------------------------------------
Start processing 9th data.


  0%|                                     | 17/10000 [08:48<86:12:24, 31.09s/it]

-----------------------------------------------------------------





# Tokenize Pretraining Corpus (Train-Valid-Test Split)

## Content

In [1]:
import os
import glob

# Sorted list of the content-only corpus shards produced by the BPE step.
ranked_corpus_files_paths = sorted(
    glob.glob("0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/0_corpus_content-only_files/*.txt")
)
print(len(ranked_corpus_files_paths))

# The first nine shards form the training set; whatever follows is split into val/test later.
train_corpus_files_list, valtest_corpus_files_list = (
    ranked_corpus_files_paths[:9],
    ranked_corpus_files_paths[9:],
)
print(len(train_corpus_files_list), len(valtest_corpus_files_list))

10
9 1


In [2]:
# Concatenate all training shards into one file, reporting the number of
# documents (blank lines) copied from each shard.
train_corpus_fout = "0_Corpus/1_Pretraining_Preprocessing/1_Tokenize_Pretraining_Corpus/0_corpus_content-only/0_corpus_train.txt"
# Context managers guarantee the output is flushed and every handle is closed.
with open(train_corpus_fout, 'w') as fout:
    for train_file_paths in train_corpus_files_list:
        doc_num = 0
        print(train_file_paths,end=": ")
        with open(train_file_paths) as corpus_file:
            for line in corpus_file:
                # A blank line terminates one document.
                if line=="\n":
                    doc_num+=1
                fout.write(line)
        print(doc_num)

0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/0_corpus_content-only_files/0_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/0_corpus_content-only_files/1_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/0_corpus_content-only_files/2_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/0_corpus_content-only_files/3_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/0_corpus_content-only_files/4_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/0_corpus_content-only_files/5_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/0_corpus_content-only_files/6_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/0_corpus_content-only_files/7_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/0_corpus_content-only_files/8_corpus.txt: 165234


In [3]:
# Split the held-out shard(s) into a validation half and a test half at a
# document boundary.
val_corpus_fout = "0_Corpus/1_Pretraining_Preprocessing/1_Tokenize_Pretraining_Corpus/0_corpus_content-only/0_corpus_val.txt"
test_corpus_fout = "0_Corpus/1_Pretraining_Preprocessing/1_Tokenize_Pretraining_Corpus/0_corpus_content-only/0_corpus_test.txt"

# First pass: count documents (each one is terminated by a blank line).
total_doc_num = 0
for valtest_file_paths in valtest_corpus_files_list:
    print(valtest_file_paths)
    with open(valtest_file_paths) as corpus_file:
        for line in corpus_file:
            if line=="\n":
                total_doc_num+=1
print("total_doc_num:", total_doc_num)

# Second pass: the first total_doc_num//2 documents go to val, the rest to test.
val_doc_limit = total_doc_num//2
split_doc_num = 0  # documents fully written so far
with open(val_corpus_fout, 'w') as val_fout, open(test_corpus_fout, 'w') as test_fout:
    for valtest_file_paths in valtest_corpus_files_list:
        print(valtest_file_paths)
        val_doc_num,test_doc_num = 0,0
        with open(valtest_file_paths) as corpus_file:
            for line in corpus_file:
                # Choose the target from the number of COMPLETED documents, before
                # counting this line. Counting first sent the content lines of
                # document val_doc_limit+1 to val while its terminating blank line
                # went to test.
                goes_to_val = split_doc_num < val_doc_limit
                if goes_to_val:
                    val_fout.write(line)
                else:
                    test_fout.write(line)
                if line=="\n":
                    split_doc_num+=1
                    if goes_to_val:
                        val_doc_num+=1
                    else:
                        test_doc_num+=1
        print("val_doc_num:", val_doc_num)
        print("test_doc_num:", test_doc_num)

0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/0_corpus_content-only_files/9_corpus.txt
total_doc_num: 165241
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/0_corpus_content-only_files/9_corpus.txt
val_doc_num: 82620
test_doc_num: 82621


### Check

In [1]:
# Re-count the documents (blank lines) in the merged training file and print
# the count next to the expected value, 9 shards x 165234 documents.
train_corpus_fout = "0_Corpus/1_Pretraining_Preprocessing/1_Tokenize_Pretraining_Corpus/0_corpus_content-only/0_corpus_train.txt"
# The context manager closes the handle once counting is done.
with open(train_corpus_fout) as corpus_file:
    doc_num1 = sum(1 for line in corpus_file if line=="\n")
print(doc_num1, 165234*9)

1487106 1487106


In [2]:
# Re-count the documents (blank lines) in the validation file and print the
# count next to the expected value, half of the 165241 held-out documents.
val_corpus_fout = "0_Corpus/1_Pretraining_Preprocessing/1_Tokenize_Pretraining_Corpus/0_corpus_content-only/0_corpus_val.txt"
# The context manager closes the handle once counting is done.
with open(val_corpus_fout) as corpus_file:
    doc_num2 = sum(1 for line in corpus_file if line=="\n")
print(doc_num2, 165241//2)

82620 82620


In [3]:
# Re-count the documents (blank lines) in the test file and print the count
# next to the expected value, the remainder of the 165241 held-out documents.
test_corpus_fout = "0_Corpus/1_Pretraining_Preprocessing/1_Tokenize_Pretraining_Corpus/0_corpus_content-only/0_corpus_test.txt"
# The context manager closes the handle once counting is done.
with open(test_corpus_fout) as corpus_file:
    doc_num3 = sum(1 for line in corpus_file if line=="\n")
print(doc_num3, 165241-165241//2)

82621 82621


In [4]:
# Sum of the three split counts, and whether it equals the hard-coded total 1652347.
print(
    sum((doc_num1, doc_num2, doc_num3)),
    sum((doc_num1, doc_num2, doc_num3)) == 1652347,
)

1652347 True


## Title&Content

In [1]:
import os
import glob

# Collect the title+content corpus files, ordered by path.
ranked_corpus_files_paths = sorted(
    glob.glob("0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/1_corpus_titlecontent_files/*.txt")
)
print(len(ranked_corpus_files_paths))

# The first nine files feed the training set; whatever follows is held out
# to be split into validation and test.
train_corpus_files_list, valtest_corpus_files_list = (
    ranked_corpus_files_paths[:9],
    ranked_corpus_files_paths[9:],
)
print(len(train_corpus_files_list), len(valtest_corpus_files_list))

10
9 1


In [2]:
train_corpus_fout = "0_Corpus/1_Pretraining_Preprocessing/1_Tokenize_Pretraining_Corpus/1_corpus_titlecontent/0_corpus_train.txt"
# Concatenate every training file into one corpus file, printing
# "<path>: <document count>" for each input. A blank line closes one document.
# Both the output and each input are opened with `with`, so the output is
# flushed and closed before later cells re-read it and no input handle leaks.
with open(train_corpus_fout, 'w') as fout:
    for train_file_paths in train_corpus_files_list:
        doc_num = 0
        print(train_file_paths,end=": ")
        with open(train_file_paths) as corpus_file:
            for line in corpus_file:
                if line=="\n":
                    doc_num+=1
                fout.write(line)
        print(doc_num)

0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/1_corpus_titlecontent_files/0_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/1_corpus_titlecontent_files/1_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/1_corpus_titlecontent_files/2_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/1_corpus_titlecontent_files/3_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/1_corpus_titlecontent_files/4_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/1_corpus_titlecontent_files/5_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/1_corpus_titlecontent_files/6_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/1_corpus_titlecontent_files/7_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/1_corpus_titlecontent_files/8_corpus.txt: 165234


In [3]:
val_corpus_fout = "0_Corpus/1_Pretraining_Preprocessing/1_Tokenize_Pretraining_Corpus/1_corpus_titlecontent/0_corpus_val.txt"
test_corpus_fout = "0_Corpus/1_Pretraining_Preprocessing/1_Tokenize_Pretraining_Corpus/1_corpus_titlecontent/0_corpus_test.txt"

# First pass: count the held-out documents (every blank line closes one document).
total_doc_num = 0
for valtest_file_paths in valtest_corpus_files_list:
    print(valtest_file_paths)
    with open(valtest_file_paths) as corpus_file:
        total_doc_num += sum(1 for line in corpus_file if line=="\n")
print("total_doc_num:", total_doc_num)

# Second pass: the first total_doc_num//2 documents go to validation,
# everything after them to test. The output files are opened with `with` so
# they are flushed and closed before any later cell re-reads them.
val_doc_limit = total_doc_num//2
split_doc_num = 0  # documents completely written so far, across all input files
with open(val_corpus_fout, 'w') as val_fout:
    with open(test_corpus_fout, 'w') as test_fout:
        for valtest_file_paths in valtest_corpus_files_list:
            print(valtest_file_paths)
            val_doc_num,test_doc_num = 0,0  # per-file counters
            with open(valtest_file_paths) as corpus_file:
                for line in corpus_file:
                    # Choose the destination BEFORE counting the closing blank
                    # line. Counting first (with `<=`) sent the body of document
                    # val_doc_limit+1 to validation and its blank line to test,
                    # so validation ended with an unterminated document and
                    # test began with an empty one.
                    goes_to_val = split_doc_num<val_doc_limit
                    if goes_to_val:
                        val_fout.write(line)
                    else:
                        test_fout.write(line)
                    if line=="\n":
                        split_doc_num+=1
                        if goes_to_val:
                            val_doc_num+=1
                        else:
                            test_doc_num+=1
            print("val_doc_num:", val_doc_num)
            print("test_doc_num:", test_doc_num)

0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/1_corpus_titlecontent_files/9_corpus.txt
total_doc_num: 165241
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/1_corpus_titlecontent_files/9_corpus.txt
val_doc_num: 82620
test_doc_num: 82621


### Check

In [1]:
train_corpus_fout = "0_Corpus/1_Pretraining_Preprocessing/1_Tokenize_Pretraining_Corpus/1_corpus_titlecontent/0_corpus_train.txt"
# Count the documents in the training corpus: every blank line closes one document.
# `with` closes the handle, which the earlier open()-without-close never did.
with open(train_corpus_fout) as corpus_file:
    doc_num1 = sum(1 for line in corpus_file if line=="\n")
# The second value is the expected count: 9 training files of 165234 documents each.
print(doc_num1, 165234*9)

1487106 1487106


In [2]:
val_corpus_fout = "0_Corpus/1_Pretraining_Preprocessing/1_Tokenize_Pretraining_Corpus/1_corpus_titlecontent/0_corpus_val.txt"
# Count the documents in the validation corpus: every blank line closes one document.
# `with` closes the handle, which the earlier open()-without-close never did.
with open(val_corpus_fout) as corpus_file:
    doc_num2 = sum(1 for line in corpus_file if line=="\n")
# The second value is the expected count: half (rounded down) of the 165241 held-out documents.
print(doc_num2, 165241//2)

82620 82620


In [3]:
test_corpus_fout = "0_Corpus/1_Pretraining_Preprocessing/1_Tokenize_Pretraining_Corpus/1_corpus_titlecontent/0_corpus_test.txt"
# Count the documents in the test corpus: every blank line closes one document.
# `with` closes the handle, which the earlier open()-without-close never did.
with open(test_corpus_fout) as corpus_file:
    doc_num3 = sum(1 for line in corpus_file if line=="\n")
# The second value is the expected count: whatever remains of the 165241
# held-out documents after the validation half is taken.
print(doc_num3, 165241-165241//2)

82621 82621


In [4]:
# Grand total over the train, validation and test counts, followed by a flag
# telling whether that total equals the expected 1652347 documents.
print(
    sum((doc_num1, doc_num2, doc_num3)),
    sum((doc_num1, doc_num2, doc_num3)) == 1652347,
)

1652347 True


## Title&Content&ID (the ID is appended as the last sentence)

In [1]:
import os
import glob

# Collect the title+content+doc-id corpus files, ordered by path.
ranked_corpus_files_paths = sorted(
    glob.glob("0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/2_corpus_titlecontent_withdocid_files/*.txt")
)
print(len(ranked_corpus_files_paths))

# The first nine files feed the training set; whatever follows is held out
# to be split into validation and test.
train_corpus_files_list, valtest_corpus_files_list = (
    ranked_corpus_files_paths[:9],
    ranked_corpus_files_paths[9:],
)
print(len(train_corpus_files_list), len(valtest_corpus_files_list))

10
9 1


In [2]:
train_corpus_fout = "0_Corpus/1_Pretraining_Preprocessing/1_Tokenize_Pretraining_Corpus/2_corpus_titlecontent_withdocid/0_corpus_train.txt"
# Concatenate every training file into one corpus file, printing
# "<path>: <document count>" for each input. A blank line closes one document.
# Both the output and each input are opened with `with`, so the output is
# flushed and closed before later cells re-read it and no input handle leaks.
with open(train_corpus_fout, 'w') as fout:
    for train_file_paths in train_corpus_files_list:
        doc_num = 0
        print(train_file_paths,end=": ")
        with open(train_file_paths) as corpus_file:
            for line in corpus_file:
                if line=="\n":
                    doc_num+=1
                fout.write(line)
        print(doc_num)

0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/2_corpus_titlecontent_withdocid_files/0_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/2_corpus_titlecontent_withdocid_files/1_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/2_corpus_titlecontent_withdocid_files/2_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/2_corpus_titlecontent_withdocid_files/3_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/2_corpus_titlecontent_withdocid_files/4_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/2_corpus_titlecontent_withdocid_files/5_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/2_corpus_titlecontent_withdocid_files/6_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/2_corpus_titlecontent_withdocid_files/7_corpus.txt: 165234
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/2_corpus_titlecontent_wi

In [4]:
val_corpus_fout = "0_Corpus/1_Pretraining_Preprocessing/1_Tokenize_Pretraining_Corpus/2_corpus_titlecontent_withdocid/0_corpus_val.txt"
test_corpus_fout = "0_Corpus/1_Pretraining_Preprocessing/1_Tokenize_Pretraining_Corpus/2_corpus_titlecontent_withdocid/0_corpus_test.txt"

# First pass: count the held-out documents (every blank line closes one document).
total_doc_num = 0
for valtest_file_paths in valtest_corpus_files_list:
    print(valtest_file_paths)
    with open(valtest_file_paths) as corpus_file:
        total_doc_num += sum(1 for line in corpus_file if line=="\n")
print("total_doc_num:", total_doc_num)

# Second pass: the first total_doc_num//2 documents go to validation,
# everything after them to test. The output files are opened with `with` so
# they are flushed and closed before any later cell re-reads them.
val_doc_limit = total_doc_num//2
split_doc_num = 0  # documents completely written so far, across all input files
with open(val_corpus_fout, 'w') as val_fout:
    with open(test_corpus_fout, 'w') as test_fout:
        for valtest_file_paths in valtest_corpus_files_list:
            print(valtest_file_paths)
            val_doc_num,test_doc_num = 0,0  # per-file counters
            with open(valtest_file_paths) as corpus_file:
                for line in corpus_file:
                    # Choose the destination BEFORE counting the closing blank
                    # line. Counting first (with `<=`) sent the body of document
                    # val_doc_limit+1 to validation and its blank line to test,
                    # so validation ended with an unterminated document and
                    # test began with an empty one.
                    goes_to_val = split_doc_num<val_doc_limit
                    if goes_to_val:
                        val_fout.write(line)
                    else:
                        test_fout.write(line)
                    if line=="\n":
                        split_doc_num+=1
                        if goes_to_val:
                            val_doc_num+=1
                        else:
                            test_doc_num+=1
            print("val_doc_num:", val_doc_num)
            print("test_doc_num:", test_doc_num)

0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/2_corpus_titlecontent_withdocid_files/9_corpus.txt
total_doc_num: 165241
0_Corpus/1_Pretraining_Preprocessing/0_BPE_Tokenization/2_corpus_titlecontent_withdocid_files/9_corpus.txt
val_doc_num: 82620
test_doc_num: 82621


### Check

In [1]:
train_corpus_fout = "0_Corpus/1_Pretraining_Preprocessing/1_Tokenize_Pretraining_Corpus/2_corpus_titlecontent_withdocid/0_corpus_train.txt"
# Count the documents in the training corpus: every blank line closes one document.
# `with` closes the handle, which the earlier open()-without-close never did.
with open(train_corpus_fout) as corpus_file:
    doc_num1 = sum(1 for line in corpus_file if line=="\n")
# The second value is the expected count: 9 training files of 165234 documents each.
print(doc_num1, 165234*9)

1487106 1487106


In [2]:
val_corpus_fout = "0_Corpus/1_Pretraining_Preprocessing/1_Tokenize_Pretraining_Corpus/2_corpus_titlecontent_withdocid/0_corpus_val.txt"
# Count the documents in the validation corpus: every blank line closes one document.
# `with` closes the handle, which the earlier open()-without-close never did.
with open(val_corpus_fout) as corpus_file:
    doc_num2 = sum(1 for line in corpus_file if line=="\n")
# The second value is the expected count: half (rounded down) of the 165241 held-out documents.
print(doc_num2, 165241//2)

82620 82620


In [3]:
test_corpus_fout = "0_Corpus/1_Pretraining_Preprocessing/1_Tokenize_Pretraining_Corpus/2_corpus_titlecontent_withdocid/0_corpus_test.txt"
# Count the documents in the test corpus: every blank line closes one document.
# `with` closes the handle, which the earlier open()-without-close never did.
with open(test_corpus_fout) as corpus_file:
    doc_num3 = sum(1 for line in corpus_file if line=="\n")
# The second value is the expected count: whatever remains of the 165241
# held-out documents after the validation half is taken.
print(doc_num3, 165241-165241//2)

82621 82621


In [4]:
# Grand total over the train, validation and test counts, followed by a flag
# telling whether that total equals the expected 1652347 documents.
print(
    sum((doc_num1, doc_num2, doc_num3)),
    sum((doc_num1, doc_num2, doc_num3)) == 1652347,
)

1652347 True
