# Temporal_Information_Extraction

## Temporal_Information_Extraction (maintext)

In [None]:
import os
import sys
import glob
import pickle
import time
import pandas  as pd
from tqdm import tqdm
import pyarrow.feather as feather
from pytorch_pretrained_bert import BertTokenizer, BasicTokenizer
from babel.dates import format_date, format_datetime, format_time
import warnings
warnings.filterwarnings("ignore")
import multiprocessing
import tqdm.notebook as tqdm
from tqdm.contrib.concurrent import thread_map
multi_process_num = multiprocessing.cpu_count()
from sutime import SUTime
sutime = SUTime(mark_time_ranges=True, include_range=True)

In [2]:
# Paths of all filtered NYT data files, in lexicographic order.
ranked_data_files_paths = sorted(
    glob.glob(f"./0_Corpus/0_NYT_Data_Extraction/1_NYT_FilteredData_Files/*.feather")
)

In [None]:
def sutime_results(pd_input):
    """Run SUTime on one article row.

    Args:
        pd_input: a ``(row_idx, row)`` pair, as yielded by
            ``DataFrame.iterrows()``. ``row`` must provide the keys
            ``"maintext"``, ``"date_publish"`` and ``"ID"``.

    Returns:
        A ``(docid, sutime_result)`` tuple, where ``sutime_result`` is
        whatever ``sutime.parse`` returns for the row's text.
    """
    row_idx, row = pd_input
    text = row["maintext"]
    raw_date = row["date_publish"]
    # Re-join the first eight characters with dashes
    # (presumably YYYYMMDD -> YYYY-MM-DD; confirm against the data).
    reference_date = "-".join((raw_date[:4], raw_date[4:6], raw_date[6:8]))
    docid = row["ID"]
    parsed = sutime.parse(text, reference_date=reference_date)
    # Lightweight progress marker every 1000 rows.
    if row_idx % 1000 == 0:
        print(row_idx, end="; ")
    return (docid, parsed)

# Run SUTime over the body text of every data file and write one pickle
# (docid -> SUTime result) per input file.
for file_idx, data_file_path in enumerate(ranked_data_files_paths):
    start_t = time.time()
    filtered_pd = feather.read_feather(data_file_path)
    print(data_file_path,len(filtered_pd))
    # Rename to the column names that sutime_results reads.
    filtered_pd = filtered_pd.rename(columns={'file_id': 'ID', 'pub': 'date_publish', 'body_text': 'maintext'})
    docid2tempinfor_dict=dict()
    save_file = f"./0_Corpus/1_Pretraining_Preprocessing/3_Temporal_Information_Data/0_Maintext_TempInfor_Files/{file_idx}_maintext_sutime_tempinfor.pickle"
    # sutime_results returns (docid, result) pairs, so dict() keys the results by docid.
    result_dict = dict(thread_map(sutime_results, filtered_pd.iterrows(), tqdm_class=tqdm.tqdm, max_workers=multi_process_num))
    # Use a context manager so the output file is flushed and closed deterministically.
    with open(save_file, "wb") as out_file:
        pickle.dump(result_dict, out_file)
    end_t = time.time()
    print(f"Time use: {end_t-start_t}")

## Temporal_Information_Extraction (title)

In [None]:
import os
import sys
import glob
import pickle
import time
import pandas  as pd
from tqdm import tqdm
import pyarrow.feather as feather
from pytorch_pretrained_bert import BertTokenizer, BasicTokenizer
from babel.dates import format_date, format_datetime, format_time
import warnings
warnings.filterwarnings("ignore")
import multiprocessing
import tqdm.notebook as tqdm
from tqdm.contrib.concurrent import thread_map
multi_process_num = multiprocessing.cpu_count()
from sutime import SUTime
sutime = SUTime(mark_time_ranges=True, include_range=True)

In [2]:
# Paths of all title entity/sentence files, in lexicographic order.
ranked_titleinfor_files_paths = sorted(
    glob.glob(f"./0_Corpus/0_NYT_Data_Extraction/3_EntInfor_AND_SentIdxInfor-Titles_Files/*.feather")
)
# docid -> publication timestamp; used below to fill the 'date_publish' column.
# NOTE: pickle can execute arbitrary code on load; only open files produced by this pipeline.
with open("./0_Corpus/0_NYT_Data_Extraction/1_NYT_FilteredData_Files/0_docid2timestamp.pickle",'rb') as timestamp_file:
    docid2timestamp_dict = pickle.load(timestamp_file)

In [None]:
def sutime_results(pd_input):
    """Run SUTime on one title row.

    Args:
        pd_input: a ``(row_idx, row)`` pair, as yielded by
            ``DataFrame.iterrows()``. ``row`` must provide the keys
            ``"maintext"`` (the text to parse), ``"date_publish"`` and
            ``"ID"``.

    Returns:
        A ``(docid, sutime_result)`` tuple, where ``sutime_result`` is
        whatever ``sutime.parse`` returns for the row's text.
    """
    row_idx, row = pd_input
    text = row["maintext"]
    raw_date = row["date_publish"]
    # Re-join the first eight characters with dashes
    # (presumably YYYYMMDD -> YYYY-MM-DD; confirm against the data).
    reference_date = "-".join((raw_date[:4], raw_date[4:6], raw_date[6:8]))
    docid = row["ID"]
    parsed = sutime.parse(text, reference_date=reference_date)
    # Lightweight progress marker every 1000 rows.
    if row_idx % 1000 == 0:
        print(row_idx, end="; ")
    return (docid, parsed)

# Run SUTime over the title text of every title file and write one pickle
# (docid -> SUTime result) per input file.
for file_idx, data_file_path in enumerate(ranked_titleinfor_files_paths):
    start_t = time.time()
    # Process the whole file: the filtering stage looks up every document ID
    # in the resulting pickle, so truncating the frame here would break it.
    filtered_pd = feather.read_feather(data_file_path)
    print(data_file_path,len(filtered_pd))
    # Rename to the column names that sutime_results reads.
    filtered_pd = filtered_pd.rename(columns={'file_id': 'ID', 'pub': 'date_publish', 'title_text': 'maintext'})
    # The publication date is taken from the docid -> timestamp mapping.
    filtered_pd['date_publish']= filtered_pd['ID'].map(docid2timestamp_dict)
    save_file = f"./0_Corpus/1_Pretraining_Preprocessing/3_Temporal_Information_Data/1_Titletext_TempInfor_Files/{file_idx}_titletext_sutime_tempinfor.pickle"
    # sutime_results returns (docid, result) pairs, so dict() keys the results by docid.
    result_dict = dict(thread_map(sutime_results, filtered_pd.iterrows(), tqdm_class=tqdm.tqdm, max_workers=multi_process_num))
    # Use a context manager so the output file is flushed and closed deterministically.
    with open(save_file, "wb") as out_file:
        pickle.dump(result_dict, out_file)
    end_t = time.time()
    print(f"Time use: {end_t-start_t}")


# Temporal_Information_Filtering

In [1]:
import re
import glob
import pickle
import pandas as pd
import pyarrow.feather as feather

In [2]:
# Gather the five parallel file lists used by the filtering stage.
# Each list is sorted lexicographically.
maintext_tempinfor_files_paths = sorted(
    glob.glob(f"./0_Corpus/1_Pretraining_Preprocessing/3_Temporal_Information_Data/0_Maintext_TempInfor_Files/*.pickle")
)

titletext_tempinfor_files_paths = sorted(
    glob.glob(f"./0_Corpus/1_Pretraining_Preprocessing/3_Temporal_Information_Data/1_Titletext_TempInfor_Files/*.pickle")
)

ranked_data_files_paths = sorted(
    glob.glob(f"./0_Corpus/0_NYT_Data_Extraction/1_NYT_FilteredData_Files/*.feather")
)

ranked_entAndsent_files_paths = sorted(
    glob.glob("0_Corpus/0_NYT_Data_Extraction/2_EntInfor_AND_SentIdxInfor_Files/*.feather")
)

ranked_titleinfor_files_paths = sorted(
    glob.glob(f"./0_Corpus/0_NYT_Data_Extraction/3_EntInfor_AND_SentIdxInfor-Titles_Files/*.feather")
)

# Print the five list lengths for a quick visual comparison.
print(
    len(maintext_tempinfor_files_paths),
    len(titletext_tempinfor_files_paths),
    len(ranked_data_files_paths),
    len(ranked_entAndsent_files_paths),
    len(ranked_titleinfor_files_paths),
)

# Sanity check: the i-th entry of every list must be a file whose name starts
# with "<i>_", so that zipping the lists pairs up files of the same shard.
# NOTE(review): zip() stops at the shortest list, so lists of unequal length
# are only checked up to that length.
for file_idx, fpaths in enumerate(zip(maintext_tempinfor_files_paths, ranked_data_files_paths, titletext_tempinfor_files_paths,ranked_entAndsent_files_paths, ranked_titleinfor_files_paths)):
    # Leading "<index>" token of each file name.
    file_prefixes = [fpath.split("/")[-1].split("_")[0] for fpath in fpaths]
    if any(prefix != str(file_idx) for prefix in file_prefixes):
        # A bare `raise` here has no active exception and would surface as an
        # uninformative RuntimeError; raise the same type with a real message.
        raise RuntimeError(
            f"Input files are not aligned at position {file_idx}: "
            f"found index prefixes {file_prefixes} in {list(fpaths)}"
        )

10 10 10 10 10


In [3]:
def skip_temp_fun(text):
    """Tell whether a temporal expression should be discarded.

    The lower-cased text is searched for each skip pattern in turn
    (week/weekend/year/decade words, "several"/"a few", "the day",
    millennium/century words, "more"/"less").

    Args:
        text: surface string of the temporal expression.

    Returns:
        True if any pattern is found anywhere in the text, else False.
    """
    lowered = text.lower()
    skip_patterns = (
        '(.*weeks?)|(.*weekends?)|(.*years?)|(.*decades?)',
        '(several)|(a few)',
        "(the day)|(the other day)|(millennium)|(century)|(centuries)",
        "(more)|(less)",
    )
    return any(re.search(pattern, lowered) for pattern in skip_patterns)

def sutimetemp_filtering_fun(tempinfor):
    """Keep the DATE annotations whose value starts with a concrete date.

    For each annotation of type ``"DATE"`` that is not rejected by
    ``skip_temp_fun``, the ``value`` is matched (from its start) against a
    year-month-day, year-month and year pattern, in that order; the first
    match decides the granularity label.

    Args:
        tempinfor: iterable of annotation dicts with the keys ``type``,
            ``text``, ``value``, ``start`` and ``end``.

    Returns:
        A list of ``[text, value, start, end, granularity]`` entries, where
        ``granularity`` is ``"day"``, ``"month"`` or ``"year"``. Malformed
        annotations (missing key, non-string field, non-dict item) are
        silently skipped.
    """
    # Most specific pattern first: a full date would also match the month and
    # year patterns, so the order decides the label.
    granularity_patterns = (
        (r'(([1-2]\d{3})-((0[1-9])|(1[0-2]))-((0[1-9])|([12][0-9])|(3[0-1])))', "day"),
        (r'(([1-2]\d{3})-((0[1-9])|(1[0-2])))', "month"),
        (r'([1-2]\d{3})', "year"),
    )
    temp_list = []
    for temp_dict in tempinfor:
        try:
            if temp_dict['type'] != "DATE":
                continue
            temp_text = temp_dict['text']
            if skip_temp_fun(temp_text):
                continue
            temp_value = temp_dict['value']
            t_infor = [temp_text, temp_value, temp_dict['start'], temp_dict['end']]
            for pattern, granularity in granularity_patterns:
                if re.match(pattern, temp_value):
                    t_infor.append(granularity)
                    temp_list.append(t_infor)
                    break
        except (KeyError, TypeError, AttributeError):
            # Best effort: skip malformed annotations, but (unlike a bare
            # `except:`) let KeyboardInterrupt/SystemExit and real bugs through.
            continue
    return temp_list

In [4]:
def return_filtered_temp_infor(text,raw_sutimetemp_infor, sent_infor_list, ent_infor_list):
    """Align filtered SUTime dates with DATE entities and sentences.

    Args:
        text: the document text the character offsets refer to.
        raw_sutimetemp_infor: raw SUTime annotations for ``text``.
        sent_infor_list: sentence spans encoded as ``"<begin>_X_<end>"``.
        ent_infor_list: entity strings separated by ``"_X_"``; the second
            and third fields are the begin/end offsets and DATE entities
            end with ``"DATE"``.

    Returns:
        ``(filtered, [sent_pos_list, sentidx2newsentidx])`` where
        ``filtered`` holds ``[text, granularity, [begin, end],
        [sent_idx, new_sent_idx]]`` for every kept SUTime date whose span
        equals a DATE entity span and lies inside a sentence,
        ``sent_pos_list`` holds the ``[begin, end]`` of every sentence, and
        ``sentidx2newsentidx`` maps each sentence index to its index when
        whitespace-only sentences are not counted.
    """
    sent_pos_list = []
    sentidx2newsentidx = dict()
    nonempty_seen = 0
    for sent_i, sent_infor in enumerate(sent_infor_list):
        sent_begin, sent_end = (int(part) for part in sent_infor.split("_X_"))
        sent_pos_list.append([sent_begin, sent_end])
        if text[sent_begin:sent_end].strip():
            nonempty_seen += 1
        # Index of the latest non-empty sentence (-1 before the first one).
        sentidx2newsentidx[sent_i] = nonempty_seen - 1

    filtered_sutimetemp_infor = []
    if len(raw_sutimetemp_infor) == 0:
        return filtered_sutimetemp_infor, [sent_pos_list, sentidx2newsentidx]

    sutimetemp_infor = sutimetemp_filtering_fun(raw_sutimetemp_infor)
    date_ent_spans = [
        [int(pos) for pos in entinfor.split("_X_")[1:3]]
        for entinfor in ent_infor_list
        if entinfor.endswith('DATE')
    ]
    for temp_text, _value, b_pos, e_pos, temp_grad in sutimetemp_infor:
        # Keep only SUTime dates whose span exactly matches a DATE entity.
        span_is_date_ent = any(
            b_pos == span[0] and e_pos == span[1] for span in date_ent_spans
        )
        if not span_is_date_ent:
            continue
        # Attach the first sentence that fully contains the span.
        for sent_i, (sent_begin, sent_end) in enumerate(sent_pos_list):
            if sent_begin <= b_pos <= e_pos <= sent_end:
                filtered_sutimetemp_infor.append(
                    [temp_text, temp_grad, [b_pos, e_pos], [sent_i, sentidx2newsentidx[sent_i]]]
                )
                break
    return filtered_sutimetemp_infor, [sent_pos_list, sentidx2newsentidx]

In [5]:
# Build docid -> [main dates, main sentence info, title dates, title sentence info]
# by combining, shard by shard, the SUTime pickles with the text and
# entity/sentence tables.
docid2tempinfor_dict = dict()
for file_idx, fpaths in enumerate(zip(maintext_tempinfor_files_paths,titletext_tempinfor_files_paths, ranked_data_files_paths, ranked_entAndsent_files_paths, ranked_titleinfor_files_paths)):
    maintext_tempinfor_file, titletext_tempinfor_file, data_file, entAndsent_file, titleinfor_file = fpaths
    # Context managers close the pickle files promptly instead of leaking
    # two handles per shard.
    with open(maintext_tempinfor_file,'rb') as main_pickle:
        docid2maintext_tempinfor_dict = pickle.load(main_pickle)
    with open(titletext_tempinfor_file,'rb') as title_pickle:
        docid2titletext_tempinfor_dict = pickle.load(title_pickle)
    
    text_data = feather.read_feather(data_file)
    entAndsent_data = feather.read_feather(entAndsent_file)
    titleinfor_data = feather.read_feather(titleinfor_file)
    print(data_file)
    
    # Give the body and title columns distinct names before joining on 'ID'.
    text_data = text_data.rename(columns={'body_text': 'main_text', 'pub': 'date_publish', 'file_id': 'ID'})
    entAndsent_data = entAndsent_data.rename(columns={'ent_infor': 'main_ent_infor', 'sent_infor': 'main_sent_infor'})
    titleinfor_data = titleinfor_data.rename(columns={'ent_infor': 'title_ent_infor', 'sent_infor': 'title_sent_infor'})
    # pd.merge defaults to an inner join, so only IDs present in all three tables survive.
    merged_pd = pd.merge(text_data, entAndsent_data, on='ID')
    merged_pd = pd.merge(merged_pd, titleinfor_data, on='ID')
    merged_pd = merged_pd[["main_text", "main_ent_infor", "main_sent_infor", "title_text", "title_ent_infor", "title_sent_infor", "date_publish", "ID"]]
    for row_idx, row in merged_pd.iterrows():
        # Progress marker every 10000 rows.
        if row_idx%10000==0:
            print(row_idx,end="; ")
        main_text = row["main_text"]
        text_sent_infor = row["main_sent_infor"]
        text_ent_infor = row["main_ent_infor"]
        title_text = row["title_text"]
        title_ent_infor = row["title_ent_infor"]
        title_sent_infor = row["title_sent_infor"]
        date_publish = row["date_publish"]
        docid = row["ID"]
        # Both lookups raise KeyError if the extraction stage did not cover this document.
        main_sutimetemp_infor = docid2maintext_tempinfor_dict[docid]
        main_filtered_temp_infor, main_sent_pos_list = return_filtered_temp_infor(main_text, main_sutimetemp_infor, text_sent_infor, text_ent_infor)
        title_sutimetemp_infor = docid2titletext_tempinfor_dict[docid]
        title_filtered_temp_infor, title_sent_pos_list = return_filtered_temp_infor(title_text, title_sutimetemp_infor, title_sent_infor, title_ent_infor)
        docid2tempinfor_dict[docid] = [main_filtered_temp_infor, main_sent_pos_list, title_filtered_temp_infor, title_sent_pos_list]

./0_Corpus/0_NYT_Data_Extraction/1_NYT_FilteredData_Files/0_NYT_filtered_data.feather
0; 10000; 20000; 30000; 40000; 50000; 60000; 70000; 80000; 90000; 100000; 110000; 120000; 130000; 140000; 150000; 160000; ./0_Corpus/0_NYT_Data_Extraction/1_NYT_FilteredData_Files/1_NYT_filtered_data.feather
0; 10000; 20000; 30000; 40000; 50000; 60000; 70000; 80000; 90000; 100000; 110000; 120000; 130000; 140000; 150000; 160000; ./0_Corpus/0_NYT_Data_Extraction/1_NYT_FilteredData_Files/2_NYT_filtered_data.feather
0; 10000; 20000; 30000; 40000; 50000; 60000; 70000; 80000; 90000; 100000; 110000; 120000; 130000; 140000; 150000; 160000; ./0_Corpus/0_NYT_Data_Extraction/1_NYT_FilteredData_Files/3_NYT_filtered_data.feather
0; 10000; 20000; 30000; 40000; 50000; 60000; 70000; 80000; 90000; 100000; 110000; 120000; 130000; 140000; 150000; 160000; ./0_Corpus/0_NYT_Data_Extraction/1_NYT_FilteredData_Files/4_NYT_filtered_data.feather
0; 10000; 20000; 30000; 40000; 50000; 60000; 70000; 80000; 90000; 100000; 110000; 

In [6]:
# Persist the filtered SUTime information for the tokenization stage.
save_file = f"./0_Corpus/1_Pretraining_Preprocessing/3_Temporal_Information_Data/2_Extracted_TempInfor/filtered_tempinfor_sutime.pickle"
# Use a context manager so the file is flushed and closed deterministically.
with open(save_file, "wb") as out_file:
    pickle.dump(docid2tempinfor_dict, out_file)

# Temporal_Information_Tokenization

In [1]:
import pickle
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Load the filtered SUTime information written by the filtering stage.
docid2tempinfor_dict_file = f"./0_Corpus/1_Pretraining_Preprocessing/3_Temporal_Information_Data/2_Extracted_TempInfor/filtered_tempinfor_sutime.pickle"
# NOTE: pickle can execute arbitrary code on load; only open files produced by this pipeline.
with open(docid2tempinfor_dict_file,'rb') as in_file:
    docid2tempinfor_dict = pickle.load(in_file)
print(len(docid2tempinfor_dict))

FileNotFoundError: [Errno 2] No such file or directory: './0_Corpus/1_Pretraining_Preprocessing/3_Temporal_Information_Data/2_Extracted_TempInfor/filtered_tempinfor_sutime.pickle'

In [9]:
# Sample passage for inspecting the tokenizer output.
temp_text = """ The charges against Mr. Smith, who was born in Brooklyn and raised in Uniondale, on Long Island, have followed his career as a major figure in rap music and videos during the 1990s. Before 2006 he had largely stayed clear of the kind of violence and criminality associated with the personal lives and music of some other pioneering rap artists, including the Notorious B.I.G. and Tupac Shakur, who were killed in drive-by shootings."""

# Token ids only, without special tokens.
sample_token_ids = tokenizer.encode(temp_text, add_special_tokens=False)
print(sample_token_ids)

[1109, 4917, 1222, 1828, 119, 2159, 117, 1150, 1108, 1255, 1107, 6010, 1105, 2120, 1107, 1913, 4319, 117, 1113, 3261, 2054, 117, 1138, 1723, 1117, 1578, 1112, 170, 1558, 2482, 1107, 12488, 1390, 1105, 6581, 1219, 1103, 3281, 119, 2577, 1386, 1119, 1125, 3494, 3523, 2330, 1104, 1103, 1912, 1104, 4289, 1105, 4771, 1785, 2628, 1114, 1103, 2357, 2491, 1105, 1390, 1104, 1199, 1168, 14024, 12488, 2719, 117, 1259, 1103, 1753, 19402, 139, 119, 146, 119, 144, 119, 1105, 17037, 4163, 1665, 156, 21893, 2149, 117, 1150, 1127, 1841, 1107, 2797, 118, 1118, 4598, 1116, 119]


In [11]:
# Display the sample passage as space-separated tokens (encode to ids, then map ids back to tokens).
" ".join(tokenizer.convert_ids_to_tokens(tokenizer.encode(temp_text, add_special_tokens=False)))

'The charges against Mr . Smith , who was born in Brooklyn and raised in Union ##dale , on Long Island , have followed his career as a major figure in rap music and videos during the 1990s . Before 2006 he had largely stayed clear of the kind of violence and criminal ##ity associated with the personal lives and music of some other pioneering rap artists , including the Not ##orious B . I . G . and Tu ##pa ##c S ##hak ##ur , who were killed in drive - by shooting ##s .'

In [4]:
# List the attributes available on the tokenizer object (exploratory cell).
dir(tokenizer)

['SPECIAL_TOKENS_ATTRIBUTES',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_tokens',
 '_additional_special_tokens',
 '_auto_class',
 '_batch_encode_plus',
 '_batch_prepare_for_model',
 '_bos_token',
 '_call_one',
 '_cls_token',
 '_convert_id_to_token',
 '_convert_token_to_id',
 '_convert_token_to_id_with_added_voc',
 '_create_repo',
 '_create_trie',
 '_decode',
 '_decode_use_source_tokenizer',
 '_encode_plus',
 '_eos_token',
 '_eventual_warn_about_too_long_sequence',
 '_eventually_correct_t5_max_length',
 '_from_pretrained',
 '_get_files_timestamps',
 '_get_padding_truncation_strategies',
 '_in_target_context_manager

In [2]:
def _tokenize_temp_infor(temp_infor_list, sent_offset, max_coarse_tokens=8):
    """Tokenize temporal expressions and shift their sentence index.

    Args:
        temp_infor_list: entries of the form
            ``[text, granularity, [begin, end], [sent_idx, new_sent_idx]]``.
        sent_offset: value added to each entry's ``new_sent_idx``.
        max_coarse_tokens: "month" and "year" expressions tokenized into
            more than this many tokens are dropped.

    Returns:
        A list of ``[text, granularity, token_ids, sentence_index]`` entries.
    """
    results = []
    for temp_infor in temp_infor_list:
        temp_text = temp_infor[0]
        temp_grad = temp_infor[1]
        temp_sent_idx = temp_infor[3][1] + sent_offset
        token_result = tokenizer.encode(temp_text, add_special_tokens=False)
        if temp_grad in ("month", "year") and len(token_result) > max_coarse_tokens:
            continue
        results.append([temp_text, temp_grad, token_result, temp_sent_idx])
    return results


# docid -> tokenized temporal expressions of the title followed by those of
# the body; body sentence indices are shifted past the title sentences.
docid2temptokenizeinfor_dict = dict()
row_idx = 0
for docid, tempinfor in docid2tempinfor_dict.items():
    row_idx+=1
    # Progress marker every 10000 documents.
    if row_idx%10000==0:
        print(row_idx,end="; ")  
    main_temp_infor, main_sent_pos_infor, title_temp_infor, title_sent_pos_infor = tempinfor
    _,title_sentidx2newsentidx = title_sent_pos_infor
    # The last sentence's new index + 1 is the number of non-empty title
    # sentences. A title without any sentence contributes 0 instead of
    # failing on the missing key -1.
    if title_sentidx2newsentidx:
        title_sent_num = title_sentidx2newsentidx[len(title_sentidx2newsentidx)-1]+1
    else:
        title_sent_num = 0
    tokenize_infor_results = _tokenize_temp_infor(title_temp_infor, 0)
    tokenize_infor_results.extend(_tokenize_temp_infor(main_temp_infor, title_sent_num))
    docid2temptokenizeinfor_dict[docid] = tokenize_infor_results
    
save_file = f"./0_Corpus/1_Pretraining_Preprocessing/3_Temporal_Information_Data/3_Tempinfor_Tokenize/docid2temptokenizeinfor_dict.pickle"
# Use a context manager so the file is flushed and closed deterministically.
with open(save_file, "wb") as out_file:
    pickle.dump(docid2temptokenizeinfor_dict, out_file)

10000; 20000; 30000; 40000; 50000; 60000; 70000; 80000; 90000; 100000; 110000; 120000; 130000; 140000; 150000; 160000; 170000; 180000; 190000; 200000; 210000; 220000; 230000; 240000; 250000; 260000; 270000; 280000; 290000; 300000; 310000; 320000; 330000; 340000; 350000; 360000; 370000; 380000; 390000; 400000; 410000; 420000; 430000; 440000; 450000; 460000; 470000; 480000; 490000; 500000; 510000; 520000; 530000; 540000; 550000; 560000; 570000; 580000; 590000; 600000; 610000; 620000; 630000; 640000; 650000; 660000; 670000; 680000; 690000; 700000; 710000; 720000; 730000; 740000; 750000; 760000; 770000; 780000; 790000; 800000; 810000; 820000; 830000; 840000; 850000; 860000; 870000; 880000; 890000; 900000; 910000; 920000; 930000; 940000; 950000; 960000; 970000; 980000; 990000; 1000000; 1010000; 1020000; 1030000; 1040000; 1050000; 1060000; 1070000; 1080000; 1090000; 1100000; 1110000; 1120000; 1130000; 1140000; 1150000; 1160000; 1170000; 1180000; 1190000; 1200000; 1210000; 1220000; 1230000; 1

In [35]:
# Regroup the tokenized expressions per document and sentence:
# docid -> sentence index -> list of [granularity, token_ids].
docid2sentidx2temptokenizeinfor_dict = dict()
for docid, temptokenizeinfor in docid2temptokenizeinfor_dict.items():
    sentidx2infor = dict()
    for temp_text, temp_grad, token_result, temp_sent_idx in temptokenizeinfor:
        sentidx2infor.setdefault(temp_sent_idx, []).append([temp_grad, token_result])
    # Documents without any expression still get an (empty) entry.
    docid2sentidx2temptokenizeinfor_dict[docid] = sentidx2infor
save_file = f"./0_Corpus/1_Pretraining_Preprocessing/3_Temporal_Information_Data/3_Tempinfor_Tokenize/docid2sentidx2temptokenizeinfor_dict.pickle"
# Use a context manager so the file is flushed and closed deterministically.
with open(save_file, "wb") as out_file:
    pickle.dump(docid2sentidx2temptokenizeinfor_dict, out_file)

In [3]:
# granularity -> token count -> set of distinct token-id tuples with that count.
grad_temptokenset_dict = {"day":dict(), "month":dict(), "year":dict()}
for docid, temptokenizeinfor in docid2temptokenizeinfor_dict.items():
    for tempinfor in temptokenizeinfor:
        temp_grad = tempinfor[1]
        token_result = tempinfor[2]
        # Lists are unhashable, so each tokenization is stored as a tuple.
        grad_temptokenset_dict[temp_grad].setdefault(len(token_result), set()).add(tuple(token_result))
        
save_file = f"./0_Corpus/1_Pretraining_Preprocessing/3_Temporal_Information_Data/3_Tempinfor_Tokenize/gradlen_temptokenizationset_dict.pickle"
# Use a context manager so the file is flushed and closed deterministically.
with open(save_file, "wb") as out_file:
    pickle.dump(grad_temptokenset_dict, out_file)

In [4]:
# Make the per-length sets cumulative, in place: after this loop the set for
# token count L also contains every tokenization with a smaller token count.
for grad, len2temptokenset in grad_temptokenset_dict.items():
    previous_length = None
    for length in sorted(len2temptokenset):
        # Chain from the nearest smaller length that actually exists, so a
        # gap in the lengths (or a smallest length other than 1) does not
        # raise KeyError on `length - 1`.
        if previous_length is not None:
            len2temptokenset[length].update(len2temptokenset[previous_length])
        previous_length = length
save_file = f"./0_Corpus/1_Pretraining_Preprocessing/3_Temporal_Information_Data/3_Tempinfor_Tokenize/new_gradlen_temptokenizationset_dict.pickle"
# Use a context manager so the file is flushed and closed deterministically.
with open(save_file, "wb") as out_file:
    pickle.dump(grad_temptokenset_dict, out_file)

In [39]:
# Display how many distinct "day" tokenizations are stored under token count 3.
len(grad_temptokenset_dict["day"][3])

4877

# Others

## Temporal_Information_Filtering (NER)

In [None]:
import glob
import pickle
import pandas as pd
import pyarrow.feather as feather

In [None]:
# Gather the three parallel file lists used by the NER-based filtering.
# Each list is sorted lexicographically.
ranked_data_files_paths = sorted(
    glob.glob(f"./0_Corpus/0_NYT_Data_Extraction/1_NYT_FilteredData_Files/*.feather")
)

ranked_entAndsent_files_paths = sorted(
    glob.glob("0_Corpus/0_NYT_Data_Extraction/2_EntInfor_AND_SentIdxInfor_Files/*.feather")
)

ranked_titleinfor_files_paths = sorted(
    glob.glob(f"./0_Corpus/0_NYT_Data_Extraction/3_EntInfor_AND_SentIdxInfor-Titles_Files/*.feather")
)

# Print the three list lengths for a quick visual comparison.
print(
    len(ranked_data_files_paths),
    len(ranked_entAndsent_files_paths),
    len(ranked_titleinfor_files_paths),
)

In [None]:
def return_filtered_temp_infor(text, sent_infor_list, ent_infor_list):
    """Locate every DATE entity inside its sentence.

    Args:
        text: the document text the character offsets refer to.
        sent_infor_list: sentence spans encoded as ``"<begin>_X_<end>"``.
        ent_infor_list: entities encoded as
            ``"<text>_X_<begin>_X_<end>_X_<type>"``; only entries ending
            with ``"DATE"`` are used.

    Returns:
        ``(filtered, [sent_pos_list, sentidx2newsentidx])`` where
        ``filtered`` holds ``[text, [begin, end], [sent_idx, new_sent_idx]]``
        for each DATE entity that lies inside a sentence, ``sent_pos_list``
        holds the ``[begin, end]`` of every sentence, and
        ``sentidx2newsentidx`` maps each sentence index to its index when
        whitespace-only sentences are not counted.
    """
    sent_pos_list = []
    sentidx2newsentidx = dict()
    nonempty_seen = 0
    for sent_i, sent_infor in enumerate(sent_infor_list):
        sent_begin, sent_end = (int(part) for part in sent_infor.split("_X_"))
        sent_pos_list.append([sent_begin, sent_end])
        if text[sent_begin:sent_end].strip():
            nonempty_seen += 1
        # Index of the latest non-empty sentence (-1 before the first one).
        sentidx2newsentidx[sent_i] = nonempty_seen - 1

    filtered_sutimetemp_infor = []
    for entinfor in ent_infor_list:
        if not entinfor.endswith('DATE'):
            continue
        temp_text, raw_begin, raw_end, _ent_type = entinfor.split("_X_")
        b_pos = int(raw_begin)
        e_pos = int(raw_end)
        # First sentence that fully contains the entity span, if any.
        containing_sent = next(
            (
                sent_i
                for sent_i, (sent_begin, sent_end) in enumerate(sent_pos_list)
                if sent_begin <= b_pos <= e_pos <= sent_end
            ),
            None,
        )
        if containing_sent is not None:
            filtered_sutimetemp_infor.append(
                [temp_text, [b_pos, e_pos], [containing_sent, sentidx2newsentidx[containing_sent]]]
            )
    return filtered_sutimetemp_infor, [sent_pos_list, sentidx2newsentidx]

In [None]:
# Build docid -> [main dates, main sentence info, title dates, title sentence info]
# from the NER DATE entities, shard by shard.
docid2tempinfor_dict = dict()
shard_paths = zip(ranked_data_files_paths, ranked_entAndsent_files_paths, ranked_titleinfor_files_paths)
for file_idx, (data_file, entAndsent_file, titleinfor_file) in enumerate(shard_paths):
    text_data = feather.read_feather(data_file)
    entAndsent_data = feather.read_feather(entAndsent_file)
    titleinfor_data = feather.read_feather(titleinfor_file)
    print(data_file)

    # Give the body and title columns distinct names before joining on 'ID'.
    text_data = text_data.rename(columns={'body_text': 'main_text', 'pub': 'date_publish', 'file_id': 'ID'})
    entAndsent_data = entAndsent_data.rename(columns={'ent_infor': 'main_ent_infor', 'sent_infor': 'main_sent_infor'})
    titleinfor_data = titleinfor_data.rename(columns={'ent_infor': 'title_ent_infor', 'sent_infor': 'title_sent_infor'})
    merged_pd = pd.merge(
        pd.merge(text_data, entAndsent_data, on='ID'),
        titleinfor_data,
        on='ID',
    )
    merged_pd = merged_pd[["main_text", "main_ent_infor", "main_sent_infor", "title_text", "title_ent_infor", "title_sent_infor", "date_publish", "ID"]]
    for row_idx, row in merged_pd.iterrows():
        # Progress marker every 10000 rows.
        if row_idx % 10000 == 0:
            print(row_idx, end="; ")
        main_result = return_filtered_temp_infor(
            row["main_text"], row["main_sent_infor"], row["main_ent_infor"]
        )
        title_result = return_filtered_temp_infor(
            row["title_text"], row["title_sent_infor"], row["title_ent_infor"]
        )
        # Each result is a (filtered dates, sentence info) pair.
        docid2tempinfor_dict[row["ID"]] = [*main_result, *title_result]

In [None]:
# Persist the NER-based temporal information for the tokenization stage.
save_file = f"./0_Corpus/1_Pretraining_Preprocessing/3_Temporal_Information_Data/2_Extracted_TempInfor/filtered_tempinfor_ner.pickle"
# Use a context manager so the file is flushed and closed deterministically.
with open(save_file, "wb") as out_file:
    pickle.dump(docid2tempinfor_dict, out_file)

## Temporal_Information_Tokenization

In [None]:
import pickle
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Load the NER-based temporal information written by the filtering stage.
docid2tempinfor_dict_file = f"./0_Corpus/1_Pretraining_Preprocessing/3_Temporal_Information_Data/2_Extracted_TempInfor/filtered_tempinfor_ner.pickle"
# NOTE: pickle can execute arbitrary code on load; only open files produced by this pipeline.
with open(docid2tempinfor_dict_file,'rb') as in_file:
    docid2tempinfor_dict = pickle.load(in_file)
print(len(docid2tempinfor_dict))

In [None]:
def _tokenize_ner_temp_infor(temp_infor_list, sent_offset):
    """Tokenize NER DATE expressions and shift their sentence index.

    Args:
        temp_infor_list: entries of the form
            ``[text, [begin, end], [sent_idx, new_sent_idx]]``.
        sent_offset: value added to each entry's ``new_sent_idx``.

    Returns:
        A list of ``[token_ids, sentence_index]`` entries.
    """
    results = []
    for temp_infor in temp_infor_list:
        token_result = tokenizer.encode(temp_infor[0], add_special_tokens=False)
        results.append([token_result, temp_infor[2][1] + sent_offset])
    return results


# docid -> tokenized DATE expressions of the title followed by those of the
# body; body sentence indices are shifted past the title sentences.
docid2temptokenizeinfor_dict = dict()
row_idx = 0
for docid, tempinfor in docid2tempinfor_dict.items():
    row_idx+=1
    # Progress marker every 10000 documents.
    if row_idx%10000==0:
        print(row_idx,end="; ")
    main_temp_infor, main_sent_pos_infor, title_temp_infor, title_sent_pos_infor = tempinfor
    _,title_sentidx2newsentidx = title_sent_pos_infor
    # The last sentence's new index + 1 is the number of non-empty title
    # sentences. A title without any sentence contributes 0 instead of
    # failing on the missing key -1.
    if title_sentidx2newsentidx:
        title_sent_num = title_sentidx2newsentidx[len(title_sentidx2newsentidx)-1]+1
    else:
        title_sent_num = 0
    tokenize_infor_results = _tokenize_ner_temp_infor(title_temp_infor, 0)
    tokenize_infor_results.extend(_tokenize_ner_temp_infor(main_temp_infor, title_sent_num))
    docid2temptokenizeinfor_dict[docid] = tokenize_infor_results

In [None]:
# Persist the tokenized NER DATE expressions.
save_file = f"./0_Corpus/1_Pretraining_Preprocessing/3_Temporal_Information_Data/3_Tempinfor_Tokenize/docid2temptokenizeinfor_ner_dict.pickle"
# Use a context manager so the file is flushed and closed deterministically.
with open(save_file, "wb") as out_file:
    pickle.dump(docid2temptokenizeinfor_dict, out_file)

# Analysis

In [1]:
import pickle

# Load the two tokenization artefacts and print their sizes.
# NOTE: pickle can execute arbitrary code on load; only open files produced by this pipeline.
docid2sentidx2temptokenizeinfor_dict_file = '0_Corpus/1_Pretraining_Preprocessing/3_Temporal_Information_Data/3_Tempinfor_Tokenize/docid2sentidx2temptokenizeinfor_dict.pickle'
with open(docid2sentidx2temptokenizeinfor_dict_file,'rb') as in_file:
    docid2sentidx2temptokenizeinfor_dict = pickle.load(in_file)
gradlen_temptokenizationset_dict_file = '0_Corpus/1_Pretraining_Preprocessing/3_Temporal_Information_Data/3_Tempinfor_Tokenize/gradlen_temptokenizationset_dict.pickle'
with open(gradlen_temptokenizationset_dict_file,'rb') as in_file:
    gradlen_temptokenizationset_dict = pickle.load(in_file)
print(len(docid2sentidx2temptokenizeinfor_dict), len(gradlen_temptokenizationset_dict))

1652347 3
