## EntInfor Extraction & SentIdxInfor Extraction (content)

**Data Size: 1652347**

In [1]:
import os
import glob
import pickle
import pandas as pd
import pyarrow.feather as feather
from collections import Counter
import spacy
import time
# Load the spaCy "en_core_web_sm" pipeline once; the extraction cell below
# reuses this `nlp` object for every shard.
nlp = spacy.load("en_core_web_sm")
from multiprocessing import Pool, cpu_count
# Number of worker processes later passed to `nlp.pipe(n_process=...)`:
# one per CPU reported by `cpu_count()`.
multi_process_num = cpu_count()

# Collect every filtered-data shard, then order the shards by the integer
# prefix of their file name (the text before the first "_").
saved_feather_files_paths = glob.glob("./0_Corpus/0_NYT_Data_Extraction/1_NYT_FilteredData_Files/*.feather")

# os.path.basename (instead of splitting on "/") keeps the index parsing
# correct when glob returns paths that use the platform's own separator.
ranked_files_paths = sorted(
    ([int(os.path.basename(shard_path).split("_")[0]), shard_path]
     for shard_path in saved_feather_files_paths),
    key=lambda pair: pair[0],
)
print(len(ranked_files_paths))

10


In [None]:
# Extract entity spans and sentence boundaries from the article body of every
# filtered shard and store them as one feather file per shard.
#
# Each entity is serialised as "text_X_begin_X_end_X_label" and each sentence
# as "begin_X_end", where begin/end are character offsets into `maintext`.

# Directory that receives one output shard per input shard. Create it up
# front so a missing folder cannot fail the write after the (slow) NLP pass.
ent_sent_output_dir = "0_Corpus/0_NYT_Data_Extraction/2_EntInfor_AND_SentIdxInfor_Files"
os.makedirs(ent_sent_output_dir, exist_ok=True)

# NOTE: shards skipped by the resume check below are not added to total_num,
# so the final figure only covers shards processed in this run.
total_num = 0
for file_idx, file_path in ranked_files_paths:
    save_file_name = f"{file_idx}_final_filtered_entinforANDsentidx.feather"
    save_file_path = f"{ent_sent_output_dir}/{save_file_name}"
    # Resume support: a shard whose output already exists is not recomputed.
    if os.path.exists(save_file_path):
        print(f"Have finished {file_idx}th Processing.")
        print("-----------------------------------------------------------------")
        continue

    filtered_pd = feather.read_feather(file_path)
    filtered_pd = filtered_pd.rename(columns={'file_id': 'ID', 'pub': 'date_publish', 'body_text': 'maintext'})
    total_num += len(filtered_pd)
    text_list = filtered_pd["maintext"].tolist()
    ID_list = filtered_pd["ID"].tolist()
    print(f"Process {file_idx}th file. Data size:{len(filtered_pd),len(filtered_pd)//1000}")
    assert len(filtered_pd) == len(text_list)

    # nlp.pipe yields docs lazily, in the same order as text_list.
    docs = nlp.pipe(text_list, n_process=multi_process_num, batch_size=500)
    detailed_ent_infor_list = []
    detailed_sent_infor_list = []
    for idx, d in enumerate(docs):
        # Progress marker once per 1000 documents.
        if idx % 1000 == 0:
            print(idx // 1000, end="; ")
        maintext = text_list[idx]

        # 1. Entity extraction.
        detailed_ent_infor = []
        for ent in d.ents:
            ent_text = ent.text
            ent_begin_pos, ent_end_pos = ent.start_char, ent.end_char
            # The stored offsets must reproduce the entity text exactly.
            assert maintext[ent_begin_pos:ent_end_pos] == ent_text
            detailed_ent_infor.append(
                "_X_".join([ent_text, str(ent_begin_pos), str(ent_end_pos), ent.label_])
            )
        detailed_ent_infor_list.append(detailed_ent_infor)

        # 2. Sentence boundary extraction (offsets only, no text).
        detailed_sent_infor = []
        for sent in d.sents:
            sent_begin_pos, sent_end_pos = sent.start_char, sent.end_char
            assert maintext[sent_begin_pos:sent_end_pos] == sent.text
            detailed_sent_infor.append("_X_".join([str(sent_begin_pos), str(sent_end_pos)]))
        detailed_sent_infor_list.append(detailed_sent_infor)

    assert len(detailed_ent_infor_list) == len(detailed_sent_infor_list) == len(ID_list)
    entinfor_sentinfor_list = [
        [entinfor, sentinfor, idinfor]
        for entinfor, sentinfor, idinfor in zip(detailed_ent_infor_list, detailed_sent_infor_list, ID_list)
    ]
    entinfor_sentinfor_pd = pd.DataFrame(entinfor_sentinfor_list, columns=["ent_infor", "sent_infor", "ID"])

    # Write to a temporary name and rename afterwards: the resume check above
    # trusts file existence, so a half-written shard must never appear under
    # the final name. The ".tmp" suffix is not matched by "*.feather" globs.
    tmp_save_path = save_file_path + ".tmp"
    feather.write_feather(entinfor_sentinfor_pd, tmp_save_path)
    os.replace(tmp_save_path, save_file_path)

    print(f"Finish processing {len(entinfor_sentinfor_pd)} data.")
    print("-----------------------------------------------------------------")
print("total_num:", total_num)

### Analysis
**Data Size: 1652347**

In [1]:
import os
import glob
import pickle
import pandas as pd
import pyarrow.feather as feather
from collections import Counter

# Locate the entity/sentence shards written by the extraction step.
saved_feather_files_paths = glob.glob("0_Corpus/0_NYT_Data_Extraction/2_EntInfor_AND_SentIdxInfor_Files/*.feather")
print(len(saved_feather_files_paths))

# Order shards by the integer prefix of their file name. os.path.basename
# (instead of splitting on "/") keeps the parsing correct when glob returns
# paths that use the platform's own separator.
ranked_entinfor_files_paths = sorted(
    ([int(os.path.basename(shard_path).split("_")[0]), shard_path]
     for shard_path in saved_feather_files_paths),
    key=lambda pair: pair[0],
)
print(len(ranked_entinfor_files_paths))

# Gather the IDs of all shards; `news_pd` is left holding the last shard read
# so the next cell can display it.
fileid_list = []
for _, shard_path in ranked_entinfor_files_paths:
    news_pd = feather.read_feather(shard_path)
    fileid_list.extend(news_pd["ID"].tolist())

10
10


In [2]:
# Number of distinct IDs collected from all shards in the previous cell.
distinct_id_total = len(set(fileid_list))
print(distinct_id_total)
# Display the last three rows of the most recently loaded shard.
news_pd.tail(3)

1652347


Unnamed: 0,ent_infor,sent_infor,ID
165238,"[American_X_42_X_50_X_NORP, European_X_315_X_3...","[0_X_272, 273_X_496, 496_X_497, 497_X_675, 676...",1265245
165239,"[Joyce Van Patten_X_229_X_245_X_PERSON, New Yo...","[0_X_109, 110_X_201, 202_X_399, 399_X_570, 571...",1614637
165240,"[Jed_X_119_X_122_X_PERSON, Rhys Ifans_X_161_X_...","[0_X_286, 287_X_384, 385_X_448, 449_X_581, 582...",1624014


## EntInfor Extraction & SentIdxInfor Extraction (title)

**Data Size: 1652347**

In [1]:
import os
import glob
import pickle
import pandas as pd
import pyarrow.feather as feather
from collections import Counter
import spacy
import time
# Load the spaCy "en_core_web_sm" pipeline once; the title-extraction cell
# below reuses this `nlp` object for every shard.
nlp = spacy.load("en_core_web_sm")
from multiprocessing import Pool, cpu_count
# Number of worker processes later passed to `nlp.pipe(n_process=...)`:
# one per CPU reported by `cpu_count()`.
multi_process_num = cpu_count()

# Collect every filtered-data shard, then order the shards by the integer
# prefix of their file name (the text before the first "_").
saved_feather_files_paths = glob.glob("./0_Corpus/0_NYT_Data_Extraction/1_NYT_FilteredData_Files/*.feather")

# os.path.basename (instead of splitting on "/") keeps the index parsing
# correct when glob returns paths that use the platform's own separator.
ranked_files_paths = sorted(
    ([int(os.path.basename(shard_path).split("_")[0]), shard_path]
     for shard_path in saved_feather_files_paths),
    key=lambda pair: pair[0],
)
print(len(ranked_files_paths))

10


In [2]:
# Extract entity spans, sentence boundaries and token counts from the title of
# every article in every filtered shard; one output feather file per shard.
#
# Each entity is serialised as "text_X_begin_X_end_X_label" and each sentence
# as "begin_X_end", where begin/end are character offsets into the title text
# that was fed to spaCy (i.e. including the appended ".\n").

# Directory that receives one output shard per input shard. Create it up
# front so a missing folder cannot fail the write after the (slow) NLP pass.
title_output_dir = "0_Corpus/0_NYT_Data_Extraction/3_EntInfor_AND_SentIdxInfor-Titles_Files"
os.makedirs(title_output_dir, exist_ok=True)

# NOTE: shards skipped by the resume check below contribute to neither
# counter, so the final figures only cover shards processed in this run.
total_num = 0
empty_title_num = 0
for file_idx, file_path in ranked_files_paths:
    save_file_name = f"{file_idx}_final_filtered_entinforANDsentidx-title.feather"
    save_file_path = f"{title_output_dir}/{save_file_name}"
    # Resume support: a shard whose output already exists is not recomputed.
    if os.path.exists(save_file_path):
        print(f"Have finished {file_idx}th Processing.")
        print("-----------------------------------------------------------------")
        continue

    filtered_pd = feather.read_feather(file_path)
    filtered_pd = filtered_pd.rename(columns={'file_id': 'ID', 'pub': 'date_publish', 'body_text': 'maintext'})
    total_num += len(filtered_pd)

    text_list = []
    for r in filtered_pd["title"].tolist():
        # `not r` treats a missing (None) title like an empty one instead of
        # raising on len(None).
        if not r:
            text_list.append("")
            empty_title_num += 1
        else:
            # Presumably the ".\n" terminator makes the title parse as a
            # complete sentence -- TODO confirm the original intent.
            text_list.append(r + ".\n")
    ID_list = filtered_pd["ID"].tolist()
    print(f"Process {file_idx}th file. Data size:{len(filtered_pd),len(filtered_pd)//1000}")
    assert len(filtered_pd) == len(text_list)

    # nlp.pipe yields docs lazily, in the same order as text_list.
    docs = nlp.pipe(text_list, n_process=multi_process_num, batch_size=500)
    title_token_num = []
    detailed_ent_infor_list = []
    detailed_sent_infor_list = []
    for idx, d in enumerate(docs):
        # Progress marker once per 1000 documents.
        if idx % 1000 == 0:
            print(idx // 1000, end="; ")
        title_token_num.append(len(d))
        maintext = text_list[idx]

        # 1. Entity extraction.
        detailed_ent_infor = []
        for ent in d.ents:
            ent_text = ent.text
            ent_begin_pos, ent_end_pos = ent.start_char, ent.end_char
            # The stored offsets must reproduce the entity text exactly.
            assert maintext[ent_begin_pos:ent_end_pos] == ent_text
            detailed_ent_infor.append(
                "_X_".join([ent_text, str(ent_begin_pos), str(ent_end_pos), ent.label_])
            )
        detailed_ent_infor_list.append(detailed_ent_infor)

        # 2. Sentence boundary extraction (offsets only, no text).
        detailed_sent_infor = []
        for sent in d.sents:
            sent_begin_pos, sent_end_pos = sent.start_char, sent.end_char
            assert maintext[sent_begin_pos:sent_end_pos] == sent.text
            detailed_sent_infor.append("_X_".join([str(sent_begin_pos), str(sent_end_pos)]))
        detailed_sent_infor_list.append(detailed_sent_infor)

    assert len(detailed_ent_infor_list) == len(detailed_sent_infor_list) == len(ID_list)
    entinfor_sentinfor_list = [
        [title_text, entinfor, sentinfor, t_num, idinfor]
        for title_text, entinfor, sentinfor, t_num, idinfor in zip(
            text_list, detailed_ent_infor_list, detailed_sent_infor_list, title_token_num, ID_list
        )
    ]
    entinfor_sentinfor_pd = pd.DataFrame(
        entinfor_sentinfor_list,
        columns=["title_text", "ent_infor", "sent_infor", "token_num", "ID"],
    )

    # Write to a temporary name and rename afterwards: the resume check above
    # trusts file existence, so a half-written shard must never appear under
    # the final name. The ".tmp" suffix is not matched by "*.feather" globs.
    tmp_save_path = save_file_path + ".tmp"
    feather.write_feather(entinfor_sentinfor_pd, tmp_save_path)
    os.replace(tmp_save_path, save_file_path)

    print(f"Finish processing {len(entinfor_sentinfor_pd)} data.")
    print("-----------------------------------------------------------------")
print("total_num:", total_num)
print("empty_title_num:", empty_title_num)

Process 0th file. Data size:(165234, 165)
0; 1; 2; 3; 4; 5; 6; 7; 8; 9; 10; 11; 12; 13; 14; 15; 16; 17; 18; 19; 20; 21; 22; 23; 24; 25; 26; 27; 28; 29; 30; 31; 32; 33; 34; 35; 36; 37; 38; 39; 40; 41; 42; 43; 44; 45; 46; 47; 48; 49; 50; 51; 52; 53; 54; 55; 56; 57; 58; 59; 60; 61; 62; 63; 64; 65; 66; 67; 68; 69; 70; 71; 72; 73; 74; 75; 76; 77; 78; 79; 80; 81; 82; 83; 84; 85; 86; 87; 88; 89; 90; 91; 92; 93; 94; 95; 96; 97; 98; 99; 100; 101; 102; 103; 104; 105; 106; 107; 108; 109; 110; 111; 112; 113; 114; 115; 116; 117; 118; 119; 120; 121; 122; 123; 124; 125; 126; 127; 128; 129; 130; 131; 132; 133; 134; 135; 136; 137; 138; 139; 140; 141; 142; 143; 144; 145; 146; 147; 148; 149; 150; 151; 152; 153; 154; 155; 156; 157; 158; 159; 160; 161; 162; 163; 164; 165; Finish processing 165234 data.
-----------------------------------------------------------------
Process 1th file. Data size:(165234, 165)
0; 1; 2; 3; 4; 5; 6; 7; 8; 9; 10; 11; 12; 13; 14; 15; 16; 17; 18; 19; 20; 21; 22; 23; 24; 25; 26; 2

In [3]:
# Preview the first three rows of the frame left over from the extraction loop
# (the last shard it built). NOTE: the name is only defined if that loop
# processed at least one shard rather than skipping all of them.
entinfor_sentinfor_pd.head(3)

Unnamed: 0,title_text,ent_infor,sent_infor,token_num,ID
0,Fugitive Is Arrested Near Rome.\n,[],"[0_X_31, 31_X_32]",7,1144727
1,Dukakis's Record: A Success Story.\n,"[Dukakis's Record_X_0_X_16_X_ORG, Success Stor...","[0_X_34, 34_X_35]",9,137146
2,Woe to Those Displaced by China Dam Project.\n,[China_X_26_X_31_X_GPE],"[0_X_44, 44_X_45]",10,823851


### Analysis
**Data Size: 1652347**

In [1]:
import os
import glob
import pickle
import pandas as pd
import pyarrow.feather as feather
from collections import Counter

# Cross-check the title shards (paths1) against the content shards (paths2):
# every pair must hold the same number of rows and the same set of IDs.
saved_feather_files_paths1 = sorted(
    glob.glob("0_Corpus/0_NYT_Data_Extraction/3_EntInfor_AND_SentIdxInfor-Titles_Files/*.feather")
)
saved_feather_files_paths2 = sorted(
    glob.glob("0_Corpus/0_NYT_Data_Extraction/2_EntInfor_AND_SentIdxInfor_Files/*.feather")
)
print(len(saved_feather_files_paths1), len(saved_feather_files_paths2))
# zip() below would silently drop unmatched shards, so require equal counts.
assert len(saved_feather_files_paths1) == len(saved_feather_files_paths2)

fileid_list = []
token_num_list = []
for path1, path2 in zip(saved_feather_files_paths1, saved_feather_files_paths2):
    # The two lists are sorted independently; make sure each pair really
    # refers to the same shard index (the file-name prefix before "_").
    assert os.path.basename(path1).split("_")[0] == os.path.basename(path2).split("_")[0]
    news_pd1 = feather.read_feather(path1)
    news_pd2 = feather.read_feather(path2)
    assert len(news_pd1) == len(news_pd2)
    title_ids = news_pd1["ID"].tolist()
    title_id_set = set(title_ids)
    content_id_set = set(news_pd2["ID"].tolist())
    # Sizes of the two one-sided differences; both must be zero.
    lenth1 = len(title_id_set - content_id_set)
    lenth2 = len(content_id_set - title_id_set)
    fileid_list.extend(title_ids)
    token_num_list.extend(news_pd1["token_num"].tolist())
    assert lenth1 == lenth2 == 0
print(len(set(fileid_list)), len(token_num_list))

10 10
1652347 1652347
