这段代码从arXiv数据集中筛选出CS.cl领域的论文，提取标题、摘要和发布时间等核心信息，并进行文本清洗和标准化处理。最终将处理后的数据保存为PKL和JSON两种格式，为后续的领域概念挖掘和文本分析提供标准化的数据集。

In [1]:
import json
import linecache
import time
import jsonlines
from datetime import datetime, date
import pickle
import os

In [2]:
starting_date = date(1990,1,1)
start_time = time.time()

#arxiv_folder='arxiv-snapshot'
arxiv_folder="E:\\study\\research\\create_gynamic_edge\\concept corpus"
arxiv_json = os.path.join(arxiv_folder,"arxiv-metadata-oai-snapshot.json")

arxiv_cs_original=[]
arxiv_cs_modified=[]

with jsonlines.open(arxiv_json, 'r') as f:
    for id_of_abstract, line in enumerate(f):
        if line['categories'] in ['cs.CL']:
            arxiv_cs_original.append(line)  ## store the original one
        
            get_date = datetime.strptime(line['versions'][0]['created'], '%a, %d %b %Y %H:%M:%S %Z').date()
            paper_time = (get_date - starting_date).days
            arxiv_cs_modified.append([line['categories'],line['title'],line['abstract'],paper_time]) ## store modified one
        
elapsed_time = time.time() - start_time
print(f"sub arxiv: {id_of_abstract}")
print(f"cs.CL: {len(arxiv_cs_original)}; Modified: {len(arxiv_cs_modified)}, Elapsed time: {elapsed_time} seconds")

sub arxiv: 2848278
cs.CL: 33892; Modified: 33892, Elapsed time: 33.86599063873291 seconds


In [4]:

# 保存为pkl文件
with open('arxiv_cs_style_modified.pkl', 'wb') as pkl_file:
    pickle.dump(arxiv_cs_modified, pkl_file)

# 保存为json文件
with open('arxiv_cs_original.json', 'w', encoding='utf-8') as json_file:
    json.dump(arxiv_cs_original, json_file, ensure_ascii=False, indent=2)


print(f"Files saved: arxiv_cs_modified.pkl and arxiv_cs_original.json")

Files saved: arxiv_cs_modified.pkl and arxiv_cs_original.json


## make only strings (title+abstract)

In [5]:
# ## (Read the modified metadata; [source, title, abstract, time])
### (Make each article in string, under certain replacements)

def get_single_article_string(article):
    
    curr_title=article[1] #'title'
    curr_abstract=article[2] #'abstract'
    
    replace_pairs=[['\n',' '],['-',' '],[' \" a','oa'],['\" a','ae'],['\"a','ae'],[' \" o','oe'],['\" o','oe'],['\"o','oe'],[' \" u','ue'],
                   ['\" u','ue'],['\"u','ue'],[' \' a','a'],[' \' e','e'],[' \' o','o'],["\' ", ""],["\'", ""],['  ',' '],['  ',' ']]
    
    article_string=(curr_title +' '+ curr_abstract).lower()
    
    for rep_pair in replace_pairs:
        #print(rep_pair)
        
        article_string=article_string.replace(rep_pair[0],rep_pair[1])
        #print(article_string)
        #print('\n')
    
    return article_string


def get_all_paper_strings(article_lists, folder_file):

    if os.path.exists(os.path.join(folder_file,'arxiv_cs_paper_strings.pkl')):
        with open(os.path.join(folder_file,'arxiv_cs_paper_strings.pkl'), "rb") as f:
            arxiv_cs_paper_strings = pickle.load(f)
            
    else:
        all_paper_strings=[]
        cc=0
        for id_of_paper in range(len(article_lists)):
            cc+=1
            #if (cc%3000)==0:
                #print(str(cc)+'/'+str(len(article_lists)))

            all_paper_strings.append(get_single_article_string(article_lists[id_of_paper]))

        with open(os.path.join(folder_file,'arxiv_cs_paper_strings.pkl'), "wb") as f:
            pickle.dump(all_paper_strings, f)
    
    return all_paper_strings    


all_article_strings=get_all_paper_strings(arxiv_cs_modified,folder_file="E:\\study\\research\\create_gynamic_edge\\domain concepts")