### load all the processed preprint papers 

In [3]:
import os
import pickle

if os.path.exists('all_paper_info_lists.pkl'):
    # open the existing pickle file for reading
    with open('all_paper_info_lists.pkl', 'rb') as f:
        all_paper_lists = pickle.load(f)

### put title and abstract together, store in to string list

文本合并：标题 + 空格 + 摘要

大小写统一：全部转为小写字母

字符清洗：

去除换行符和连字符

处理德语变音符号（ä→ae, ö→oe, ü→ue）

移除所有引号和撇号

规范化空格（多个空格合并为单个空格）

In [4]:
def get_single_article_string(article):
    
    curr_title=article[1] #'title'
    curr_abstract=article[2] #'abstract'
    
    replace_pairs=[['\n',' '],['-',' '],[' \" a','oa'],['\" a','ae'],['\"a','ae'],[' \" o','oe'],['\" o','oe'],['\"o','oe'],[' \" u','ue'],
                   ['\" u','ue'],['\"u','ue'],[' \' a','a'],[' \' e','e'],[' \' o','o'],["\' ", ""],["\'", ""],['  ',' '],['  ',' ']]
    
    article_string=(curr_title +' '+ curr_abstract).lower()
    
    for rep_pair in replace_pairs:
        #print(rep_pair)
        
        article_string=article_string.replace(rep_pair[0],rep_pair[1])
        #print(article_string)
        #print('\n')
    
    return article_string

def get_all_paper_strings(article_lists):

    if os.path.exists('all_paper_string_lists.pkl'):
        with open("all_paper_string_lists.pkl", "rb") as f:
            all_paper_strings = pickle.load(f)
            
    else:
        all_paper_strings=[]
        cc=0
        for id_of_paper in range(len(article_lists)):
            cc+=1
            if (cc%300000)==0:
                print(str(cc)+'/'+str(len(article_lists)))

            all_paper_strings.append(get_single_article_string(article_lists[id_of_paper]))

        with open("all_paper_string_lists.pkl", "wb") as f:
            pickle.dump(all_paper_strings, f)
    
    return all_paper_strings



all_article_strings=get_all_paper_strings(all_paper_lists)

### Get Concepts from RAKE

nltk.corpus.stopwords.words('english') 需要下载停用词数据，但系统中没有这个资源。

在运行代码之前，先执行以下命令下载必要的NLTK数据：

In [5]:
import nltk

# 下载停用词数据
nltk.download('stopwords')

# 下载词形还原器需要的数据
nltk.download('wordnet')
nltk.download('omw-1.4')

# 下载分词需要的数据
nltk.download('punkt')

print("NLTK数据下载完成！")

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     UNEXPECTED_EOF_WHILE_READING] EOF occurred in
[nltk_data]     violation of protocol (_ssl.c:1000)>
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\张士涵\AppData\Roaming\nltk_data...
[nltk_data] Error downloading 'wordnet' from
[nltk_data]     <https://raw.githubusercontent.com/nltk/nltk_data/gh-
[nltk_data]     pages/packages/corpora/wordnet.zip>:   <urlopen error
[nltk_data]     [WinError 10054] 远程主机强迫关闭了一个现有的连接。>
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\张士涵\AppData\Roaming\nltk_data...


NLTK数据下载完成！


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\张士涵\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


RAKE算法需要NLTK的punkt分词器来分割句子，但缺少punkt_tab语言包。
下载所有必要的NLTK数据

In [6]:
import nltk

# 下载所有必要的NLTK数据包
required_packages = [
    'stopwords',      # 停用词
    'wordnet',        # 词形还原
    'omw-1.4',        # 多语言词网
    'punkt',          # 基础分词器
    'punkt_tab'       # 分词器语言包（解决当前错误）
]

print("开始下载NLTK必要数据包...")
for package in required_packages:
    try:
        nltk.download(package)
        print(f"已下载: {package}")
    except Exception as e:
        print(f"下载 {package} 失败: {e}")

print("NLTK数据下载完成！")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\张士涵\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\张士涵\AppData\Roaming\nltk_data...


开始下载NLTK必要数据包...
已下载: stopwords
已下载: wordnet


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\张士涵\AppData\Roaming\nltk_data...


KeyboardInterrupt: 

使用RAKE算法从所有论文文本中批量提取论文标题和摘要中的关键概念短语，使用停用词作为分隔符，将文本分割成候选短语

In [7]:
import time
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from rake_nltk import Metric, Rake
from collections import Counter

starting_time = time.time()
 
wnl=WordNetLemmatizer()

num_of_abstracts=len(all_paper_lists)

personal_stop_list=['presents','us','show','one','two','three','describes','new','approach','many','introduces','http','also','whose', 'prove','select ','take']

nltk_stop_list=nltk.corpus.stopwords.words('english')
full_stop_list=nltk_stop_list + personal_stop_list


all_concepts_from_rake=[]
cc=0
for id_of_abstract in range(num_of_abstracts):
    cc+=1
    if (cc%100000)==0:
        print(str(cc)+'/'+str(num_of_abstracts))
    
            
    single_string = get_single_article_string(all_paper_lists[id_of_abstract])
    
    r = Rake(stopwords=full_stop_list, ranking_metric=Metric.WORD_DEGREE, min_length=2, include_repeated_phrases=False)

    r.extract_keywords_from_text(single_string)
    ll=r.get_ranked_phrases_with_scores()
    
    all_concepts_from_rake.extend(ll)


    


100000/2848279
200000/2848279
300000/2848279
400000/2848279
500000/2848279
600000/2848279
700000/2848279
800000/2848279
900000/2848279
1000000/2848279
1100000/2848279
1200000/2848279
1300000/2848279
1400000/2848279
1500000/2848279
1600000/2848279
1700000/2848279
1800000/2848279
1900000/2848279
2000000/2848279
2100000/2848279
2200000/2848279
2300000/2848279
2400000/2848279
2500000/2848279
2600000/2848279
2700000/2848279
2800000/2848279
