##### improve domain concepts 

这段代码对整合后的领域概念库进行了最终的精炼和优化。首先加载并去重所有概念，然后应用多层次过滤规则：去除包含通用词汇（如"experiment"、"software"）的概念，过滤特定开头词（如"sophisticated"、"techniques"）和结尾词（如"illustrates"、"configuration"），根据概念长度应用差异化过滤条件，并移除包含无意义短语组合的概念。最后删除原始文件，将优化后的高质量概念库重新保存为文本和PKL格式，确保最终概念集合的专业性和实用性。

输入concept_seperate/all_concepts.pkl，包含所有合并后的概念列表，从34个分块概念文件整合而来。
all_concepts.pkl 
    → 去重 → modify_full_concept_list 
    → 多层过滤 → improve_full_concept_list 
    → 输出到 full_domain_concepts.txt 和 improved_concepts_form_openalex.pkl

In [1]:
import pickle
import os
import time
from datetime import datetime, date
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from rake_nltk import Metric, Rake
from collections import Counter
import re
from nltk.corpus import wordnet
import random


##### load, de-duplicate and store the concepts

In [2]:
# Load the merged concept list, drop duplicates and write one concept per
# line to the working text file used by the following cells.
concept_folder="concept_seperate"
all_concepts_file = os.path.join(concept_folder,'all_concepts.pkl')

# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling;
# only load files produced by this project's own pipeline.
with open(all_concepts_file, "rb") as input_file:
    all_concepts=pickle.load(input_file)

## remove repeated concepts
# sorted() makes the order (and therefore the text file) identical across
# kernel restarts; plain set iteration order changes with hash randomisation.
unique_concepts = sorted(set(all_concepts))

concepts_file='full_domain_concepts.txt' # rename 'full_concepts_form_openalex.txt'
# "w" (not "a"): re-running this cell, or a file left over from an earlier
# run, must not leave stale or duplicated lines in the working file.
with open(concepts_file, "w") as f:
    for concept in unique_concepts:
        f.write(concept+'\n')

##### read the concepts file

In [3]:
 
# Reload the de-duplicated concepts from the working text file, one per line.
# Nothing happens (and modify_full_concept_list is left untouched) when the
# file is missing.
if os.path.exists(concepts_file):
    with open(concepts_file, "r") as f:
        # rstrip() removes the trailing newline and any trailing whitespace.
        modify_full_concept_list = [raw_line.rstrip() for raw_line in f]

    # Timestamped progress line with the number of concepts read.
    now_time = datetime.now()
    formatted_time = now_time.strftime("%d-%m-%Y %H:%M:%S")
    concept_count = len(modify_full_concept_list)
    print("{}; Concepts: {:d} ".format(formatted_time, concept_count))

06-12-2025 13:09:04; Concepts: 45761 


##### filter concepts

In [4]:

starting_time = time.time()

# ---------------------------------------------------------------------------
# Word lists driving the filter. Matching is exact, per whitespace-separated
# word, unless stated otherwise.
# ---------------------------------------------------------------------------

# A concept is removed when ANY of its words is in this list.
filter_concept_any=['held','equal','dramatic','slowing','excited','occupied','charged','moving','layer','bi','argument','intuition','experiment','entirely','essentially','built','necessary','take','applicable','employ','visit','visited','herein','facilitates','varying','overlapping','addressed','issues','related','add','adds','dominant','preserve','preserves','preserved','stabilizing','match','manipulating','emerging','processed','data','continuously','analytically','argue','smoothly','connect','connects','connecting','software','matlab','toolbox','standard','industrial','technology','success','equipment','call','analogous','sense','persist','persists','throughout','calculated','useful','difficult','proved']

# A concept is removed when its FIRST word is in this list.
filter_concept_start=['sophisticated','precise','remarkably','consists','gradually','simplified','complete','techniques','partially','presented','iterative','simple','preparation','clear','priori','ae','substantial','sending','protecting','optimized','optimize','optimizing','transmits','transmit','transmitting','transmitted','processing','pre','collect','collected','measured','varied','operating','algorithms','algorithm','robustly','shall','concept','packing','successful','apparent','apparently','readily','adapted','todays','imperfect','seemingly','seeming','shelf','properties','mechanism','phenomenon','behavior','theorem','procedure','usual','form','later','calculating','fundamentally']

# A concept is removed when its LAST word is in this list.
filter_concept_end=['illustrates','setup','consisting','set','capable','configuration','complete','borrowed','permit','utilizes','referred','refer','capable','pave','stem','preparation','scheme','optimizes','transmitted','transmit','operating','relate','packed','packing','platform','industry','adapt','adapts','adapted','arrangement','era','device','arrange','arranged','content','procedure','outlined','form','formed','followed','following','calculation']


# NOTE: these two lists are defined but not consulted by the filter below.
concept_to_remove_pair=['self']
concept_to_keep_pair=['stabilization']

# Length-conditioned "any word" lists:
#   any5 -> applied to concepts with fewer than 5 words
#   any3 -> applied to concepts with at most 3 words
#   any2 -> applied to concepts with exactly 2 words
conditioned_filter_concept_any5=['open']
conditioned_filter_concept_any3=['driven','component']
conditioned_filter_concept_any2=['probe','inspired','technique','open','added','transfer','connected','element','exchange']

# First-word / last-word lists applied to two-word concepts only.
conditioned_filter_concept_start2=['doubly','probe']
conditioned_filter_concept_end2=[]

# Phrases matched as plain substrings of the whole concept string.
forbidden_continued_strings=['complete measurement','exact numerical','numerical technique','numerical method','complete set','pure entangled','quantum entangled','high fidelity']

# Pre-computed lookup sets (constant-time membership instead of list scans).
# The length-conditioned lists are cumulative: every rule that applies to
# longer concepts also applies to shorter ones.
_banned_any = frozenset(filter_concept_any)
_banned_under5 = _banned_any | frozenset(conditioned_filter_concept_any5)
_banned_upto3 = _banned_under5 | frozenset(conditioned_filter_concept_any3)
_banned_pairs = _banned_upto3 | frozenset(conditioned_filter_concept_any2)
_banned_start = frozenset(filter_concept_start)
_banned_end = frozenset(filter_concept_end)
_banned_start_pairs = frozenset(conditioned_filter_concept_start2)
_banned_end_pairs = frozenset(conditioned_filter_concept_end2)


def should_remove_concept(one_concept):
    """Return True when ``one_concept`` must be dropped from the concept list.

    Parameters
    ----------
    one_concept : str
        A concept made of whitespace-separated words.

    Returns
    -------
    bool
        True when any rule matches:
        * the concept is empty or whitespace-only;
        * any word is in the "any" list that applies to the concept's length;
        * the first word is in ``filter_concept_start`` or the last word is
          in ``filter_concept_end``;
        * the concept has exactly two words and its first/last word is in
          the corresponding two-word list;
        * any entry of ``forbidden_continued_strings`` occurs as a substring.
    """
    separated_words = one_concept.split()
    if not separated_words:
        # Blank line: nothing to keep, and indexing [0]/[-1] would fail.
        return True

    n_words = len(separated_words)
    if n_words == 2:
        banned_words = _banned_pairs
    elif n_words <= 3:      # 1 or 3 words
        banned_words = _banned_upto3
    elif n_words < 5:       # 4 words
        banned_words = _banned_under5
    else:
        banned_words = _banned_any

    if not banned_words.isdisjoint(separated_words):
        return True

    first_word, last_word = separated_words[0], separated_words[-1]
    if first_word in _banned_start or last_word in _banned_end:
        return True

    if n_words == 2 and (first_word in _banned_start_pairs or last_word in _banned_end_pairs):
        return True

    # Substring (not whole-word) match on the full concept string.
    return any(phrase in one_concept for phrase in forbidden_continued_strings)


# Keep the original order of the surviving concepts.
improve_full_concept_list = [
    one_concept for one_concept in modify_full_concept_list
    if not should_remove_concept(one_concept)
]

print("Concepts: {:d} ; Store: {:d}; Remove: {:d} ".format(len(modify_full_concept_list), len(improve_full_concept_list),len(modify_full_concept_list)-len(improve_full_concept_list)))
elapsed_time = time.time() - starting_time
print("Elapsed time: {:.2f} seconds".format(elapsed_time))

now_time =  datetime.now()
formatted_time = now_time.strftime("%d-%m-%Y %H:%M:%S")
print("{}; Concepts: {:d} ".format(formatted_time,len(improve_full_concept_list)))

Concepts: 45761 ; Store: 41409; Remove: 4352 
Elapsed time: 0.15 seconds
06-12-2025 13:09:12; Concepts: 41409 


##### rewrite the concepts file with the filtered concepts

In [5]:

# Replace the working text file with the filtered concepts, one per line.
# The original file (if any) is deleted first, then a fresh one is written.
file_existed = os.path.exists(concepts_file)
if file_existed:
    os.remove(concepts_file)
    print("txt has been deleted.")

# "w" guarantees the file holds exactly the filtered list; the with-block
# closes the handle even if a write fails.
with open(concepts_file, "w") as f:
    for concept in improve_full_concept_list:
        f.write(concept+'\n')

if file_existed:
    print("re-create text and store information.")
else:
    print("create text and store information.")

now_time = datetime.now()
formatted_time = now_time.strftime("%d-%m-%Y %H:%M:%S")
print("{}; Concepts: {:d} ".format(formatted_time,len(improve_full_concept_list)))


txt has been deleted.
re-create text and store information.
06-12-2025 13:09:19; Concepts: 41409 


##### additionally, store a pkl file (as a backup)

In [6]:
# Backup: serialise the filtered concept list to a pickle file in the
# current working directory.
concepts_path_pkl='improved_concepts_form_openalex.pkl'
with open(concepts_path_pkl, "wb") as pkl_handle:
    pickle.dump(improve_full_concept_list, pkl_handle)