In [2]:
import os, json
from tqdm import tqdm

In [9]:
def clean_entities(L1, unwanted=None):
    '''
    Normalise a list of entity strings.

    Every item has surrounding whitespace and unwanted punctuation trimmed
    from both ends. Items that end up empty are dropped, and duplicates are
    removed case-insensitively while keeping the first spelling encountered
    and the original order.

    Parameters
    ----------
    L1 : iterable of str
        Raw entity strings.
    unwanted : iterable of str, optional
        Characters to trim from the ends of every item. When omitted, the
        notebook-level `unwanted_characters` list is used.

    Returns
    -------
    list of str
        Cleaned, de-duplicated entities.
    '''
    if unwanted is None:
        # Notebook-level default; looked up at call time, so it only has to
        # exist before the first call, not before this definition.
        unwanted = unwanted_characters
    trim_chars = "".join(unwanted)

    seen = set()  # lower-cased items already emitted
    L2 = []
    for item in L1:
        # Alternate whitespace and punctuation trimming until nothing changes,
        # so mixed runs such as "+ -term- ;" are removed completely instead of
        # only the single character found at either end.
        previous = None
        while item != previous:
            previous = item
            item = item.strip().strip(trim_chars)
        if not item:  # Filter Empty Items
            continue
        lower_item = item.lower()
        if lower_item not in seen:  # Remove Duplicates (Case-insensitive)
            seen.add(lower_item)
            L2.append(item)

    return L2

def get_entity_dictionary(root,ID):
    '''
    Load the entity JSON of one paper.

    Reads `<root>/<ID>/<ID>.json` and returns the decoded content.
    Raises whatever `open` / `json.load` raise when the file is missing or
    is not valid JSON.
    '''
    json_path = os.path.join(root, ID, f"{ID}.json")
    # JSON is UTF-8 by specification; be explicit instead of relying on the
    # platform's locale-dependent default encoding.
    with open(json_path, 'r', encoding="utf-8") as file:
        return json.load(file)

def get_entities_from_relative_path(entity_dict,relative_path):
    '''
    Look up the entities stored for one paragraph file.

    `relative_path` is a path relative to the paper folder, for example
    "Body/Results/paragraph_6.txt" or "Abstract/paragraph_2.txt". Each folder
    name is used as a key into the nested `entity_dict`, and the file name
    without its extension is the final key.

    Raises KeyError when the path has no matching entry in `entity_dict`.
    '''
    # The relative paths are produced with os.path.join, so accept the
    # Windows separator as well as "/".
    parts = relative_path.replace("\\", "/").split("/")
    paragraph_key = os.path.splitext(parts[-1])[0]  # "paragraph_6.txt" -> "paragraph_6"

    node = entity_dict
    for folder in parts[:-1]:
        node = node[folder]
    return node[paragraph_key]

def find_missing_JSON(root):
    '''
    Split the paper folders under `root` by whether they contain their JSON.

    A paper folder `<root>/<ID>` counts as having a JSON when the file
    `<root>/<ID>/<ID>.json` exists.

    Returns
    -------
    (IDS_with_JSON, IDS_without_JSON) : tuple of two lists of str
        Folder names, in the order `os.listdir` reports them.
    '''
    IDS_with_JSON, IDS_without_JSON = [], []
    for ID in os.listdir(root):
        paper_path = os.path.join(root, ID)
        # Stray files next to the paper folders (e.g. ".DS_Store") are not
        # papers; listing them as directories would raise.
        if not os.path.isdir(paper_path):
            continue
        if os.path.isfile(os.path.join(paper_path, f"{ID}.json")):
            IDS_with_JSON.append(ID)
        else:
            IDS_without_JSON.append(ID)
    return IDS_with_JSON, IDS_without_JSON

def get_abstract_paths(root,paper_id):
    '''
    Build the paths of every abstract paragraph file of one paper.

    The entries of `<root>/<paper_id>/Abstract` whose name does not contain
    "ipynb" are counted, and paths named paragraph_1.txt .. paragraph_<count>.txt
    are generated in numeric order (the directory listing itself is only used
    for the count).

    Returns a pair of lists: the paths including `root`/`paper_id`, and the
    same paths relative to the paper folder (starting at "Abstract").
    '''
    abstract_dir = os.path.join(root, paper_id, 'Abstract')
    paragraph_count = sum(1 for entry in os.listdir(abstract_dir) if "ipynb" not in entry)

    file_names = [f"paragraph_{number}.txt" for number in range(1, paragraph_count + 1)]
    para_paths = [os.path.join(abstract_dir, name) for name in file_names]
    relative_para_paths = [os.path.join('Abstract', name) for name in file_names]
    return para_paths, relative_para_paths

def get_body_paths(root,paper_id):
    '''
    Build the paths of every body paragraph file of one paper.

    Each sub-folder of `<root>/<paper_id>/Body` is treated as a section. For
    every section, the entries whose name does not contain "ipynb" are
    counted, and paths named paragraph_1.txt .. paragraph_<count>.txt are
    generated in numeric order.

    Entries of `Body` that are not directories, or whose name contains
    "ipynb" (notebook checkpoint folders), are skipped so they are not
    mistaken for sections.

    Returns a pair of lists: the paths including `root`/`paper_id`, and the
    same paths relative to the paper folder (starting at "Body"). Sections
    appear in the order `os.listdir` reports them.
    '''
    body_path = os.path.join(root, paper_id, 'Body')

    para_paths, relative_para_paths = [], []
    for section_name in os.listdir(body_path):
        section = os.path.join(body_path, section_name)
        # Same "ipynb" filter the paragraph level uses, plus a guard against
        # stray files that os.listdir could not descend into.
        if "ipynb" in section_name or not os.path.isdir(section):
            continue
        relative_section = os.path.join('Body', section_name)

        paragraph_count = sum(1 for entry in os.listdir(section) if "ipynb" not in entry)
        for number in range(1, paragraph_count + 1):
            file_name = f"paragraph_{number}.txt"
            para_paths.append(os.path.join(section, file_name))
            relative_para_paths.append(os.path.join(relative_section, file_name))

    return para_paths,relative_para_paths

In [4]:
# unwanted_characters = list("+-=;;.,?|<>")
# original_list = ["autism","biological sex","autism","autism","autism","neurobiology",
#       "autism","biological base","autism","autism","biological sex",
#       "neuroanatomy","autism","autism","autism","adults","spatial",
#       "neuroanatomy",
#       "sexually dimorphic","neurobiology","autism","measurement","+",
#       "Alexithymia",
#       "+","= grey matter +","Autism","autism", "ACC = something"]
# clean_list = clean_entities(original_list)
# clean_list

In [7]:
'''
Below code takes a backup of all existing JSON files
'''
# NOTE: the backup loop at the bottom of this cell is commented out, so as
# written the cell only sets the configuration values and shows which paper
# folders have no JSON file.
root = "../OUTPUT_FOLDER"  # folder that holds one sub-folder per paper ID
unwanted_characters = list("+-=;;.,?|<>")  # read by clean_entities; the repeated ';' is harmless
# JSON_FOLDER = "ALL_JSON_files"
# os.makedirs(JSON_FOLDER, exist_ok=True)

# Split the paper IDs by whether <root>/<ID>/<ID>.json is present.
IDS_with_JSON, IDS_without_JSON = find_missing_JSON(root)
# Last expression of the cell: displays the IDs that lack a JSON file.
IDS_without_JSON

# for ID in IDS_with_JSON[:2]:
#     entity_json = get_entity_dictionary(root,ID)
#     #print(entity_json)
#     with open(F"{ID}_A.json",'w') as f:
#         json.dump(entity_json,f)

['2229370', '6086934']

In [26]:
# Clean the entity lists of every paper that has a JSON file and write the
# result to CORRECT_JSON_FILES/<ID>_B.json. A failure on one paper is appended
# to errors_while_entity_cleaning.txt as "<ID>|<error>" and the loop moves on.

# Without the output folder every write below would fail and merely be logged.
os.makedirs("CORRECT_JSON_FILES", exist_ok=True)

for ID in tqdm(IDS_with_JSON):
    try:
        entity_dict = get_entity_dictionary(root,ID) # original entities dictionary
        relative_abstract_paths = get_abstract_paths(root,ID)[1]
        relative_body_paths = get_body_paths(root,ID)[1]

        # Abstract and body paragraphs are handled identically: every folder
        # in the relative path is a key into the nested dictionary and the
        # file name minus ".txt" is the final key.
        for relative_path in relative_abstract_paths + relative_body_paths:
            # Paths come from os.path.join, so accept "\\" as well as "/".
            parts = relative_path.replace("\\", "/").split("/")
            paragraph_key = parts[-1][:-4]
            container = entity_dict
            for folder in parts[:-1]:
                container = container[folder]
            container[paragraph_key] = clean_entities(container[paragraph_key])

        with open(F"CORRECT_JSON_FILES/{ID}_B.json",'w') as f:
            json.dump(entity_dict, f)
    except Exception as e:
        # Deliberate best effort: record the failing paper and keep going.
        with open("errors_while_entity_cleaning.txt",'a') as file:
            file.write(f"{ID}|{e}\n")
    

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 107.32it/s]


In [27]:
# Names currently present in the output folder.
curr = os.listdir("CORRECT_JSON_FILES")
# Keep the text before the first underscore of each name
# (the paper ID for files written as "<ID>_B.json").
IDS = [file_name.partition("_")[0] for file_name in curr]
set1 = set(IDS)

In [28]:
# Entry names found directly under the input root.
orig = os.listdir(root)
set2 = {*orig}

In [29]:
# First element: IDs only in the output folder; second: names only under the input root.
(set1 - set2, set2 - set1)

(set(), {'2229370', '6086934'})