In [1]:
import pandas as pd
import time
import sys
import spacy
from datetime import datetime
import os

In [None]:
# Example run: load the 12m-13m slice of the abstracts dump.
# The full file has about 18.8m rows and can be downloaded from:
# http://abel.lis.illinois.edu/data/abstracts2018.tsv.gz
skip_rows = 12_000_001
n_rows_to_read = 1_000_000
column_names = ['PMID', 'title', 'abstract']

start_time = time.time()
abs_df = pd.read_csv(
    'abstracts2018.tsv.gz',
    sep='\t',
    compression='gzip',
    names=column_names,
    skiprows=skip_rows,
    nrows=n_rows_to_read,
    on_bad_lines='warn',
)
end_time = time.time()
print(f"processing time：{end_time - start_time} seconds")

In [None]:
import spacy
import pandas as pd
import time
import sys

nlp = spacy.load("en_core_web_sm")

# Progress is reported every `report_every` rows, as a percentage of the
# rows that were actually loaded (not of an assumed 1M-row input).
total_rows = len(abs_df)
report_every = 10000
temp, start_time = time.time(), time.time()

# One dict per sentence that contains at least one GPE entity.
results = []

# Only the PMID and abstract columns are needed, so iterate over them
# directly instead of materialising a Series per row with iterrows().
for count, (pmid, abstract) in enumerate(zip(abs_df['PMID'], abs_df['abstract'])):

    if count % report_every == 0:
        cost = time.time() - temp
        now = datetime.now().strftime("%H:%M:%S")
        percent_done = count * 100 // total_rows
        # NOTE: sys.getsizeof is shallow -- it measures the list object only,
        # not the dicts and strings it references.
        print(f"Current time is: {now}. {percent_done}% done. {count} rows processed, time taken: {round(cost)} seconds. Size of data is {round(sys.getsizeof(results)/1024/1024,2)}MB")
        temp = time.time()

    if isinstance(abstract, str):
        doc = nlp(abstract)
        sentences = list(doc.sents)
        total_sentences = len(sentences)

        for i, sent in enumerate(sentences):
            gpe_entities = [ent.text for ent in sent.ents if ent.label_ == "GPE"]
            if gpe_entities:
                results.append({
                    'PMID': pmid,
                    'total_sentences': total_sentences,
                    'sentence_order': i + 1,  # 1-based position within the abstract
                    'sentence': sent.text,
                    'candidates': gpe_entities
                })
    else:
        # Missing abstracts are read as NaN (float); report and skip them.
        print(f"Non-string value encountered at index {count}: {abstract}")

result_df = pd.DataFrame(results)

end_time = time.time()
print(f"Processing completed in {round((end_time - start_time)/3600,2)} hours.")
# The file name uses the hyphenated form that the combine step lists for
# this slice ('new_candidates_12m-13m.xlsx').
result_df.to_excel('new_candidates_12m-13m.xlsx')

In [None]:
import pandas as pd

directory = ''

# combine all generated data
# replace the next line by your file names
file_order = [
    'new_candidates_0-1m.xlsx',
    'new_candidates_1m-2m5.xlsx',
    'new_candidates_2m5-4m.xlsx',
    'new_candidates_4m-6m.xlsx',
    'new_candidates_6m-7m5.xlsx',
    'new_candidates_7m5-9m.xlsx',
    'new_candidates_9m-10m.xlsx',
    'new_candidates_10m-11m.xlsx',
    'new_candidates_11m-12m.xlsx',
    'new_candidates_12m-13m.xlsx',
    'new_candidates_13m_14m.xlsx',
    'new_candidates_14m-15m5_1.xlsx',
    'new_candidates_14m-15m5_2.xlsx',
    'new_candidates_15m5-17m_1.xlsx',
    'new_candidates_15m5-17m_2.xlsx',
    'new_candidates_17m-18m.xlsx',
    'new_candidates_18m-19m.xlsx',
]

# Read every part first and concatenate once: calling pd.concat inside the
# loop re-copies all previously loaded rows on each iteration.
frames = []
for filename in file_order:
    print(filename)
    file_path = os.path.join(directory, filename)
    frames.append(pd.read_excel(file_path))

# pd.concat raises on an empty list, so fall back to an empty frame.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# "Unnamed: 0" is the row index that to_excel wrote alongside the data.
if "Unnamed: 0" in all_data.columns:
    all_data = all_data.drop(columns="Unnamed: 0")

In [None]:
# Persist the combined candidates as a tab-separated file, without the row index.
all_data.to_csv(path_or_buf='all_candidates.tsv', index=False, sep='\t')

In [None]:
# Reload the combined candidates. The file name matches the one written by
# the save step ('all_candidates.tsv'); the previous '.txt' name is never
# produced by this notebook.
all_data = pd.read_csv('all_candidates.tsv', sep='\t')

In [None]:
# Keep the 2014-2018 portion only: per the original note, it starts at row
# position 7063577 (PMID=24267737).
# NOTE: this rebinds `all_data`, so running the cell twice slices twice.
all_data = all_data.iloc[7063577:].reset_index(drop=True)
all_data

In [None]:
import stanza
import nltk
# NLTK resource, presumably the model behind nltk.pos_tag.
nltk.download('averaged_perceptron_tagger')

# Stanza English models plus a tokenize+NER pipeline used by
# extract_entities_stanza.
stanza.download('en')
nlp_stanza = stanza.Pipeline(lang='en', processors='tokenize,ner')

# Remaining NLTK resources (presumably for ne_chunk and word_tokenize).
for resource in ('maxent_ne_chunker', 'words', 'punkt'):
    nltk.download(resource)

In [None]:
def extract_entities_stanza(sentence):
    """Return the text of every GPE entity Stanza finds in `sentence`.

    Runs the module-level `nlp_stanza` pipeline and walks all sentences of
    the resulting document, keeping entities whose type is 'GPE'.
    """
    parsed = nlp_stanza(sentence)
    gpe_mentions = []
    for stanza_sentence in parsed.sentences:
        for entity in stanza_sentence.ents:
            if entity.type == 'GPE':
                gpe_mentions.append(entity.text)
    return gpe_mentions

def extract_entities_nltk(sentence):
    """Return the GPE chunks NLTK finds in `sentence`, one string per chunk.

    The sentence is tokenized, POS-tagged and passed to `nltk.ne_chunk`;
    the words of each chunk labelled 'GPE' are joined with single spaces.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    gpe_mentions = []
    for node in nltk.ne_chunk(tagged_tokens):
        # Items without a `label` attribute are not named-entity chunks.
        if not hasattr(node, 'label'):
            continue
        if node.label() != 'GPE':
            continue
        gpe_mentions.append(' '.join(leaf[0] for leaf in node.leaves()))
    return gpe_mentions

def extract_entities_transformers(sentence):
    """Return the 'word' of every prediction tagged 'I-LOC' or 'B-LOC'.

    NOTE(review): `nlp_transformers` is not defined in the visible part of
    this notebook; it must be created before this function is called.
    """
    location_tags = ('I-LOC', 'B-LOC')
    location_words = []
    for prediction in nlp_transformers(sentence):
        if prediction['entity'] in location_tags:
            location_words.append(prediction['word'])
    return location_words

def extract_entities_flair(sentence):
    """Return the text of every 'ner' span tagged 'LOC' in `sentence`.

    NOTE(review): `Sentence` and `tagger` are not defined in the visible
    part of this notebook; they must exist before this function is called.
    """
    wrapped_sentence = Sentence(sentence)
    tagger.predict(wrapped_sentence)
    location_spans = []
    for span in wrapped_sentence.get_spans('ner'):
        if span.tag == 'LOC':
            location_spans.append(span.text)
    return location_spans

In [None]:
import pandas as pd
import time


# Work on rows 2,500,000-3,499,999 of `all_data`, dropping rows without a
# sentence and renumbering from 0. (The former index-only placeholder frame
# was overwritten immediately and has been removed.)
test_data = (
    all_data[2500000:3500000]
    .copy()
    .dropna(subset=['sentence'])
    .reset_index(drop=True)
)

# Sanity check: this should display an empty frame after the dropna above.
test_data[test_data["sentence"].isna()]

In [None]:
# Create the output column up front, filled with None, so process_data can
# store one list per row via `.at`.
column = 'stanza_candidates'
test_data[column] = None

def process_data(source_data, target_data):
    """Run Stanza GPE extraction on every sentence and store the results.

    Args:
        source_data: DataFrame with a 'sentence' column to read from.
        target_data: DataFrame that receives the extracted entity lists in
            its 'stanza_candidates' column, at the same index labels as
            `source_data`. May be the same object as `source_data`.

    Returns:
        `target_data`, modified in place.
    """
    # Imported locally: the notebook-level `from tqdm import tqdm` lives in
    # a later cell, so a fresh top-to-bottom run would otherwise fail here
    # with a NameError.
    from tqdm import tqdm

    total_rows = len(source_data)

    # Only the sentence column is needed, so iterate over it directly
    # instead of building a full row Series for every record.
    sentences = source_data['sentence'].items()
    for index, sentence in tqdm(sentences, total=total_rows, desc="Processing"):
        target_data.at[index, 'stanza_candidates'] = extract_entities_stanza(sentence)

    return target_data

# Read sentences from test_data and write the Stanza results back into it.
test_data = process_data(source_data=test_data, target_data=test_data)

In [None]:
import ast
import pandas as pd
from difflib import SequenceMatcher
from tqdm import tqdm

def find_longest_common_substring(str1, str2):
    """Find the longest common substring between two strings.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        The longest contiguous block shared by both strings (as it appears
        in `str1`), or None when they share no characters.
    """
    # autojunk=False: SequenceMatcher's default heuristic treats frequent
    # characters as junk once the second string has 200+ characters, which
    # can hide or shorten the real longest match.
    sequence_matcher = SequenceMatcher(None, str1, str2, autojunk=False)
    match = sequence_matcher.find_longest_match(0, len(str1), 0, len(str2))
    if match.size == 0:
        return None
    return str1[match.a: match.a + match.size]

def common_substring_in_lists(list1, list2):
    """Return the distinct longest common substrings of all string pairs.

    Every string of `list1` is compared with every string of `list2` using
    `find_longest_common_substring`; each non-empty result is kept once.

    Args:
        list1: Iterable of strings.
        list2: Iterable of strings.

    Returns:
        List of unique substrings in first-seen order. The order is
        deterministic, unlike a list built from a set of strings, which can
        change between interpreter sessions.
    """
    # A dict is used as an insertion-ordered set.
    common_substrings = {}
    for str1 in list1:
        for str2 in list2:
            substring = find_longest_common_substring(str1, str2)
            if substring:
                common_substrings.setdefault(substring, None)
    return list(common_substrings)

def process_row(row):
    """Compute the entities shared by the two candidate columns of a row.

    'candidates' and 'stanza_candidates' may each hold either a value that
    is used as-is or its string representation; strings are parsed with
    `ast.literal_eval`.

    Returns:
        A one-element Series with key 'bagging_result1' holding the result
        of `common_substring_in_lists` for the two candidate lists.
    """
    def _parse(value):
        # Values read back from a file arrive as strings such as "['Paris']".
        if isinstance(value, str):
            return ast.literal_eval(value)
        return value

    candidates = _parse(row['candidates'])
    stanza_candidates = _parse(row['stanza_candidates'])

    # Entities both sources agree on (bagging_result1).
    shared_entities = common_substring_in_lists(candidates, stanza_candidates)
    return pd.Series({'bagging_result1': shared_entities})

# Assuming test_data is a DataFrame with the mentioned columns.
# Apply process_row to every row, advancing the progress bar once per row
# (previously the bar was updated a single time, after all rows were done).
with tqdm(total=len(test_data), desc="Processing rows") as pbar:
    def _process_row_with_progress(row):
        """Run process_row on one row and advance the progress bar."""
        row_result = process_row(row)
        pbar.update(1)
        return row_result

    results = test_data.apply(_process_row_with_progress, axis=1)

# Drop result columns left over from an earlier run so that re-executing
# this cell does not fail with a column-overlap error in join().
test_data = test_data.drop(columns=results.columns, errors='ignore').join(results)

print("Processing complete!")


In [None]:
import pandas as pd

def filter_results(results):
    """Keep entries longer than one character that contain an uppercase letter.

    Args:
        results: Iterable of strings.

    Returns:
        A new list with the entries that pass both checks, in input order.
    """
    kept = []
    for entry in results:
        if len(entry) <= 1:
            continue
        if not any(char.isupper() for char in entry):
            continue
        kept.append(entry)
    return kept

# Filter each row's shared-entity list element-wise.
test_data['bagging_result_filter'] = test_data['bagging_result1'].map(filter_results)

In [None]:
from tqdm import tqdm  #
import pandas as pd
import ast


def safe_eval_literal(string):
    """Parse a Python literal, returning [] when it cannot be parsed.

    Args:
        string: Text such as "['Paris', 'Rome']". Non-string values
            (e.g. NaN or None) are rejected by `ast.literal_eval` and also
            yield [].

    Returns:
        The parsed literal, or an empty list on failure.
    """
    try:
        return ast.literal_eval(string)
    # literal_eval is documented to raise any of these on malformed input
    # (e.g. TypeError for an unhashable dict key such as "{[1]: 2}").
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []

def split_rows(row):
    """Expand one row into one dict per detected entity.

    If 'bagging_result_filter' is non-empty, the row gets category 1 and is
    split on those entries; otherwise it gets category 0 and is split on
    'candidates'. In each output dict the column that was split holds a
    single-item list; all other columns are copied unchanged. When there is
    nothing to split, a single dict with category 0 is returned.

    Args:
        row: A pandas Series (anything offering `[...]` and `to_dict()`).

    Returns:
        List of dicts: the row's columns plus a 'category' key.
    """
    def _as_list(value):
        # Exact-type check: only real lists are used as-is, anything else
        # goes through the literal parser.
        if type(value) == list:
            return value
        return safe_eval_literal(value)

    bagging_result_filter = _as_list(row['bagging_result_filter'])
    candidates = _as_list(row['candidates'])

    category = 1 if bagging_result_filter else 0
    items_to_split = bagging_result_filter or candidates

    if not items_to_split:
        unsplit_row = row.to_dict()
        unsplit_row['category'] = category
        return [unsplit_row]

    # Only the column the row is split on is replaced by a one-item list.
    split_column = 'bagging_result_filter' if category else 'candidates'

    new_rows = []
    for item in items_to_split:
        new_row = row.to_dict()
        new_row['category'] = category
        new_row[split_column] = [item]
        new_rows.append(new_row)
    return new_rows

# Flatten the per-row expansions into one list of dicts, then build the
# frame in a single step.
row_iterator = tqdm(test_data.iterrows(), total=len(test_data), desc="Processing Rows")
expanded_data = [
    new_row
    for _, source_row in row_iterator
    for new_row in split_rows(source_row)
]

expanded_df = pd.DataFrame(expanded_data).reset_index(drop=True)

expanded_df.head()


In [None]:
def create_detected_candidate(row):
    """Return the row's detected entity as a single string.

    Reads 'bagging_result_filter' when the row's category is 1 and
    'candidates' otherwise. A list value is joined with single spaces; any
    other value is returned unchanged.
    """
    source_column = 'bagging_result_filter' if row['category'] == 1 else 'candidates'
    candidate = row[source_column]
    if isinstance(candidate, list):
        return ' '.join(candidate)
    return candidate

# One detected-entity string per expanded row.
expanded_df['detected_candidate'] = expanded_df.apply(create_detected_candidate, axis='columns')

def tag_sentence(row):
    """Wrap every occurrence of the detected candidate in location markers.

    Args:
        row: Mapping/Series with 'sentence' and 'detected_candidate'.

    Returns:
        The sentence with each occurrence of the candidate replaced by
        "[locB] <candidate> [locE]". The sentence is returned unchanged when
        the candidate is empty or not a string.
    """
    sentence = row['sentence']
    candidate = row['detected_candidate']
    # str.replace('') would insert the markers between every character, and
    # a non-string candidate (e.g. NaN) would raise; leave such rows as-is.
    if not isinstance(candidate, str) or not candidate:
        return sentence
    return sentence.replace(candidate, f"[locB] {candidate} [locE]")

# Sentence text with the detected entity wrapped in [locB] ... [locE].
expanded_df['tagged_sentences'] = expanded_df.apply(tag_sentence, axis='columns')

In [None]:
# Write the tagged rows as a tab-separated file, without the row index.
expanded_df.to_csv(path_or_buf='test_bagging_res_tagged.tsv', index=False, sep='\t')