In [1]:
import pandas as pd
import re

# Load CSV data; the 'Title' and 'Abstract' columns are used below
df = pd.read_csv("collection_with_abstracts.csv")

# Define keyword categories based on provided terms.
# Each category is a regex alternation wrapped in one capture group. The
# patterns are searched case-insensitively and, unless a term carries explicit
# \b anchors, they match as plain substrings of the text.
virology_terms = r"(virology|epidemiology)"
nn_terms = r"(neural network|artificial neural network|machine learning model|feedforward neural network|neural net algorithm|multilayer perceptron|convolutional neural network|recurrent neural network|long short-term memory network|CNN|GRNN|RNN|LSTM)"
deep_learning_terms = r"(deep learning|deep neural networks)"
computer_vision_terms = r"(computer vision|vision model|image processing|vision algorithms|computer graphics and vision|object recognition|scene understanding)"
text_mining_terms = r"(natural language processing|text mining|NLP|computational linguistics|language processing|text analytics|textual data analysis|text data analysis|text analysis|speech and language technology|language modeling|computational semantics)"
generative_ai_terms = r"(generative artificial intelligence|generative AI|generative deep learning|generative models)"
transformer_terms = r"(transformer models|self-attention models|transformer architecture|transformer|attention-based neural networks|transformer networks|sequence-to-sequence models)"
# "LLM" is anchored with word boundaries (optional plural "s"): a bare,
# case-insensitive "LLM" also matches inside everyday words such as
# "enrollment" or "fulfillment", which would wrongly flag those papers.
large_language_model_terms = r"(large language model|\bLLMs?\b|transformer-based model|pretrained language model|generative language model|foundation model|state-of-the-art language model)"
# NOTE(review): "diffusion model" can also match non-ML uses of the phrase
# (e.g. reaction-diffusion or adoption-diffusion models) - confirm whether
# those hits are wanted.
multimodal_terms = r"(multimodal model|multimodal neural network|vision transformer|diffusion model|generative diffusion model|diffusion-based generative model|continuous diffusion model)"

# Combine Title and Abstract columns into a single text column for filtering;
# missing values become empty strings so the concatenation never yields NaN
df['Combined_Text'] = df['Title'].fillna('') + ' ' + df['Abstract'].fillna('')

def is_relevant(abstract):
    """
    Decide whether a paper's text passes the keyword relevance filter.

    A text is relevant when it mentions a virology/epidemiology term AND at
    least one term from any of the method keyword categories. All searches
    are case-insensitive.

    Args:
        abstract: The text to screen, as a string.

    Returns:
        True if both conditions hold, otherwise False.
    """
    # Without a domain term nothing else matters, so bail out early
    if re.search(virology_terms, abstract, re.IGNORECASE) is None:
        return False

    method_patterns = (
        nn_terms,
        deep_learning_terms,
        computer_vision_terms,
        text_mining_terms,
        generative_ai_terms,
        transformer_terms,
        large_language_model_terms,
        multimodal_terms,
    )
    # any() stops at the first category that matches
    return any(
        re.search(pattern, abstract, re.IGNORECASE) is not None
        for pattern in method_patterns
    )

# Flag every row whose combined title/abstract text passes the keyword filter
df['Relevant'] = df['Combined_Text'].apply(
    lambda combined: is_relevant(str(combined))
)

# Keep an independent copy of only the flagged rows
relevant_df = df.loc[df['Relevant'] == True].copy()

# Classify relevant papers by method type
def classify_method(text):
    """
    Label a paper by which method families its text mentions.

    Args:
        text: The text to inspect, as a string.

    Returns:
        "both" when text-mining and computer-vision terms are both present,
        "text mining" or "computer vision" when only one family is present,
        and "other" when neither is found. Matching is case-insensitive.
    """
    mentions_text_mining = re.search(text_mining_terms, text, re.IGNORECASE) is not None
    mentions_vision = re.search(computer_vision_terms, text, re.IGNORECASE) is not None

    # Lookup keyed on (text mining found, computer vision found)
    label_by_flags = {
        (True, True): "both",
        (True, False): "text mining",
        (False, True): "computer vision",
        (False, False): "other",
    }
    return label_by_flags[(mentions_text_mining, mentions_vision)]

# Label each relevant paper with its method category
relevant_df['Method Type'] = relevant_df['Combined_Text'].apply(
    lambda combined: classify_method(str(combined))
)

# Extract deep learning methods for each relevant paper
def extract_methods(text, patterns=None):
    """
    Extracts the method keywords mentioned in a paper's text.

    Keywords are reported in order of first appearance. Spellings that differ
    only by letter case (e.g. "CNN" and "cnn") are reported once, using the
    spelling seen first, so the result is identical from run to run.

    Args:
        text: The text to scan, as a string.
        patterns: Optional iterable of regex strings to search for. When
            omitted, the module-level method keyword categories are used.

    Returns:
        The matched keywords joined with ", ", or an empty string when
        nothing matches.
    """
    if patterns is None:
        patterns = (
            nn_terms,
            deep_learning_terms,
            computer_vision_terms,
            text_mining_terms,
            generative_ai_terms,
            transformer_terms,
            large_language_model_terms,
            multimodal_terms,
        )
    combined = re.compile("|".join(patterns), re.IGNORECASE)

    # Maps case-folded keyword -> first spelling seen; dict insertion order
    # supplies the stable first-appearance ordering.
    first_spelling = {}
    for match in combined.finditer(text):
        if combined.groups:
            # Only the alternative that actually matched has a non-empty group
            captured = [group for group in match.groups() if group]
        else:
            # Pattern without capture groups: fall back to the whole match
            captured = [match.group(0)] if match.group(0) else []
        for method in captured:
            first_spelling.setdefault(method.casefold(), method)
    return ", ".join(first_spelling.values())

# Apply method extraction to each relevant paper
relevant_df['Extracted Methods'] = relevant_df['Combined_Text'].apply(lambda x: extract_methods(str(x)))

# Save the filtered and classified data to an Excel workbook (.xlsx), without the index column
relevant_df.to_excel("keyword_filtered_papers.xlsx", index=False)
# Bare trailing expression so the notebook cell displays the resulting frame.
# NOTE(review): the trailing numbers look like row counts noted from earlier runs - confirm.
relevant_df #7610, 7468, 507

Unnamed: 0,PMID,Title,Authors,Citation,First Author,Journal/Book,Publication Year,Create Date,PMCID,NIHMS ID,DOI,Abstract,Combined_Text,Relevant,Method Type,Extracted Methods
67,38721977,Chest CT-based automated vertebral fracture as...,"Nadeem SA, Comellas AP, Regan EA, Hoffman EA, ...",Med Phys. 2024 Jun;51(6):4201-4218. doi: 10.10...,Nadeem SA,Med Phys,2024,2024/05/09,,,10.1002/mp.17072,BACKGROUND: Spinal degeneration and vertebral ...,Chest CT-based automated vertebral fracture as...,True,other,deep learning
91,38473002,Advancements in Glaucoma Diagnosis: The Role o...,"Bragança CP, Torres JM, Macedo LO, Soares CPA.",Diagnostics (Basel). 2024 Mar 1;14(5):530. doi...,Bragança CP,Diagnostics (Basel),2024,2024/03/13,PMC10930993,,10.3390/diagnostics14050530,The progress of artificial intelligence algori...,Advancements in Glaucoma Diagnosis: The Role o...,True,computer vision,image processing
112,38155727,An Image Processing Algorithm for Facile and R...,"Senthil N, Pacifici N, Cruz-Acuña M, Diener A,...",Chem Biomed Imaging. 2023 Nov 20;1(9):831-842....,Senthil N,Chem Biomed Imaging,2023,2023/12/29,PMC10751783,,10.1021/cbmi.3c00102,Vomocytosis is a process that occurs when inte...,An Image Processing Algorithm for Facile and R...,True,computer vision,Image Processing
234,36143468,Artificial Intelligence in Biological Sciences,"Bhardwaj A, Kishore S, Pandey DK.",Life (Basel). 2022 Sep 14;12(9):1430. doi: 10....,Bhardwaj A,Life (Basel),2022,2022/09/23,PMC9505413,,10.3390/life12091430,"Artificial intelligence (AI), currently a cutt...",Artificial Intelligence in Biological Sciences...,True,computer vision,image processing
305,34971977,Greedy Autoaugment for classification of mycob...,"Momeny M, Neshat AA, Gholizadeh A, Jafarnezhad...",Comput Biol Med. 2022 Feb;141:105175. doi: 10....,Momeny M,Comput Biol Med,2022,2021/12/31,,,10.1016/j.compbiomed.2021.105175,Although tuberculosis (TB) is a disease whose ...,Greedy Autoaugment for classification of mycob...,True,other,"convolutional neural network, CNN"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11375,34945937,Impulsive Reaction-Diffusion Delayed Models in...,"Stamov G, Stamova I, Spirova C.",Entropy (Basel). 2021 Dec 3;23(12):1631. doi: ...,Stamov G,Entropy (Basel),2021,2021/12/24,PMC8700440,,10.3390/e23121631,In this paper we study an impulsive delayed re...,Impulsive Reaction-Diffusion Delayed Models in...,True,other,diffusion model
11383,33529229,Megacities as drivers of national outbreaks: T...,"Mahmud AS, Kabir MI, Engø-Monsen K, Tahmina S,...",PLoS Negl Trop Dis. 2021 Feb 2;15(2):e0009106....,Mahmud AS,PLoS Negl Trop Dis,2021,2021/02/02,PMC7880496,,10.1371/journal.pntd.0009106,BACKGROUND: Several large outbreaks of chikung...,Megacities as drivers of national outbreaks: T...,True,other,diffusion model
11415,25540245,The diffusion of docetaxel in patients with me...,"Unger JM, Hershman DL, Martin D, Etzioni RB, B...",J Natl Cancer Inst. 2014 Dec 24;107(2):dju412....,Unger JM,J Natl Cancer Inst,2014,2014/12/26,PMC4326312,,10.1093/jnci/dju412,BACKGROUND: Diffusion of new cancer treatments...,The diffusion of docetaxel in patients with me...,True,other,diffusion model
11416,25451529,Network impact on persistence in a finite popu...,"Barbillon P, Thomas M, Goldringer I, Hospital ...",J Theor Biol. 2015 Jan 21;365:365-76. doi: 10....,Barbillon P,J Theor Biol,2015,2014/12/03,,,10.1016/j.jtbi.2014.10.032,Dynamic extinction colonisation models (also c...,Network impact on persistence in a finite popu...,True,other,diffusion model
