In [1]:
import pandas as pd
import nltk
from sentence_transformers import SentenceTransformer, util
import re

# Download NLTK tokenizer for sentence splitting
nltk.download("punkt")

# Load the dataset
df = pd.read_csv('collection_with_abstracts.csv')  # Update with the actual path
df['Combined_Text'] = df['Title'].fillna('') + " " + df['Abstract'].fillna('')

# Initialize a pre-trained transformer model (e.g., BERT or MiniLM)
model = SentenceTransformer("all-MiniLM-L6-v2")

# Expanded list of target phrases for initial filtering
target_sentences = [
    # Neural networks and related terms
    "Usage of neural networks in virology",
    "Usage of neural networks in epidemiology",
    "Application of artificial neural networks in virology",
    "Application of artificial neural networks in epidemiology",
    "Use of machine learning models in virology",
    "Use of machine learning models in epidemiology",
    "Use of feedforward neural networks in virology",
    "Use of feedforward neural networks in epidemiology",
    "Application of neural net algorithms in virology",
    "Application of neural net algorithms in epidemiology",
    "Usage of multilayer perceptrons in virology",
    "Usage of multilayer perceptrons in epidemiology",
    "Use of convolutional neural networks (CNN) in virology",
    "Use of convolutional neural networks (CNN) in epidemiology",
    "Application of recurrent neural networks (RNN) in virology",
    "Application of recurrent neural networks (RNN) in epidemiology",
    "Use of long short-term memory (LSTM) networks in virology",
    "Use of long short-term memory (LSTM) networks in epidemiology",
    "GRNN applications in virology",
    "GRNN applications in epidemiology",
    # Deep learning
    "Deep learning models in virology research",
    "Deep learning models in epidemiology research",
    "Applications of deep neural networks in virology",
    "Applications of deep neural networks in epidemiology",
    # Computer vision and related terms
    "Computer vision techniques in virology",
    "Computer vision techniques in epidemiology",
    "Usage of vision models in virology",
    "Usage of vision models in epidemiology",
    "Application of image processing in virology",
    "Application of image processing in epidemiology",
    "Vision algorithms for virology research",
    "Vision algorithms for epidemiology research",
    "Computer graphics and vision in virology",
    "Computer graphics and vision in epidemiology",
    "Object recognition in virology",
    "Object recognition in epidemiology",
    "Scene understanding in virology",
    "Scene understanding in epidemiology",
    # Natural language processing and related terms
    "Natural language processing in virology",
    "Natural language processing in epidemiology",
    "Application of text mining in virology",
    "Application of text mining in epidemiology",
    "NLP for virology research",
    "NLP for epidemiology research",
    "Computational linguistics in virology",
    "Computational linguistics in epidemiology",
    "Language processing in virology",
    "Language processing in epidemiology",
    "Text analytics in virology",
    "Text analytics in epidemiology",
    "Textual data analysis in virology",
    "Textual data analysis in epidemiology",
    "Speech and language technology for virology",
    "Speech and language technology for epidemiology",
    "Language modeling in virology research",
    "Language modeling in epidemiology research",
    "Computational semantics in virology",
    "Computational semantics in epidemiology",
    # Generative AI
    "Generative AI applications in virology",
    "Generative AI applications in epidemiology",
    "Generative deep learning in virology",
    "Generative deep learning in epidemiology",
    "Usage of generative models in virology",
    "Usage of generative models in epidemiology",
    # Transformer models and related terms
    "Transformer models for virology research",
    "Transformer models in epidemiology",
    "Self-attention models for virology",
    "Self-attention models in epidemiology",
    "Transformer architecture in virology",
    "Transformer architecture in epidemiology",
    "Attention-based neural networks in virology",
    "Attention-based neural networks in epidemiology",
    "Transformer networks for virology research",
    "Transformer networks in epidemiology",
    "Sequence-to-sequence models in virology",
    "Sequence-to-sequence models in epidemiology",
    # Large language models and related terms
    "Large language models applied to virology",
    "Large language models applied to epidemiology",
    "Application of LLMs in virology",
    "Application of LLMs in epidemiology",
    "Transformer-based models for virology research",
    "Transformer-based models in epidemiology",
    "Usage of pretrained language models in virology",
    "Usage of pretrained language models in epidemiology",
    "Generative language models for virology",
    "Generative language models for epidemiology",
    "Foundation models in virology research",
    "Foundation models in epidemiology research",
    "State-of-the-art language models in virology",
    "State-of-the-art language models in epidemiology",
    # Multimodal models and related terms
    "Multimodal models in virology",
    "Multimodal models in epidemiology",
    "Multimodal neural networks in virology",
    "Multimodal neural networks in epidemiology",
    "Vision transformers in virology",
    "Vision transformers in epidemiology",
    "Diffusion models in virology",
    "Diffusion models in epidemiology",
    "Generative diffusion models in virology",
    "Generative diffusion models in epidemiology",
    "Diffusion-based generative models in virology research",
    "Diffusion-based generative models in epidemiology research",
    "Continuous diffusion models in virology",
    "Continuous diffusion models in epidemiology",
    # Additional sentances
    "Application of deep learning in virology",
    "Use of neural networks in virology research",
    "Deep neural networks for epidemiology",
    "Deep learning approaches in virology studies",
    "Machine learning in epidemiology",
    "Artificial intelligence for analyzing virology data",
    "Neural networks applied to virology and epidemiology",
    "Use of CNNs, RNNs, LSTMs in virology",
    "Use of deep learning models in epidemiology"
]

# Encode target sentences (e.g., topics related to virology, epidemiology, etc.)
target_embeddings = model.encode(target_sentences, convert_to_tensor=True)

# Define the class descriptions for classification
class_descriptions = {
    "computer vision": "Techniques and methods related to image processing, computer vision, object recognition, scene understanding, vision models, etc.",
    "text mining": "Methods related to natural language processing (NLP), text mining, computational linguistics, text analytics, language modeling, etc.",
    "both": "Methods that combine deep learning approaches and neural networks (e.g., CNN, LSTM, transformers) for both text and image processing.",
    "other": "Generative models, multimodal models, and other advanced techniques in AI, such as diffusion models and generative models."
}

# Encode the class descriptions into embeddings
class_embeddings = {key: model.encode(desc, convert_to_tensor=True) for key, desc in class_descriptions.items()}

# Initial filtering based on semantic similarity with target sentences
relevant_rows = []
similarity_threshold = 0.5  # Adjust this threshold as needed

for idx, row in df.iterrows():
    combined_text = row['Combined_Text']
    max_similarity = 0  # Initialize max similarity for this document
    
    # Split text into sentences and calculate similarity for each sentence
    sentences = nltk.sent_tokenize(combined_text)
    for sentence in sentences:
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)
        similarities = util.pytorch_cos_sim(sentence_embedding, target_embeddings)
        max_similarity = max(max_similarity, similarities.max().item())
    
    # Retain row if max similarity score meets threshold
    if max_similarity >= similarity_threshold:
        relevant_rows.append(row)

# Convert the list of relevant rows back into a DataFrame
relevant_df = pd.DataFrame(relevant_rows)

# Function to classify method based on semantic similarity
def classify_method(text):
    # Encode the input text
    text_embedding = model.encode(text, convert_to_tensor=True)
    
    # Calculate cosine similarity between the input text and each class description
    similarities = {key: util.cos_sim(text_embedding, emb)[0][0].item() for key, emb in class_embeddings.items()}
    
    # Get the class with the highest similarity
    most_similar_class = max(similarities, key=similarities.get)
    
    return most_similar_class

# Apply the new classification logic to the filtered DataFrame
relevant_df['Method Type'] = relevant_df['Combined_Text'].apply(classify_method)

# Example of extracting specific methods using a set of keywords
method_keywords = r"(neural network|artificial neural network|machine learning model|feedforward neural network|neural net algorithm|multilayer perceptron|convolutional neural network|recurrent neural network|long short-term memory network|CNN|GRNN|RNN|LSTM|deep learning|deep neural networks|computer vision|vision model|image processing|vision algorithms|computer graphics and vision|object recognition|scene understanding|natural language processing|text mining|NLP|computational linguistics|language processing|text analytics|textual data analysis|text data analysis|text analysis|speech and language technology|language modeling|computational semantics|generative artificial intelligence|generative AI|generative deep learning|generative models|transformer models|self-attention models|transformer architecture|transformer|attention-based neural networks|transformer networks|sequence-to-sequence models|large language model|LLM|transformer-based model|pretrained language model|generative language model|foundation model|state-of-the-art language model|multimodal model|multimodal neural network|vision transformer|diffusion model|generative diffusion model|diffusion-based generative model|continuous diffusion model)"

# Extract the specific methods used in each paper
relevant_df['Extracted Method'] = relevant_df['Combined_Text'].apply(
    lambda x: ", ".join(set(re.findall(method_keywords, x, re.IGNORECASE))) if isinstance(x, str) else ""
)

# Save the filtered entries to an Excel file for review
relevant_df.to_excel('filtered_virology_epidemiology_papers_with_semantic_classification.xlsx', index=False)

# Display the relevant DataFrame
relevant_df


  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt to C:\Users\arl-
[nltk_data]     sl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,PMID,Title,Authors,Citation,First Author,Journal/Book,Publication Year,Create Date,PMCID,NIHMS ID,DOI,Abstract,Combined_Text,Method Type,Extracted Method
3,39367648,An initial game-theoretic assessment of enhanc...,"Fatemi MY, Lu Y, Diallo AB, Srinivasan G, Azhe...",Brief Bioinform. 2024 Sep 23;25(6):bbae476. do...,Fatemi MY,Brief Bioinform,2024,2024/10/05,PMC11452536,,10.1093/bib/bbae476,The application of deep learning to spatial tr...,An initial game-theoretic assessment of enhanc...,both,"deep learning, neural network"
28,39023647,Evaluation of eyelid features' changes before ...,"Acar Eser N, Serbest Ceylanoglu K, Malkoc Sen E.",Int Ophthalmol. 2024 Jul 18;44(1):328. doi: 10...,Acar Eser N,Int Ophthalmol,2024,2024/07/18,,,10.1007/s10792-024-03246-y,PURPOSE: To evaluate the eyelid features' chan...,Evaluation of eyelid features' changes before ...,computer vision,computer vision
30,39013794,Deep Learning - Methods to Amplify Epidemiolog...,"Alex Quistberg D, Mooney SJ, Tasdizen T, Arbel...",Am J Epidemiol. 2024 Jul 16:kwae215. doi: 10.1...,Alex Quistberg D,Am J Epidemiol,2024,2024/07/16,,,10.1093/aje/kwae215,Deep learning is a subfield of artificial inte...,Deep Learning - Methods to Amplify Epidemiolog...,both,"deep learning, Deep learning, Deep Learning, n..."
31,38996550,MEFFGRN: Matrix enhancement and feature fusion...,"Wei PJ, Bao JJ, Gao Z, Tan JY, Cao RF, Su Y, Z...",Comput Biol Med. 2024 Sep;179:108835. doi: 10....,Wei PJ,Comput Biol Med,2024,2024/07/12,,,10.1016/j.compbiomed.2024.108835,Gene regulatory networks (GRNs) are crucial fo...,MEFFGRN: Matrix enhancement and feature fusion...,both,"Convolutional Neural Network, image processing..."
42,38906615,Automated cooling tower detection through deep...,"Wong KK, Segura T, Mein G, Lu J, Hannapel EJ, ...",Lancet Digit Health. 2024 Jul;6(7):e500-e506. ...,Wong KK,Lancet Digit Health,2024,2024/06/21,,,10.1016/S2589-7500(24)00094-3,BACKGROUND: Cooling towers containing Legionel...,Automated cooling tower detection through deep...,both,"deep learning, computer vision"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11439,16289268,On the dynamics of dengue epidemics from large...,"Tran A, Raffy M.",Theor Popul Biol. 2006 Feb;69(1):3-12. doi: 10...,Tran A,Theor Popul Biol,2006,2005/11/18,,,10.1016/j.tpb.2005.06.008,A model for the spatial and temporal dynamics ...,On the dynamics of dengue epidemics from large...,other,diffusion model
11444,11177527,Diffusion theory and drug use,Ferrence R.,Addiction. 2001 Jan;96(1):165-73. doi: 10.1046...,Ferrence R,Addiction,2001,2001/02/15,,,10.1046/j.1360-0443.2001.96116512.x,This paper examines the applicability of the d...,Diffusion theory and drug use This paper exami...,other,diffusion model
11445,10607521,The characteristics of epidemics and invasions...,"Cruickshank I, Gurney WS, Veitch AR.",Theor Popul Biol. 1999 Dec;56(3):279-92. doi: ...,Cruickshank I,Theor Popul Biol,1999,1999/12/23,,,10.1006/tpbi.1999.1432,In this paper we report the development of a h...,The characteristics of epidemics and invasions...,other,diffusion model
11446,10072741,Effects of sales promotion on smoking among U....,Redmond WH.,Prev Med. 1999 Mar;28(3):243-50. doi: 10.1006/...,Redmond WH,Prev Med,1999,1999/03/12,,,10.1006/pmed.1998.0410,OBJECTIVE: The purpose of this study was to ex...,Effects of sales promotion on smoking among U....,text mining,"Diffusion model, diffusion model"
