In [None]:
import importlib
import os
import sys
from functools import lru_cache

import spacy

# Make the project root importable before pulling in the local modules.
sys.path.append(os.path.abspath(".."))

from functions.pubmed_api import pubmed_api_pull
from functions.text_mining import check_if_text_has_outbreak



In [18]:
@lru_cache(maxsize=None)
def _get_nlp(model_name="en_core_web_sm"):
    """Load a spaCy pipeline once and hand back the cached instance afterwards.

    Loading a model is far more expensive than running it, and the extraction
    helpers below are called once per row via ``Series.apply``.
    """
    return spacy.load(model_name)


def _join_entity_texts(text, labels):
    """Return the entity texts whose label is in ``labels`` as one string.

    Args:
        text: Raw text to analyse. Anything that is not a non-blank string
            (e.g. ``None`` or ``NaN`` from a missing cell) yields ``None``.
        labels: Collection of spaCy entity labels to keep.

    Returns:
        The matching entity texts joined with ``", "`` in document order, or
        ``None`` when there is no match.
    """
    if not isinstance(text, str) or not text.strip():
        return None
    matches = [ent.text for ent in _get_nlp()(text).ents if ent.label_ in labels]
    return ", ".join(matches) if matches else None


def extract_entities(text, disease_keywords=("avian influenza", "ebola")):
    """Group the named entities of ``text`` into diseases, locations and dates.

    Args:
        text: Raw text to analyse. Non-string or blank input returns the empty
            result instead of being passed to the pipeline.
        disease_keywords: Substrings (matched case-insensitively) that mark an
            entity as a disease name.

    Returns:
        A dict with the keys ``'disease'``, ``'location'`` and ``'date'``, each
        mapping to a list of entity texts in document order.

    Note:
        Only ``GPE`` entities count as locations here, whereas
        ``extract_location`` also accepts ``LOC``. An entity already labelled
        ``GPE`` or ``DATE`` is never additionally checked for disease keywords.
    """
    entities = {'disease': [], 'location': [], 'date': []}
    if not isinstance(text, str) or not text.strip():
        return entities

    needles = tuple(keyword.lower() for keyword in disease_keywords)
    for ent in _get_nlp()(text).ents:
        if ent.label_ == "GPE":  # geopolitical entity -> location
            entities['location'].append(ent.text)
        elif ent.label_ == "DATE":
            entities['date'].append(ent.text)
        else:
            # The model has no disease label, so fall back to keyword matching.
            lowered = ent.text.lower()
            if any(needle in lowered for needle in needles):
                entities['disease'].append(ent.text)
    return entities


def extract_location(text):
    """Return all ``GPE``/``LOC`` entities of ``text`` joined by ``", "``, or ``None``."""
    return _join_entity_texts(text, ("GPE", "LOC"))


def extract_date(text):
    """Return all ``DATE`` entities of ``text`` joined by ``", "``, or ``None``."""
    return _join_entity_texts(text, ("DATE",))



In [22]:
# Search terms handed to pubmed_api_pull.
t = ["avian influenza outbreak"]
# Second argument of pubmed_api_pull; presumably the number of articles to
# fetch (the rendered result has 10 rows) — TODO confirm against the helper.
n = 10

# Fetch the matching articles and display the resulting table.
df = pubmed_api_pull(t, n)
df

Unnamed: 0,pmid,title,abstract,full_text,authors
0,40192841,Immunogenicity and protective efficacy of an i...,The wild-type H1N1 and H3N2 swine influenza vi...,Introduction Swine influenza (SI) is a highly ...,http://orcid.org/0000-0002-3757-6762 Zhang Hen...
1,34451412,Epidemiological characteristics of human psitt...,BackgroundPsittacosis is a global and underapp...,"1 Introduction Psittacosis, commonly referred ...","Wen Yunjing, Zhang Wei, Li Yongguang, Liao Xin..."
2,40185841,Design and validation of a semi-quantitative m...,"Since 2001, human Metapneumovirus has been a s...",Introduction Human Metapneumovirus from discov...,"Riolo Giulia giulia.riolo@gmail.com 1, Biagini..."
3,31609197,Lentogenic avian paramyxovirus 1 of both class...,"Newcastle disease, a notifiable avian disease,...",Introduction Newcastle disease is a viral dise...,"Martiny Karen karma@sund.ku.dk 1, Liang Yuan 1..."
4,40184370,Development of an experimental model using col...,"Since 2016, low pathogenic avian influenza vir...",Introduction Low Pathogenic Avian Influenza Vi...,https://orcid.org/0009-0003-6726-000X Arbani O...
5,40190733,Machine learning methods for predicting human-...,IntroductionIt is not clear about mechanisms u...,Highlights There was a correlation between the...,"Zeng Dan-Dan 1 2 †, Cai Yu-Rong 2 †, Zhang Sen..."
6,39999793,Challenges and constraints to the sustainabili...,The poultry farming industry in Thailand plays...,INTRODUCTION The world is currently facing cha...,https://orcid.org/0000-0002-1170-7031 Wongtang...
7,39999794,Challenges and constraints in the sustainabili...,Poultry products such as chicken meat and eggs...,INTRODUCTION Chicken meat and eggs are among t...,https://orcid.org/0009-0001-0562-9145 Ohtsu Ha...
8,40178199,Detection of a Reassortant Swine‐ and Human‐Or...,"ABSTRACTIntroductionIn December 2021, influenz...",1 Introduction Influenza A viruses (IAVs) infe...,Kuchinski Kevin S. https://orcid.org/0000-0001...
9,40176109,Neutralizing monoclonal antibodies as effectiv...,The H10 subtype of avian influenza virus (AIV)...,Introduction Avian influenza viruses (AIVs) ha...,"Wang Ping 1, Fu Jiamin 1, Cheng Linfang 1, Yan..."


In [23]:
# Label every article by running check_if_text_has_outbreak on its full text.
# NOTE(review): the rendered output shows string labels, some carrying a
# trailing newline ("True\n", "False\n"); any comparison on this column has to
# account for that.
df["has_outbreak"] = df["full_text"].apply(check_if_text_has_outbreak)
df

Unnamed: 0,pmid,title,abstract,full_text,authors,has_outbreak
0,40192841,Immunogenicity and protective efficacy of an i...,The wild-type H1N1 and H3N2 swine influenza vi...,Introduction Swine influenza (SI) is a highly ...,http://orcid.org/0000-0002-3757-6762 Zhang Hen...,False
1,34451412,Epidemiological characteristics of human psitt...,BackgroundPsittacosis is a global and underapp...,"1 Introduction Psittacosis, commonly referred ...","Wen Yunjing, Zhang Wei, Li Yongguang, Liao Xin...",True\n
2,40185841,Design and validation of a semi-quantitative m...,"Since 2001, human Metapneumovirus has been a s...",Introduction Human Metapneumovirus from discov...,"Riolo Giulia giulia.riolo@gmail.com 1, Biagini...",False
3,31609197,Lentogenic avian paramyxovirus 1 of both class...,"Newcastle disease, a notifiable avian disease,...",Introduction Newcastle disease is a viral dise...,"Martiny Karen karma@sund.ku.dk 1, Liang Yuan 1...",False\n
4,40184370,Development of an experimental model using col...,"Since 2016, low pathogenic avian influenza vir...",Introduction Low Pathogenic Avian Influenza Vi...,https://orcid.org/0009-0003-6726-000X Arbani O...,True
5,40190733,Machine learning methods for predicting human-...,IntroductionIt is not clear about mechanisms u...,Highlights There was a correlation between the...,"Zeng Dan-Dan 1 2 †, Cai Yu-Rong 2 †, Zhang Sen...",False\n
6,39999793,Challenges and constraints to the sustainabili...,The poultry farming industry in Thailand plays...,INTRODUCTION The world is currently facing cha...,https://orcid.org/0000-0002-1170-7031 Wongtang...,False\n
7,39999794,Challenges and constraints in the sustainabili...,Poultry products such as chicken meat and eggs...,INTRODUCTION Chicken meat and eggs are among t...,https://orcid.org/0009-0001-0562-9145 Ohtsu Ha...,False
8,40178199,Detection of a Reassortant Swine‐ and Human‐Or...,"ABSTRACTIntroductionIn December 2021, influenz...",1 Introduction Influenza A viruses (IAVs) infe...,Kuchinski Kevin S. https://orcid.org/0000-0001...,False
9,40176109,Neutralizing monoclonal antibodies as effectiv...,The H10 subtype of avian influenza virus (AIV)...,Introduction Avian influenza viruses (AIVs) ha...,"Wang Ping 1, Fu Jiamin 1, Cheng Linfang 1, Yan...",False


In [25]:
# Keep only the articles flagged as outbreak reports. The labels are strings
# with inconsistent trailing whitespace (e.g. "True" and "True\n"), so
# normalise them before comparing instead of enumerating every raw variant.
is_outbreak = df["has_outbreak"].astype(str).str.strip().eq("True")
# .copy() so that columns added to filtered_df never write into a view of df.
filtered_df = df.loc[is_outbreak].copy()
filtered_df

Unnamed: 0,pmid,title,abstract,full_text,authors,has_outbreak
1,34451412,Epidemiological characteristics of human psitt...,BackgroundPsittacosis is a global and underapp...,"1 Introduction Psittacosis, commonly referred ...","Wen Yunjing, Zhang Wei, Li Yongguang, Liao Xin...",True\n
4,40184370,Development of an experimental model using col...,"Since 2016, low pathogenic avian influenza vir...",Introduction Low Pathogenic Avian Influenza Vi...,https://orcid.org/0009-0003-6726-000X Arbani O...,True


In [26]:
# Add the place and time mentions found in each article's full text.
# extract_location / extract_date each return one comma-separated string, or
# None when the text contains no matching entity.
filtered_df["place"] = filtered_df["full_text"].apply(extract_location)
filtered_df["time"] = filtered_df["full_text"].apply(extract_date)
filtered_df

Unnamed: 0,pmid,title,abstract,full_text,authors,has_outbreak,place,time
1,34451412,Epidemiological characteristics of human psitt...,BackgroundPsittacosis is a global and underapp...,"1 Introduction Psittacosis, commonly referred ...","Wen Yunjing, Zhang Wei, Li Yongguang, Liao Xin...",True\n,"China, Zhejiang province, Shandong province, C...","months, recent years, 2021, between January 1,..."
4,40184370,Development of an experimental model using col...,"Since 2016, low pathogenic avian influenza vir...",Introduction Low Pathogenic Avian Influenza Vi...,https://orcid.org/0009-0003-6726-000X Arbani O...,True,"Wisconsin, USA, Asia, Africa, the Middle East,...","1966, 2016, 2016, 2022, 2015, 36 days of age, ..."
