# Academic Article Summarization - Data Gathering.

The data was collected from the [National Center for Biotechnology Information](https://www.ncbi.nlm.nih.gov/) (NCBI), in compliance with their policies. NCBI hosts an open-access database of academic articles, all on topics of science, health and medicine. In total I gathered 87 different articles; to keep my work focused, I collected only articles that contain at least one of the keywords relevant to my field, which are: NLP, Machine Learning, Deep Learning, AI and Neural Networks.

Below is the code responsible for fetching the articles and saving them locally on my machine. In order not to overuse the generosity of NCBI's API for data gathering, and to comply with their usage guidelines, I've separated this script from my main NLP model research, so as not to overload their servers with requests.

This means that I gathered the articles from NCBI, but all further access and text manipulation were done without API calls to their servers.

At this point I want to thank NCBI for allowing both researchers and developers to access the data for our purposes.

In [None]:
!pip install biopython

In [None]:
from google.colab import drive

# Path at which Google Drive is mounted.
mount = '/content/drive'
drive.mount(mount)

In [None]:
# Google Drive directory for the collected PMC article files.
data_path = "/content/drive/MyDrive/Colab Notebooks/NLP/PMC_Centralized_data"

In [None]:
import os
import time
import re
from Bio import Entrez
import xml.etree.ElementTree as ET

# Configure the contact email that Biopython's Entrez module uses for NCBI access.
# Feel free to contact me!
Entrez.email = "ramdvlp@gmail.com"

def sanitize_filename(text):
    """Return *text* without filesystem-reserved characters, capped at 50 characters.

    The characters ``\\ / : " * ? < > |`` are removed; whatever remains is
    cut to its first 50 characters.
    """
    reserved_chars = re.compile(r'[\\/:"*?<>|]+')
    cleaned = reserved_chars.sub("", text)
    max_length = 50  # Keep filenames short
    return cleaned[:max_length]

def _element_text(elem):
    """Return all text inside *elem* (including nested markup), whitespace-collapsed."""
    # itertext() walks the element and every descendant, so text that follows
    # inline tags such as <italic> or <xref> is kept; .text alone stops at the
    # first child element.
    return " ".join("".join(elem.itertext()).split())


def _paragraph_texts(paragraphs):
    """Return the non-empty text of each outermost <p> in *paragraphs* (document order)."""
    covered = set()
    texts = []
    for paragraph in paragraphs:
        if paragraph in covered:
            continue  # Text already captured through an enclosing <p>
        # iter('p') yields the paragraph itself plus every nested <p>.
        covered.update(paragraph.iter('p'))
        text = _element_text(paragraph)
        if text:
            texts.append(text)
    return texts


def parse_article(xml_content):
    """ Parse XML content and extract title, abstract, and body text for each article.

    Args:
        xml_content: XML document (str or bytes) containing ``<article>``
            elements; the root element itself may be an ``<article>``.

    Returns:
        A list of ``(title, abstract, body_text)`` string tuples, one for each
        article that has a non-empty title, abstract and body. Paragraphs are
        joined with newlines; text inside inline markup is included and runs
        of whitespace are collapsed to single spaces.

    Raises:
        xml.etree.ElementTree.ParseError: if ``xml_content`` is not well-formed.
    """
    root = ET.fromstring(xml_content)
    articles = []

    # iter() also matches the root element, unlike findall('.//article').
    for article in root.iter('article'):
        title_elem = article.find('.//article-title')
        if title_elem is None:
            continue  # Skip article if title is missing
        title = _element_text(title_elem)
        if not title:
            continue  # Skip article if title text is empty

        abstract_elem = article.find('.//abstract')
        if abstract_elem is None:
            continue  # Skip article if abstract is missing
        abstract = "\n".join(_paragraph_texts(abstract_elem.findall('.//p')))
        if not abstract:
            continue  # Skip article if abstract text is missing

        body_text = "\n".join(_paragraph_texts(article.findall('.//body//p')))
        if not body_text:
            continue  # Skip article if body text is missing

        articles.append((title, abstract, body_text))

    return articles

def fetch_and_save_articles(keywords, folder, max_count=100):
    """ Search articles containing any of the specified keywords in their title or abstract, and save them without duplicates.

    Args:
        keywords: Search terms; each is tagged ``[Title/Abstract]`` and the
            terms are OR-ed together, restricted to the open access filter.
        folder: Directory the text files are written to; created if needed.
        max_count: Maximum number of article IDs requested from the search.

    Each parsed article is written to ``<sanitized title>.txt`` as UTF-8 with
    "Title", "Abstract" and "Body" sections. Articles whose sanitized titles
    coincide share a filename, so the later one overwrites the earlier one.
    """
    query = f"({' OR '.join([f'{kw}[Title/Abstract]' for kw in keywords])}) AND open access[filter]"
    handle = Entrez.esearch(db="pmc", term=query, retmax=max_count)
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()  # Release the connection even if parsing fails

    id_list = record["IdList"]
    if not id_list:
        return

    os.makedirs(folder, exist_ok=True)

    fetched_ids = set()  # To track already fetched articles
    for article_id in id_list:
        if article_id in fetched_ids:
            continue  # Skip if already fetched
        fetched_ids.add(article_id)

        handle = Entrez.efetch(db="pmc", id=article_id, rettype="medline", retmode="xml")
        try:
            article_xml = handle.read()
        finally:
            handle.close()

        # The handle may yield bytes or str; only bytes need decoding.
        if isinstance(article_xml, bytes):
            article_xml = article_xml.decode('utf-8')

        for title, abstract, body_text in parse_article(article_xml):
            filename = sanitize_filename(title) + ".txt"
            filepath = os.path.join(folder, filename)
            # Explicit UTF-8 so non-ASCII article text does not depend on the
            # platform's default encoding.
            with open(filepath, "w", encoding="utf-8") as file:
                file.write("Title: " + title + "\n\nAbstract:\n" + abstract + "\n\nBody:\n" + body_text)

        time.sleep(1)  # Sleep to ensure compliance with API rate limits

# Search terms and output directory
keywords = [
    "machine learning",
    "deep learning",
    "NLP",
    "artificial intelligence",
    "neural networks",
]
save_folder = "/content/drive/MyDrive/Colab Notebooks/NLP/PMC_Centralized_data"

# Run the download, printing a message before and after
print("Fetching and saving articles...")
fetch_and_save_articles(keywords=keywords, folder=save_folder)
print("All articles fetched and saved successfully.")