In [None]:
%pip install biopython --upgrade
%pip install requests_html
%pip install pdfplumber

In [None]:
from Bio import Entrez
from requests_html import HTMLSession
import requests
from requests.exceptions import ConnectionError


def search_pmc(search_term, max_results=10):
    """Search PubMed Central and return the IDs of the matching articles.

    Args:
        search_term: Query string passed to NCBI ESearch as ``term``.
        max_results: Upper bound on the number of IDs requested (``retmax``).

    Returns:
        The ``IdList`` entry of the parsed ESearch result, or an empty list
        when the query is blank or ``max_results`` is not positive (no
        request is sent in that case).
    """
    # Nothing sensible can be asked of the service for a blank query or a
    # non-positive result budget, so skip the network round-trip entirely.
    if not search_term or not search_term.strip() or max_results <= 0:
        return []

    Entrez.email = "savicisplaying@gmail.com"  # Set your email for NCBI API access

    # Use the esearch function to search for articles in PMC
    handle = Entrez.esearch(db="pmc", term=search_term, retmax=max_results)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the HTTP handle even when parsing the response fails.
        handle.close()

    return record["IdList"]

def _download_pmc_pdf(session, pmcid, page_timeout=3, pdf_timeout=30):
    """Download the PDF of one PMC article into ``<pmcid>.pdf``.

    Args:
        session: An ``HTMLSession`` used for both HTTP requests.
        pmcid: Article ID without the ``PMC`` prefix.
        page_timeout: Timeout in seconds for fetching the article page.
        pdf_timeout: Timeout in seconds for the PDF request.

    Returns:
        True when a PDF was written, False when the article page contains
        no ``a.int-view`` link with an ``href``.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts or non-success HTTP status codes.
    """
    from urllib.parse import urljoin

    base_url = 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC'
    page = session.get(base_url + pmcid + '/', timeout=page_timeout)
    page.raise_for_status()

    link = page.html.find('a.int-view', first=True)
    if link is None or 'href' not in link.attrs:
        return False

    # Resolve against the URL the page was actually served from, so absolute,
    # root-relative and relative hrefs all work and no '//' is produced.
    pdf_url = urljoin(page.url, link.attrs['href'])

    with session.get(pdf_url, stream=True, timeout=pdf_timeout) as response:
        # Refuse to store an HTML error page under a .pdf name.
        response.raise_for_status()
        with open(pmcid + '.pdf', 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
    return True


def main():
    """Prompt for a PMC query and download the PDF of every matching article.

    Each PDF is written to the current working directory as ``<id>.pdf``.
    Articles that cannot be fetched are reported and skipped so one failure
    does not abort the remaining downloads.
    """
    search_term = input("Enter the search term for PMC: ")
    max_results = int(input("Enter the maximum number of results to fetch: "))

    pmc_ids = search_pmc(search_term, max_results)

    # The context manager closes the session's connections when done.
    with HTMLSession() as s:
        for pmc in pmc_ids:
            pmcid = pmc.strip()
            try:
                saved = _download_pmc_pdf(s, pmcid)
            except requests.exceptions.RequestException as e:
                # Best-effort batch: report the failure and move on.
                print(f"Skipping PMC{pmcid}: {e}")
                continue
            if not saved:
                print(f"Skipping PMC{pmcid}: no PDF link found on the article page")


# Start the interactive search-and-download flow only when this code is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


In [None]:
import pdfplumber
from transformers import BartForConditionalGeneration, BartTokenizer
import re
import os

# Convert PDF to Text
def pdf_to_text(pdf_file):
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts (the caller in this
            file passes a filesystem path).

    Returns:
        The cleaned text of all pages, joined with newlines so that the last
        word of one page is not fused with the first word of the next.
        Pages without extractable text contribute an empty string.
    """
    # Alternation, not a character class: the previous "[img|IMG|Image]"
    # matched a single character such as "<i>" and never a real "<img>" tag.
    # The second alternative strips bracketed alphabetic tokens like "[Figure]".
    tag_pattern = re.compile(r'</?(?:img|IMG|Image)>|\[[A-Za-z]+\]')

    page_texts = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # extract_text() can be None for a page without a text layer
            # (seen in some pdfplumber releases); treat that as empty text.
            page_text = page.extract_text() or ""
            page_texts.append(tag_pattern.sub('', page_text))
    return "\n".join(page_texts)

# Preprocess the Text Data
def preprocess(text):
    """Flatten *text* onto one line with single-space separators.

    Newlines, carriage returns and tabs are first turned into spaces, then
    every run of whitespace is collapsed into one space. Leading and
    trailing whitespace is collapsed but not stripped.
    """
    flattened = re.sub(r'[\n\r\t]', ' ', text)
    return re.sub(r'\s+', ' ', flattened)

# Load the Pretrained Summarization Model
# Both objects are created once at module level from the
# "facebook/bart-large-cnn" checkpoint and reused by summarize() for every
# document, so the checkpoint is not reloaded per PDF.
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

# Summarize the Text Data
def summarize(text, min_words=100, max_words=150):
    """Summarize *text* with the module-level BART model.

    Args:
        text: Text to summarize. Input beyond 1024 tokens is truncated by
            the tokenizer.
        min_words: Target lower bound for the summary length. It is passed
            to generation as an approximate token floor; it is a target,
            not a guarantee, because generation lengths are in tokens.
        max_words: Hard upper bound. The returned summary never has more
            than this many whitespace-separated words.

    Returns:
        The summary string, or an empty string when *text* is blank or
        ``max_words`` is not positive (the model is not invoked then).
    """
    # Nothing to summarize / nothing allowed in the output: skip the model,
    # which would otherwise generate text from special tokens alone.
    if not text or not text.strip() or max_words <= 0:
        return ""

    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    input_ids = inputs["input_ids"]

    # generate() measures length in tokens, not words. This factor is a rough
    # conversion for sub-word tokenizers on English text -- an approximation
    # to tune if summaries come out consistently short or long.
    approx_tokens_per_word = 1.3
    max_tokens = max(1, int(max_words * approx_tokens_per_word))
    # Never demand more tokens than the ceiling allows, nor more than the
    # input itself contains (that would force padding-by-repetition).
    min_tokens = max(
        0,
        min(int(min_words * approx_tokens_per_word), max_tokens, input_ids.shape[1]),
    )

    summary_ids = model.generate(
        input_ids,
        attention_mask=inputs["attention_mask"],
        num_beams=4,
        min_length=min_tokens,
        max_length=max_tokens,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Enforce the word ceiling exactly; a summary that is still shorter than
    # min_words is returned as-is because there is nothing to pad it with.
    summary_words = summary.split()
    if len(summary_words) > max_words:
        return ' '.join(summary_words[:max_words])
    return summary

# Usage
pdf_dir = "/content"
# NOTE(review): summary_dir is not used below; the combined file is written
# relative to the current working directory. Confirm the intended location.
summary_dir = "/content/summary"
combined_summary_file = "combined_summaries.txt"

# Iterate over PDF files and generate summaries
combined_summaries = []
# sorted(): os.listdir() returns entries in arbitrary order, which would make
# the order of sections in the output file vary between runs.
for pdf_file in sorted(os.listdir(pdf_dir)):
    if not pdf_file.endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_dir, pdf_file)
    try:
        text = pdf_to_text(pdf_path)
        preprocessed_text = preprocess(text)
        if not preprocessed_text.strip():
            # No extractable text (e.g. a scanned document): there is
            # nothing meaningful to feed to the model.
            print(f"Skipping {pdf_file}: no extractable text")
            continue
        summary = summarize(preprocessed_text, min_words=100, max_words=150)
    except Exception as e:
        # Batch boundary: one unreadable or corrupt PDF must not discard the
        # summaries already produced for the other files.
        print(f"Skipping {pdf_file}: {e}")
        continue

    # Get the PubMed ID from the file name
    # NOTE(review): files produced by the download cell are named after PMC
    # IDs, so this heading label may be inaccurate -- confirm before relying
    # on it downstream.
    pubmed_id = os.path.splitext(pdf_file)[0]

    # Add the summary to the combined list with its PubMed ID as the heading
    combined_summaries.append(f"PubMed ID: {pubmed_id}\n\n{summary}\n\n")

# Save the combined summaries to a text file
with open(combined_summary_file, "w", encoding="utf-8") as f:
    f.write("\n".join(combined_summaries))

print(f"Combined summaries saved: {combined_summary_file}")