In [1]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
    """Return the text of every page in the PDF at ``pdf_path``.

    Pages for which ``extract_text()`` yields nothing (``None`` or ``""``)
    are skipped. The remaining page texts are joined with a newline so the
    last word of one page is not fused with the first word of the next.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The joined page text, or ``""`` when no page yields any text.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extract inside the ``with`` block: the reader was built on this
        # file handle, so the handle must still be open while pages are read.
        page_texts = [page.extract_text() for page in reader.pages]
    return "\n".join(page_text for page_text in page_texts if page_text)


In [2]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
    """Read a PDF and return the text of all its pages as one string.

    Each page's text is obtained with ``extract_text()``; pages that yield
    ``None`` or an empty string are ignored. Page texts are separated by a
    newline so that words on either side of a page break do not run together.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The joined page text, or ``""`` when no page yields any text.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Pages are read while the file handle the reader wraps is still open.
        extracted = [page.extract_text() for page in reader.pages]
    non_empty_pages = [page_text for page_text in extracted if page_text]
    return "\n".join(non_empty_pages)


In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# (tokenizer, model) pairs keyed by model name, so repeated calls to
# summarize_text() reuse an already-loaded model instead of reloading it.
_SUMMARIZER_CACHE = {}


def _load_summarizer(model_name):
    """Return the (tokenizer, model) pair for ``model_name``, loading it at most once."""
    if model_name not in _SUMMARIZER_CACHE:
        _SUMMARIZER_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModelForSeq2SeqLM.from_pretrained(model_name),
        )
    return _SUMMARIZER_CACHE[model_name]


def summarize_text(text, model_name='t5-small', max_length=150, min_length=40):
    """Summarize ``text`` with a seq2seq model identified by ``model_name``.

    The tokenizer and model are loaded on the first call for a given
    ``model_name`` and reused afterwards; this matters because callers invoke
    this function once per chunk of a document.

    Args:
        text: Text to summarize. It is prefixed with ``"summarize: "`` and
            truncated to 512 tokens by the tokenizer call below, so anything
            beyond that limit does not influence the summary.
        model_name: Name passed to ``from_pretrained`` for both the tokenizer
            and the model.
        max_length: ``max_length`` forwarded to ``model.generate``.
        min_length: ``min_length`` forwarded to ``model.generate``.

    Returns:
        The decoded summary (special tokens skipped).
    """
    tokenizer, model = _load_summarizer(model_name)

    # Prepare the text for summarization
    inputs = tokenizer.encode("summarize: " + text, return_tensors='pt', max_length=512, truncation=True)

    # Generate the summary with beam search
    summary_ids = model.generate(
        inputs,
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    # Decode the first (only) returned sequence
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def split_text_into_chunks(text, max_chunk_size=500):
    """Group the sentences of ``text`` into chunks of roughly ``max_chunk_size`` characters.

    The text is split on ``'. '`` and sentences are packed greedily into the
    current chunk; each sentence is written back followed by ``'. '``.

    ``max_chunk_size`` is a soft limit: the size check ignores the terminator
    that is appended, and a single sentence longer than the limit becomes a
    chunk of its own rather than being cut.

    Args:
        text: Text to split.
        max_chunk_size: Target chunk size in characters.

    Returns:
        list[str]: The chunks in document order. Empty or whitespace-only
        input yields ``[]``; no empty chunk is ever produced.
    """
    sentences = text.split('. ')
    last_index = len(sentences) - 1
    chunks = []
    current_chunk = ''
    for index, sentence in enumerate(sentences):
        if not sentence.strip():
            # Blank pieces (empty text, trailing '. ') carry nothing to summarize.
            continue
        # The split removed '. ' after every piece except the last one, so
        # only the last piece can still carry its own final period.
        keeps_own_period = index == last_index and sentence.rstrip().endswith('.')
        terminator = ' ' if keeps_own_period else '. '
        # Flush only a chunk that holds text; otherwise an oversize first
        # sentence would emit an empty chunk.
        if current_chunk and len(current_chunk) + len(sentence) > max_chunk_size:
            chunks.append(current_chunk)
            current_chunk = ''
        current_chunk += sentence + terminator
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


In [5]:
def summarize_large_text(text, model_name='t5-small'):
    """Summarize a long text chunk by chunk and join the partial summaries.

    Args:
        text: Text to summarize; it is divided with ``split_text_into_chunks``.
        model_name: Model name forwarded to ``summarize_text`` for every chunk.

    Returns:
        The per-chunk summaries, in order, joined by single spaces.
    """
    partial_summaries = [
        summarize_text(piece, model_name=model_name)
        for piece in split_text_into_chunks(text)
    ]
    return ' '.join(partial_summaries)


In [7]:
import PyPDF2
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file.

    Every page is passed through ``extract_text()``; pages that yield ``None``
    or an empty string are skipped. Page texts are joined with a newline so
    the last word of a page does not merge with the first word of the next
    page (the chunker in this cell later turns newlines into spaces).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The joined page text, or ``""`` when no page yields any text.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Read all pages before leaving the ``with`` block, while the file
        # handle the reader wraps is still open.
        page_texts = [page.extract_text() for page in reader.pages]
    return "\n".join(page_text for page_text in page_texts if page_text)

def summarize_text(text, model, tokenizer, max_length=130, min_length=65):
    """Produce a summary of ``text`` with an already-loaded model and tokenizer.

    Args:
        text: Text to summarize; it is prefixed with ``"summarize: "`` and
            truncated to 512 tokens when encoded.
        model: Object whose ``generate`` method produces the summary ids.
        tokenizer: Object providing ``encode`` and ``decode``.
        max_length: ``max_length`` forwarded to ``model.generate``.
        min_length: ``min_length`` forwarded to ``model.generate``.

    Returns:
        The decoded first generated sequence, special tokens skipped.
    """
    prompt = "summarize: " + text
    input_ids = tokenizer.encode(prompt, return_tensors='pt', max_length=512, truncation=True)

    # Beam-search settings are fixed; only the length bounds come from the caller.
    generation_options = dict(
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=2,
    )
    generated = model.generate(input_ids, **generation_options)

    return tokenizer.decode(generated[0], skip_special_tokens=True)

def split_text_into_chunks(text, max_chunk_size=1000):
    """Splits text into smaller chunks to fit model input size.

    Newlines are replaced by spaces, the text is split on ``'. '``, and the
    sentences are packed greedily into chunks; each sentence is written back
    followed by ``'. '``.

    ``max_chunk_size`` is a soft limit: the size check ignores the terminator
    that is appended, and a single sentence longer than the limit becomes a
    chunk of its own rather than being cut.

    Args:
        text: Text to split.
        max_chunk_size: Target chunk size in characters.

    Returns:
        list[str]: The chunks in document order. Empty or whitespace-only
        input yields ``[]``; no empty chunk is ever produced.
    """
    sentences = text.replace('\n', ' ').split('. ')
    last_index = len(sentences) - 1
    chunks = []
    current_chunk = ''
    for index, sentence in enumerate(sentences):
        if not sentence.strip():
            # Blank pieces (empty text, trailing '. ') carry nothing to summarize.
            continue
        # The split removed '. ' after every piece except the last one, so
        # only the last piece can still carry its own final period.
        keeps_own_period = index == last_index and sentence.rstrip().endswith('.')
        terminator = ' ' if keeps_own_period else '. '
        # Flush only a chunk that holds text; otherwise an oversize first
        # sentence would emit an empty chunk.
        if current_chunk and len(current_chunk) + len(sentence) > max_chunk_size:
            chunks.append(current_chunk)
            current_chunk = ''
        current_chunk += sentence + terminator
    if current_chunk:
        chunks.append(current_chunk)
    return chunks

def summarize_large_text(text, model, tokenizer, max_combined_chars=1500):
    """Summarizes large text by splitting it into chunks and summarizing the combined summaries.

    Each chunk is summarized briefly and the partial summaries are joined.
    ``summarize_text`` truncates its input to 512 tokens, so for a long
    document the joined summaries would mostly be cut off before the final
    pass. To avoid that, the joined summaries are themselves chunked and
    summarized again until they fit ``max_combined_chars``, and only then is
    the final summary generated.

    Args:
        text: Text to summarize.
        model: Pre-loaded model, forwarded to ``summarize_text``.
        tokenizer: Pre-loaded tokenizer, forwarded to ``summarize_text``.
        max_combined_chars: Size budget, in characters, for the text handed to
            the final pass. It is a rough character proxy for the 512-token
            input limit, not an exact token count.

    Returns:
        The final summary, or ``""`` when ``text`` contains nothing to summarize.
    """
    def _summarize_pieces(source_text):
        # One condensing pass: chunk the text, summarize each chunk briefly,
        # and join the partial summaries. Chunks holding only spaces/periods
        # are dropped because they give the model nothing to summarize.
        pieces = [
            piece for piece in split_text_into_chunks(source_text)
            if piece.strip(' .')
        ]
        partial_summaries = []
        for i, piece in enumerate(pieces):
            print(f"Summarizing chunk {i+1}/{len(pieces)}...")
            # Summarize each chunk into very short summaries
            partial_summaries.append(
                summarize_text(piece, model, tokenizer, max_length=50, min_length=30)
            )
        return ' '.join(partial_summaries)

    combined_summary = _summarize_pieces(text)
    if not combined_summary.strip():
        return ''

    # Keep condensing while the joined summaries exceed the budget. The loop
    # stops as soon as a pass fails to shrink the text, so it always terminates.
    while len(combined_summary) > max_combined_chars:
        condensed = _summarize_pieces(combined_summary)
        if not condensed.strip() or len(condensed) >= len(combined_summary):
            break
        combined_summary = condensed

    # Summarize the combined summaries to get the final summary
    final_summary = summarize_text(combined_summary, model, tokenizer, max_length=130, min_length=65)
    return final_summary

# Load the tokenizer and model once, up front, so the same objects are passed
# to every summarize_* call below instead of being reloaded for each summary.
model_name = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# List of PDF file paths to summarize.
# NOTE(review): these are absolute paths on a local E: drive; anyone running
# the notebook elsewhere has to edit them.
pdf_paths = [
    r"E:\OSPO\bioarxiv\2024.03.01.582919v1.full.pdf",
    r"E:\OSPO\eartharxiv\jones_et_al_2024.pdf",
    r"E:\OSPO\eartharxiv\jones_et_al_2022.pdf",
    r"E:\OSPO\eartharxiv\deeptime-preprint.pdf",
    r"E:\OSPO\evoarxiv\copy-of-pcm-sensitivity-manuscript.pdf"
]

# Process each PDF file in list order: extract its text, then print a summary.
for pdf_path in pdf_paths:
    print(f"\nProcessing file: {pdf_path}")
    text = extract_text_from_pdf(pdf_path)
    # An empty string means no page yielded text; skip summarization then.
    if text:
        summary = summarize_large_text(text, model, tokenizer)
        print("\nSummary:")
        print(summary)
    else:
        print("No text found in the PDF.")



Processing file: E:\OSPO\bioarxiv\2024.03.01.582919v1.full.pdf
Summarizing chunk 1/112...
Summarizing chunk 2/112...
Summarizing chunk 3/112...
Summarizing chunk 4/112...
Summarizing chunk 5/112...
Summarizing chunk 6/112...
Summarizing chunk 7/112...
Summarizing chunk 8/112...
Summarizing chunk 9/112...
Summarizing chunk 10/112...
Summarizing chunk 11/112...
Summarizing chunk 12/112...
Summarizing chunk 13/112...
Summarizing chunk 14/112...
Summarizing chunk 15/112...
Summarizing chunk 16/112...
Summarizing chunk 17/112...
Summarizing chunk 18/112...
Summarizing chunk 19/112...
Summarizing chunk 20/112...
Summarizing chunk 21/112...
Summarizing chunk 22/112...
Summarizing chunk 23/112...
Summarizing chunk 24/112...
Summarizing chunk 25/112...
Summarizing chunk 26/112...
Summarizing chunk 27/112...
Summarizing chunk 28/112...
Summarizing chunk 29/112...
Summarizing chunk 30/112...
Summarizing chunk 31/112...
Summarizing chunk 32/112...
Summarizing chunk 33/112...
Summarizing chunk 34/