In [None]:
from transformers import pipeline
import pdfplumber

# Load a pre-trained open-source model
# Replace with your choice of free LLM (e.g., 'EleutherAI/gpt-j-6B' or 'meta-llama/Llama-2-7b-chat-hf')
summarizer = pipeline("text2text-generation", model="google/flan-t5-base")

# Function to extract text and process in batches
def process_pdf_with_free_llm(pdf_path, batch_size=2):
    results = []
    with pdfplumber.open(pdf_path) as pdf:
        text_batches = [
            "\n".join([pdf.pages[i].extract_text() for i in range(start, min(start + batch_size, len(pdf.pages)))])
            for start in range(0, len(pdf.pages), batch_size)
        ]

    for idx, batch in enumerate(text_batches):
        # Prompt for the open-source model
        prompt = f"""
        Extract and organize the text by speaker. 
        Summarize each speaker's points, and analyze the overall topic. 
        Input:
        {batch[::-1]}
        """
        result = summarizer(prompt, max_length=1024, truncation=True)[0]["generated_text"]
        results.append({"batch": idx + 1, "result": result})
    return results

# Process the uploaded PDF
pdf_path = "downloads/25_ptv_5176143.pdf"
output = process_pdf_with_free_llm(pdf_path)

# Save or display the output
for item in output:
    print(f"Batch {item['batch']}:\n{item['result']}\n")


In [35]:
output

[{'batch': 1,
  'result': '                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               '},
 {'batch': 2,
  'result': '                                                                                                                                                                                                                                                                                                                                                                                                                                               