In [None]:
import openai
from key import api_key

# Hand the key imported from the local `key` module to the openai client.
# Every later cell that calls the OpenAI API relies on this assignment, so this
# cell has to run first; keep the `key` module out of version control.
openai.api_key = api_key

In [None]:
import fitz  # PyMuPDF

def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of a PDF.

    Args:
        file_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        str: The ``get_text()`` output of all pages joined in page order
        (an empty string for a document without pages).

    Raises:
        Any exception raised by ``fitz.open`` or by loading/extracting a
        page. The document is closed before such an exception propagates.
    """
    pdf_document = fitz.open(file_path)
    try:
        # Collect the per-page text and join once instead of growing a string
        # with repeated concatenation.
        page_texts = [
            pdf_document.load_page(page_number).get_text()
            for page_number in range(len(pdf_document))
        ]
    finally:
        # Release the document even when a page fails to load or extract.
        pdf_document.close()
    return "".join(page_texts)

# Example usage: extract the lecture PDF once and keep the result in `pdf_text`.
# Forward slashes are accepted on Windows as well as Linux/macOS, whereas a
# backslash-separated relative path only resolves on Windows.
file_path = "lectures/04_baumbasierteVerfahren.pdf"
pdf_text = extract_text_from_pdf(file_path)
print(pdf_text)

In [None]:
from transformers import GPT2Tokenizer

def break_into_sections(text, max_tokens_per_section, tokenizer=None):
    """Split ``text`` into whitespace-joined sections bounded by a token budget.

    The text is split on whitespace and words are packed greedily: a word is
    added to the current section while the running token count stays strictly
    below ``max_tokens_per_section``; otherwise the current section is closed
    and the word starts a new one. Tokens are counted per word with
    ``tokenizer.tokenize(word)``.

    Args:
        text: The text to split.
        max_tokens_per_section: Token limit; every section that holds more
            than one word has fewer tokens than this. A single word whose own
            token count reaches the limit still becomes a section by itself.
        tokenizer: Optional object exposing ``tokenize(str) -> sequence``.
            When omitted, the pretrained ``'gpt2'`` ``GPT2Tokenizer`` is
            loaded.

    Returns:
        list[str]: Non-empty sections without leading or trailing whitespace,
        in original order. Empty or whitespace-only ``text`` yields ``[]``.
    """
    if tokenizer is None:
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    sections = []
    current_words = []
    token_count = 0

    for word in text.split():
        word_token_count = len(tokenizer.tokenize(word))
        # Close the running section only if it already holds something, so an
        # oversized first word never produces an empty leading section.
        if current_words and token_count + word_token_count >= max_tokens_per_section:
            sections.append(" ".join(current_words))
            current_words = []
            token_count = 0
        current_words.append(word)
        token_count += word_token_count

    # Flush the trailing section; joining the word list keeps it free of the
    # stray leading space that string concatenation would leave behind.
    if current_words:
        sections.append(" ".join(current_words))

    return sections

# Example usage: split the extracted PDF text into sections, using 1700 as the
# per-section token limit passed to break_into_sections.
sections = break_into_sections(pdf_text, max_tokens_per_section=1700)

In [None]:
import time

def summarize_text(text, max_tokens_per_section=1700, retry_wait_seconds=60, max_attempts=5):
    """Summarize ``text`` section by section and join the partial summaries.

    The text is split with ``break_into_sections``; each section is sent to
    ``openai.Completion.create`` and the stripped completion text is kept as
    that section's summary.

    Args:
        text: The full text to summarize.
        max_tokens_per_section: Token limit forwarded to
            ``break_into_sections``.
        retry_wait_seconds: Seconds to sleep after an error whose message
            contains ``'Rate limit reached'`` before trying again.
        max_attempts: Maximum number of API calls per section. A section that
            yields no non-empty summary within this many attempts is skipped.

    Returns:
        str: The non-empty section summaries joined by single spaces; an
        empty string if no section could be summarized.

    Notes:
        ``openai.error.OpenAIError`` instances that are not rate-limit errors
        are printed and the affected section is skipped (best effort). Other
        exception types propagate. Progress is printed after every section.
    """
    sections = break_into_sections(text, max_tokens_per_section=max_tokens_per_section)
    summaries = []

    for section in sections:
        summary = ""
        # Bounded retries: neither a persistent rate limit nor a model that
        # keeps returning empty completions can stall the loop indefinitely.
        for _attempt in range(max_attempts):
            try:
                response = openai.Completion.create(
                    engine="gpt-3.5-turbo-instruct",
                    prompt=f"{section} \n\n This part can be summarized as follows:",
                    max_tokens=200
                )
            except openai.error.OpenAIError as e:
                if 'Rate limit reached' in str(e):
                    print(f'Rate limit exceeded. Waiting for {retry_wait_seconds} seconds...')
                    # Sleep for exactly the duration announced above.
                    time.sleep(retry_wait_seconds)
                    continue
                print(f'An error occurred: {e}')
                break  # Give up on this section for any other API error
            summary = response['choices'][0]['text'].strip()
            if summary:
                # Only non-empty summaries are recorded, so the joined result
                # carries no blank entries.
                summaries.append(summary)
                break
        print(f'Summarized {len(summaries)} out of {len(sections)} sections')
        print(f'Current summary: {summary}')

    return " ".join(summaries)

# Summarize the extracted PDF text and print the result; `final_summary` is the
# input of the summarize_final call in a later cell.
final_summary = summarize_text(pdf_text)
print(final_summary)


In [None]:

def summarize_final(summary, num_bullets=10, max_tokens=600):
    """Condense a summary into a bullet list via ``openai.Completion.create``.

    Args:
        summary: The text to condense; it is placed in front of the
            instruction sentence of the prompt.
        num_bullets: Number of bullets requested in the prompt.
        max_tokens: Completion token limit passed to the API.

    Returns:
        str: The stripped text of the first completion choice, or an empty
        string when ``summary`` is empty or whitespace-only (no API call is
        made in that case, since there is nothing to condense).
    """
    if not summary or not summary.strip():
        return ""

    response = openai.Completion.create(
        engine="gpt-3.5-turbo-instruct",
        prompt=f"{summary} \n \n Overall one can summarize the text in the following {num_bullets} bullets each having subpoints to elaborate further:",
        max_tokens=max_tokens
    )
    # Only the first choice is used; the raw choices list is not printed here,
    # callers decide what to display.
    return response['choices'][0]['text'].strip()

In [None]:
# Turn the joined section summaries into the final bullet list and print it.
bullets = summarize_final(final_summary)
print(bullets)

In [None]:


def summarize_text(text):
    """Summarize ``text`` with one ``openai.Completion.create`` request.

    The complete text is embedded in a single prompt for the ``davinci``
    engine and the completion is limited to 300 tokens.

    NOTE(review): this definition reuses the name of the section-wise
    ``summarize_text`` defined earlier in this notebook, so once this cell has
    run it replaces that version for every later call. The text is also sent
    without any chunking -- presumably long inputs exceed the model's context
    window; confirm before relying on this variant.

    Args:
        text: The text to summarize.

    Returns:
        str: The stripped text of the first completion choice.
    """
    request = {
        "engine": "davinci",
        "prompt": f"Summarize the following text:\n{text}",
        "max_tokens": 300,  # Upper bound on the summary length
    }
    completion = openai.Completion.create(**request)
    first_choice = completion['choices'][0]
    return first_choice['text'].strip()

# Example usage: summarize the extracted PDF text and print the result.
# NOTE(review): with the single-request summarize_text defined directly above
# in effect, the whole `pdf_text` goes into one prompt -- confirm the input is
# short enough for that.
summary = summarize_text(pdf_text)
print(summary)