In [None]:
!pip install pypdf
!pip install gpt4all

# Textual Data Curation

In [None]:
import requests
import pypdf
from io import BytesIO
from gpt4all import GPT4All

In [None]:
def download_pdf_from_url(pdf_url, timeout=30):
    """
    Download a PDF over HTTP and return its raw bytes.

    Args:
        pdf_url: URL of the PDF to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can wait indefinitely on an unresponsive host.

    Returns:
        The response body as bytes.

    Raises:
        Exception: If the server answers with any status code other than 200.
        requests.exceptions.RequestException: Propagated from requests.get on
            connection failures or when the timeout expires.
    """
    response = requests.get(pdf_url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to download the PDF: {response.status_code}")
    return response.content

def extract_text_from_pdf(pdf_content):
    """
    Extract the text of every page of an in-memory PDF.

    Args:
        pdf_content: The PDF file as bytes.

    Returns:
        A single string holding the text of each page in order, with a
        newline appended after every page (including the last one).
    """
    pdf_stream = BytesIO(pdf_content)
    document = pypdf.PdfReader(pdf_stream)
    # One "<page text>\n" fragment per page, concatenated in page order.
    return "".join(page.extract_text() + "\n" for page in document.pages)

def clean_text(text):
    """
    Flatten extracted text into a single line with single spaces.

    Newlines become spaces, carriage returns are dropped, and every run of
    whitespace is collapsed to one space; leading and trailing whitespace is
    removed.

    Args:
        text: Raw text, e.g. as extracted from a PDF.

    Returns:
        The cleaned, single-line string ("" for empty or whitespace-only input).
    """
    # Newlines separate words, so turn them into spaces; a "\r" only ever
    # accompanies a "\n" in "\r\n" line endings, so it is simply dropped.
    flattened = text.replace('\n', ' ').replace('\r', '')
    # split() with no argument discards all runs of whitespace, which both
    # collapses repeated spaces and strips the ends in one step.
    cleaned_text = " ".join(flattened.split())
    # You can also add rules to remove unwanted headers, footers, or metadata
    return cleaned_text

pdf_url = "https://www.itu.int/en/ITU-T/focusgroups/an/Documents/Use-case-AN.pdf"
pdf_content = download_pdf_from_url(pdf_url)

In [None]:
pdf_content = clean_text(extract_text_from_pdf(pdf_content))

In [None]:
def split_text_into_chunks(text, max_chars):
    """
    Split the text into consecutive chunks, each within the max_chars limit.

    Chunks are cut purely by character count, so a cut may fall in the middle
    of a word. Every chunk except possibly the last has exactly max_chars
    characters.

    Args:
        text: The string to split.
        max_chars: Maximum number of characters per chunk; must be at least 1.

    Returns:
        A list of chunks in order; an empty list when text is empty.

    Raises:
        ValueError: If max_chars is less than 1. A negative value would
            otherwise produce no chunks at all and silently drop the text.
    """
    if max_chars < 1:
        raise ValueError(f"max_chars must be a positive integer, got {max_chars!r}")
    return [text[start:start + max_chars] for start in range(0, len(text), max_chars)]
def query_gpt4all_for_chunk(chunk, query, model=None):
    """
    Query the GPT4All model for a single chunk of text.

    Args:
        chunk: The piece of document text to give the model as context.
        query: The question to answer from that chunk.
        model: Optional object with a generate(prompt) method (e.g. an
            already-constructed GPT4All instance). When omitted, a default
            "Meta-Llama-3-8B-Instruct.Q4_0.gguf" model is created on the first
            call and reused on later calls.

    Returns:
        Whatever model.generate returns for the assembled prompt.
    """
    if model is None:
        # Constructing GPT4All once per chunk is needlessly expensive, so the
        # default instance is cached on the function object after first use.
        model = getattr(query_gpt4all_for_chunk, "_default_model", None)
        if model is None:
            model = GPT4All(model_name="Meta-Llama-3-8B-Instruct.Q4_0.gguf")
            query_gpt4all_for_chunk._default_model = model
    prompt = f"The following is a chunk of the content of a PDF:\n\n{chunk}\n\nAnswer the following question based on this chunk:\n\n{query}"
    return model.generate(prompt)

# Contextualizing Input for Model Inference

In [None]:
MAX_CHARS = 1000
# NOTE(review): this instance is not passed to query_gpt4all_for_chunk in the
# loop below; it is kept because other cells may rely on `model`.
model = GPT4All(model_name="Meta-Llama-3-8B-Instruct.Q4_0.gguf")
if pdf_content:
    # Cut the cleaned document text into pieces of at most MAX_CHARS characters.
    chunks = split_text_into_chunks(pdf_content, MAX_CHARS)
    chunk_count = len(chunks)

    # Put the same question to the model once per piece, keeping the answers
    # in document order.
    question = "What is the document about?"
    all_responses = []
    for i, chunk in enumerate(chunks):
        print(f"\nProcessing chunk {i + 1}/{chunk_count}...")
        response = query_gpt4all_for_chunk(chunk, question)
        all_responses += [response]

    # Show every per-chunk answer, one per line.
    combined_response = "\n".join(all_responses)
    print("\nCombined GPT-4All Responses:\n", combined_response)

In [None]:
# Ask a second question about the document. The text is re-split here rather
# than reusing `chunks`, which only exists when the earlier cell found content.
question = "What are the technological gaps in this ITU document?"
gpt_response = "\n".join(
    query_gpt4all_for_chunk(chunk, question)
    for chunk in split_text_into_chunks(pdf_content, MAX_CHARS)
)
print('response:', gpt_response)