<a href="https://colab.research.google.com/github/NiekVerhoeff/workshop/blob/main/rag_on_zipfile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Install packages

!pip install llama-index
!pip install docx2txt
!pip install torch transformers python-pptx Pillow
%pip install llama-index-readers-web
%pip install llama-index-program-openai

In [None]:
#@title Upload a zipfile with documents
#@markdown supported extensions are: .txt .csv .xml .pdf .docx .pptx

from google.colab import files
import zipfile
import os

# Open the Colab upload dialog; the result maps each uploaded filename to its
# contents.
uploaded = files.upload()  # Select and upload the ZIP file

# A cancelled dialog yields an empty mapping; fail with a clear message
# instead of a bare StopIteration from next() below.
if not uploaded:
    raise ValueError("No file was uploaded; please select a ZIP file.")

# Only the first uploaded file is used; any additional uploads are ignored.
zip_filename = next(iter(uploaded.keys()))

# Reject non-ZIP uploads up front with a message that names the file.
if not zipfile.is_zipfile(zip_filename):
    raise zipfile.BadZipFile(f"'{zip_filename}' is not a valid ZIP file.")

# Extract into the current working directory, preserving the archive's own
# folder structure.
with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    zip_ref.extractall()
    # Collect the top-level names so the user can see which folder(s) the
    # archive produced.
    top_level_entries = sorted(
        {name.split('/')[0] for name in zip_ref.namelist() if name}
    )

print("Folder structure has been extracted.")
print("Top-level entries:", ", ".join(top_level_entries))


In [None]:
#@title Initialize things

# Imports and one-time setup used by the cells below. Statement order is left
# as-is because the setup calls are interleaved with the imports.

from llama_index.core import SimpleDirectoryReader
import nest_asyncio

# Patch asyncio so an event loop can be re-entered; presumably required
# because the notebook kernel already runs its own loop.
nest_asyncio.apply()

# NOTE: `os` is also imported in the upload cell; the repeat is harmless.
import os
import openai
from google.colab import userdata
# Read the OpenAI key from the Colab secret named OPENAI_API_KEY rather than
# hard-coding it in the notebook.
openai.api_key = userdata.get('OPENAI_API_KEY')
from pydantic import BaseModel, Field
from typing import List
# NOTE(review): Dict is not referenced by any cell visible in this notebook.
from typing import Dict
from llama_index.program.openai import OpenAIPydanticProgram
from llama_index.core.extractors import PydanticProgramExtractor
from llama_index.core.node_parser import SentenceSplitter

from llama_index.core.ingestion import IngestionPipeline

In [None]:
#@title Load data

# A single recursive reader is enough: a recursive scan already includes the
# files directly inside the input directory. Running an additional
# non-recursive reader over the same directory added those top-level files to
# `all_docs` a second time.
reader = SimpleDirectoryReader(
    input_dir="./test_colab_upload",
    recursive=True,
)

all_docs = []

# iter_data() yields the documents one source file at a time.
for docs in reader.iter_data():
    for doc in docs:
        # NOTE(review): upper-casing the text is kept from the original cell;
        # confirm it is intended, since it changes the text the extractor sees.
        doc.text = doc.text.upper()
        all_docs.append(doc)

# Number of loaded documents, as a quick sanity check.
print(len(all_docs))

In [None]:
#@title Define extracted elements

#@markdown

# Schema of the structured metadata requested for each text chunk.
# The Field descriptions below are instructions for the language model, not
# just documentation, so their wording matters. The class docstring is left
# untouched because it presumably ends up in the generated schema as well.
class NodeMetadata(BaseModel):
    """Node metadata."""

    # One string per entity found in the chunk; each string is expected to be
    # formatted as a Python dict literal (key = entity type, value = entity).
    entities: List[str] = Field(
        ...,
        description=(
            "Maak voor iedere entiteit in dit stuk tekst een string in de vorm van "
            "een valide python dictionary waarbij je de entiteit als value neemt en "
            "een key verzint. Entiteiten die van eenzelfde type zijn, geef je "
            "dezelfde key"
        ),
    )
    # Archival description of the chunk.
    description: str = Field(
        ..., description="Maak een archiefbeschrijving van dit stuk tekst"
    )
    # True when the chunk contains any numeric value.
    contains_number: bool = Field(
        ...,
        description=(
            "Whether the text chunk contains any numbers (ints, floats, etc.)"
        ),
    )

In [None]:
# OpenAIPydanticProgram and PydanticProgramExtractor are imported in the
# "Initialize things" cell.
#@title Setup extractor

# Prompt template for section-level extraction. It is defined here but not
# passed to either object below.
EXTRACT_TEMPLATE_STR = "\n".join(
    [
        "Here is the content of the section:",
        "----------------",
        "{context_str}",
        "----------------",
        "Given the contextual information, extract out a {class_name} object.",
    ]
)

# Program that asks the model for a NodeMetadata object; the prompt is just
# the raw "{input}" placeholder.
openai_program = OpenAIPydanticProgram.from_defaults(
    prompt_template_str="{input}",
    output_cls=NodeMetadata,
)

# Extractor wrapping the program; node text is supplied under the "input" key.
program_extractor = PydanticProgramExtractor(
    program=openai_program,
    input_key="input",
    show_progress=True,
)

In [None]:
# SentenceSplitter and IngestionPipeline are imported in the
# "Initialize things" cell.
#@title Extract away!

# Split documents into chunks with a chunk_size of 1024.
node_parser = SentenceSplitter(chunk_size=1024)

# Chunk first, then run the metadata extractor on every chunk.
pipeline = IngestionPipeline(transformations=[node_parser, program_extractor])

orig_nodes = pipeline.run(documents=all_docs)

# The pipeline has already applied program_extractor to every node, so the
# extracted values are read back from the node metadata. Calling
# program_extractor.extract(orig_nodes) again would repeat every model call.
extracted_field_names = list(NodeMetadata.__annotations__)
sample_entry = [
    {
        field: node.metadata[field]
        for field in extracted_field_names
        if field in node.metadata
    }
    for node in orig_nodes
]
display(sample_entry)