In [10]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import HuggingFaceLLM

# setup prompts - specific to StableLM
from llama_index.prompts import PromptTemplate

# This will wrap the default prompts that are internal to llama-index
# taken from https://huggingface.co/Writer/camel-5b-hf
query_wrapper_prompt = PromptTemplate(
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{query_str}\n\n### Response:"
)

In [3]:
import os
import openai

# os.environ["OPENAI_API_KEY"] = "YOUR_API_KEY"
# openai.api_key = os.environ["OPENAI_API_KEY"]
from llama_index.embeddings import OpenAIEmbedding
from llama_index import ServiceContext, set_global_service_context

embed_model = OpenAIEmbedding(embed_batch_size=10)

service_context = ServiceContext.from_defaults(embed_model=embed_model)

# optionally set a global service context
set_global_service_context(service_context)

In [4]:
from dotenv import load_dotenv
load_dotenv()

from llama_index.llms import OpenAI

llm = OpenAI(temperature=0,
             api_key='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')

In [5]:
# documents = SimpleDirectoryReader('C:/Users/user/Desktop/업무/경상업무/녹음파일/베타테스트_인터뷰/인터뷰STT/').load_data()
documents = SimpleDirectoryReader(
    input_files=["/Users/jeonjunhwi/문서/Projects/GNN_Covid/refference/GNN논문/A GNN RNN Approach for Harnessing Geospatial and Temporal Information Application to Crop Yeild Prediction.pdf"]
).load_data()
    
service_context = ServiceContext.from_defaults(chunk_size=256, llm=llm)
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context
)
index.storage_context.persist(persist_dir="./storage/march")

query_engine = index.as_query_engine(service_context=service_context,
                                     similarity_top_k=10)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [15]:
from llama_index import StorageContext
from llama_index.vector_stores.faiss import FaissVectorStore
import faiss
# dimensions of text-ada-embedding-002
d = 384
faiss_index = faiss.IndexFlatL2(d)
vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    docs, storage_context=storage_context
)
index.storage_context.persist(persist_dir="../Retrieval_Database/test_pdf_faiss")

AttributeError: type object 'VectorStoreIndex' has no attribute 'from_persist_dir'

In [None]:
from llama_index import (
    load_index_from_storage,
    StorageContext,
)
# load index from disk
vector_store = FaissVectorStore.from_persist_dir("../Retrieval_Database/test_pdf_faiss")
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
    persist_dir="../Retrieval_Database/test_pdf_faiss"
)
index = load_index_from_storage(storage_context=storage_context)

In [13]:
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import IndexNode

node_parser=SimpleNodeParser.from_defaults(chunk_size=256)
base_nodes = node_parser.get_nodes_from_documents(documents)

sub_chunk_sizes = [64, 128]
sub_node_parsers = [
    SimpleNodeParser.from_defaults(chunk_size=c, chunk_overlap=0) for c in sub_chunk_sizes
]

all_nodes = []
for base_node in base_nodes:
    for n in sub_node_parsers:
        sub_nodes = n.get_nodes_from_documents([base_node])
        sub_inodes = [
            IndexNode.from_text_node(sn, base_node.node_id) for sn in sub_nodes
        ]
        all_nodes.extend(sub_inodes)

    # also add original node to node
    original_node = IndexNode.from_text_node(base_node, base_node.node_id)
    all_nodes.append(original_node)

ValueError: Metadata length (70) is longer than chunk size (64). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.

In [None]:
all_nodes_dict = {n.node_id: n for n in all_nodes}
len(all_nodes_dict)

In [None]:
vector_index_chunk = VectorStoreIndex(
    all_nodes, service_context=service_context
)

In [None]:
from llama_index import Document
# docs = [Document(text=d.get_content(), metadata=d.metadata.pop()) for d in docs]
docs = []
for d in all_nodes:
    metadatas_={}
    metadatas_['page_label'] = d.metadata['page_label']
    metadatas_['file_path'] = d.metadata['file_path']
    docs.append(Document(text=d.get_content(), metadata=metadatas_))#, id=d.id_, index_id=d.index_id))

In [None]:
from llama_index import StorageContext
from llama_index.vector_stores.faiss import FaissVectorStore
import faiss
# dimensions of text-ada-embedding-002
d = 384
faiss_index = faiss.IndexFlatL2(d)
vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    docs, storage_context=storage_context
)
index.storage_context.persist(persist_dir="../Retrieval_Database/test_pdf_faiss")

In [None]:
vector_retriever_chunk = index.as_retriever(similarity_top_k=10)
vector_retriever_chunk = vector_index_chunk.as_retriever(similarity_top_k=10)
retriever_chunk = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": vector_retriever_chunk},
    node_dict=all_nodes_dict,
    verbose=False,
)

In [None]:
from llama_index import (
    load_index_from_storage,
    StorageContext,
)
# load index from disk
vector_store = FaissVectorStore.from_persist_dir("../Retrieval_Database/test_pdf_faiss")
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
    persist_dir="../Retrieval_Database/test_pdf_faiss"
)
index = load_index_from_storage(storage_context=storage_context)

In [6]:
res = query_engine.query('what is dropedge?')

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [9]:
res.response

'DropEdge is not mentioned in the provided context information.'

# small to big retrieval

# bm25

In [21]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().handlers = []
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index import (
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.retrievers import BM25Retriever
from llama_index.indices.vector_store.retrievers.retriever import (
    VectorIndexRetriever,
)
from llama_index.llms import OpenAI
nodes = service_context.node_parser.get_nodes_from_documents(documents)
retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=2)

In [22]:
from llama_index.response.notebook_utils import display_source_node

# will retrieve context from specific companies
nodes = retriever.retrieve("What happened at Viaweb and Interleaf?")
for node in nodes:
    display_source_node(node)

**Node ID:** c090ab70-2815-40ff-a52f-7cb34361fd4d<br>**Similarity:** 3.7285354557593466<br>**Text:** Soil Survey Staff. 2020. Gridded Soil Survey Geographic
(gSSURGO) Database for the Conterminous U...<br>

**Node ID:** fb0ed3ff-66b4-451e-81c1-60e626698b98<br>**Similarity:** 2.976186845261263<br>**Text:** yield prediction. Computers and Electronics in Agriculture,
127: 467–474.
Nevavuori, P.; Narra, N...<br>

In [23]:
nodes = retriever.retrieve("What is Graph Neural Network?")
for node in nodes:
    display_source_node(node)

**Node ID:** 35125356-2ad1-41d1-b7de-e1e60a6a2741<br>**Similarity:** 9.748103849921955<br>**Text:** Figure 1:
Left: The CNN model used for per-year embedding extraction. Right: Our overall GNN-RNN ...<br>

**Node ID:** fb0ed3ff-66b4-451e-81c1-60e626698b98<br>**Similarity:** 3.9001567486205326<br>**Text:** yield prediction. Computers and Electronics in Agriculture,
127: 467–474.
Nevavuori, P.; Narra, N...<br>

## node id 커스터마이징 할 수 있어야 함

In [8]:
import os
from dotenv import load_dotenv
load_dotenv() # It must be before llama_index import
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, StorageContext, load_index_from_storage

def build_storage(data_dir, persist_dir):
    documents = SimpleDirectoryReader(data_dir).load_data()
    index = GPTVectorStoreIndex.from_documents(documents)
    index.storage_context.persist(persist_dir)
    return index

def read_from_storage(persist_dir):
    storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
    return load_index_from_storage(storage_context)

def adding_data_to_GPT():
    persist_dir = "./storage"
    data_dir = "./data"
    index = None
    if os.path.exists(persist_dir):
        index = read_from_storage(persist_dir)
    else:
        index = build_storage(data_dir, persist_dir)
        query_engine = index.as_query_engine()

    response = query_engine.query(
        "When did Ran Bar-Zik create his first pull request in CyberArk?"
    )
    print(response)


if __name__ == "__main__":
    adding_data_to_GPT()

JSONDecodeError: Expecting value: line 1 column 1 (char 0)