In [1]:
import os
import logging
import sys
import nest_asyncio
import json

In [2]:
from dotenv import load_dotenv
from llama_index.llms.openai import OpenAI
from llama_index.core import (
    Settings,
    Document, get_response_synthesizer,
    KnowledgeGraphIndex, PropertyGraphIndex,
    StorageContext
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.graph_stores.neo4j import Neo4jGraphStore

In [None]:
cd ../

In [4]:
# Root logger: INFO and above, written to stdout so records show up in cell output.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

# Per-library levels: llama_index at INFO, httpx limited to WARNING and above.
logging.getLogger("llama_index").setLevel(logging.INFO)
logging.getLogger("httpx").setLevel(logging.WARNING)

# Apply the nest_asyncio patch (presumably so async library code can run inside the
# notebook's already-running event loop).
nest_asyncio.apply()

In [5]:
# Load variables from a local .env file (if one exists) into the process environment
# before any client is built. `load_dotenv` is imported at the top of the notebook but
# was never called, so on a fresh kernel the credentials read later via os.environ
# (NEO4J_*) were only present if they had already been exported in the shell.
load_dotenv()

# gpt-3.5-turbo with temperature=0 (minimal sampling randomness), registered as the
# LLM on the shared Settings object.
llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
Settings.llm = llm

# Chunk size configured on the shared Settings object.
Settings.chunk_size = 512

In [None]:
# Build the Neo4j graph store from environment variables. Each value is read with
# os.environ[...], so a missing variable raises KeyError. The mapping is expanded
# inline so that no extra notebook-level name ends up holding the password.
graph_store = Neo4jGraphStore(
    **{
        keyword: os.environ[env_var]
        for keyword, env_var in (
            ("username", "NEO4J_USERNAME"),
            ("password", "NEO4J_PASSWORD"),
            ("url", "NEO4J_URI"),
        )
    }
)

In [None]:
# WARNING: destructive. The Cypher statement matches every node and detach-deletes it,
# i.e. it empties the graph in the database that graph_store points at.
graph_store.query("\nMATCH (n) DETACH DELETE n\n")

In [9]:
# Default storage context, with the Neo4j store plugged in as its graph store.
storage_context = StorageContext.from_defaults(
    graph_store=graph_store,
)

In [10]:
# Collect the text of every document listed in the JSONL annotations file.
# Each non-blank line is parsed as one JSON record with a 'documents' list; the
# "text" field of every item in that list is appended to `data`.
data = []
# Explicit UTF-8: without it the file is decoded with the platform's locale encoding.
with open('data/annotations.jsonl', 'r', encoding='utf-8') as f:
    for raw_line in f:
        if not raw_line.strip():
            # json.loads rejects whitespace-only input; tolerate blank lines
            # (for example a trailing empty line at the end of the file).
            continue
        record = json.loads(raw_line)
        data.extend(doc["text"] for doc in record['documents'])

In [None]:
# Display how many entries `data` currently holds.
len(data)

In [None]:
# Wrap each raw text in a llama_index Document. `data` is rebound in place (the next
# cell reads it under this name), so entries that are already Document objects are
# kept as they are; this lets the cell be re-run without feeding a Document into
# Document(text=...).
data = [
    item if isinstance(item, Document) else Document(text=item)
    for item in data
]
# Display the number of documents.
len(data)

In [13]:
# Sentence splitter configured with "." as its separator.
# NOTE(review): no chunk_size is passed here; confirm whether the notebook's
# Settings.chunk_size = 512 is expected to apply to this splitter as well.
splitter = SentenceSplitter(
    separator=".",
)

In [None]:
# Split the documents into nodes (with a progress bar), then display how many
# nodes were produced.
chunks = splitter.get_nodes_from_documents(
    documents=data,
    show_progress=True,
)
len(chunks)

In [None]:
# Build the knowledge graph index from the first 1000 chunks only, using the storage
# context defined earlier in the notebook. Embeddings are not included
# (include_embeddings=False); a progress bar is shown while the index is built.
index = KnowledgeGraphIndex(
    nodes=chunks[:1000],
    show_progress=True,
    include_embeddings=False,
    storage_context=storage_context,
)

In [12]:
# First query engine: source text is not included (include_text=False) and the
# response is synthesised in "tree_summarize" mode.
query_engine = index.as_query_engine(
    response_mode="tree_summarize",
    include_text=False,
)

In [13]:
# Run a sample question through the current query engine and keep the result.
response = query_engine.query(
    "Tell me more about Interleaf"
)

In [None]:
# Display the response object returned by the preceding query.
response

In [15]:
# Second query engine (replaces the previous `query_engine`): source text is included,
# the response is synthesised in "tree_summarize" mode, and retrieval is requested in
# "hybrid" embedding mode with similarity_top_k=5.
# NOTE(review): the index in this notebook is created with include_embeddings=False;
# confirm that embedding_mode="hybrid" and similarity_top_k have the intended effect
# when no embeddings were stored.
query_engine = index.as_query_engine(
    similarity_top_k=5,
    embedding_mode="hybrid",
    response_mode="tree_summarize",
    include_text=True,
)

In [None]:
# Ask a more specific question through the current query engine. The question is
# written as two adjacent literals that Python joins into one string.
response = query_engine.query(
    "Tell me more about what the author "
    "worked on at Interleaf"
)

In [None]:
# Display the response object returned by the preceding query.
response