In [2]:
!pip install llama-index llama-index-graph-stores-neo4j graspologic numpy==1.24.4 scipy==1.12.0 future

Collecting llama-index
  Using cached llama_index-0.12.34-py3-none-any.whl.metadata (12 kB)
Collecting llama-index-graph-stores-neo4j
  Using cached llama_index_graph_stores_neo4j-0.4.6-py3-none-any.whl.metadata (694 bytes)
Collecting graspologic
  Using cached graspologic-3.4.1-py3-none-any.whl.metadata (5.8 kB)
Collecting numpy==1.24.4
  Downloading numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl.metadata (5.6 kB)
Collecting scipy==1.12.0
  Downloading scipy-1.12.0-cp310-cp310-macosx_12_0_arm64.whl.metadata (112 kB)
Collecting future
  Using cached future-1.0.0-py3-none-any.whl.metadata (4.0 kB)
Collecting llama-index-agent-openai<0.5,>=0.4.0 (from llama-index)
  Using cached llama_index_agent_openai-0.4.6-py3-none-any.whl.metadata (727 bytes)
Collecting llama-index-cli<0.5,>=0.4.1 (from llama-index)
  Using cached llama_index_cli-0.4.1-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.13,>=0.12.34 (from llama-index)
  Downloading llama_index_core-0.12.34.post1-py3-none

In [1]:
import pandas as pd
from llama_index.core import Document

# Download the news-article CSV and keep only its first 50 rows.
news = pd.read_csv(
    "https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/news_articles.csv"
).head(50)
news.head()

Unnamed: 0,title,date,text
0,Chevron: Best Of Breed,2031-04-06T01:36:32.000000000+00:00,JHVEPhoto Like many companies in the O&G secto...
1,FirstEnergy (NYSE:FE) Posts Earnings Results,2030-04-29T06:55:28.000000000+00:00,FirstEnergy (NYSE:FE – Get Rating) posted its ...
2,Dáil almost suspended after Sinn Féin TD put p...,2023-06-15T14:32:11.000000000+00:00,The Dáil was almost suspended on Thursday afte...
3,Epic’s latest tool can animate hyperrealistic ...,2023-06-15T14:00:00.000000000+00:00,"Today, Epic is releasing a new tool designed t..."
4,"EU to Ban Huawei, ZTE from Internal Commission...",2023-06-15T13:50:00.000000000+00:00,The European Commission is planning to ban equ...


In [7]:
# One Document per article, with the text formatted as "<title>: <text>".
documents = [
    Document(text=f"{title}: {body}")
    for title, body in zip(news["title"], news["text"])
]
print(documents[0])

Doc ID: 11fc9238-eaec-466a-bf41-607cf8dd6498
Text: Chevron: Best Of Breed: JHVEPhoto Like many companies in the O&G
sector, the stock of Chevron (NYSE:CVX) has declined about 10% over
the past 90-days despite the fact that Q2 consensus earnings estimates
have risen sharply (~25%) during that same time frame. Over the years,
Chevron has kept a very strong balance sheet. That allowed the...


In [2]:
from llama_index.llms.ollama import Ollama

# LLM client for the "llama3" model served at localhost:11434, with a 120-second request timeout.
llm = Ollama(model="llama3", base_url="http://localhost:11434", request_timeout=120.0)


In [4]:
import asyncio
import nest_asyncio

# Patch asyncio so that asyncio.run() can be used while the notebook's own
# event loop is already running.
nest_asyncio.apply()

from typing import Any, List, Callable, Optional, Union, Dict
from IPython.display import Markdown, display

from llama_index.core.async_utils import run_jobs
from llama_index.core.indices.property_graph.utils import (
    default_parse_triplets_fn,
)
from llama_index.core.graph_stores.types import (
    EntityNode,
    KG_NODES_KEY,
    KG_RELATIONS_KEY,
    Relation,
)
from llama_index.core.llms.llm import LLM
from llama_index.core.prompts import PromptTemplate
from llama_index.core.prompts.default_prompts import (
    DEFAULT_KG_TRIPLET_EXTRACT_PROMPT,
)
from llama_index.core.schema import TransformComponent, BaseNode
from llama_index.core.bridge.pydantic import BaseModel, Field

class GraphRAGExtractor(TransformComponent):
    """Transform component that extracts entities and relationships with an LLM.

    For each input node the LLM is prompted with ``extract_prompt``, the
    response is passed to ``parse_fn``, and the resulting entities and
    relations are stored in the node's metadata under ``KG_NODES_KEY`` and
    ``KG_RELATIONS_KEY``.
    """

    # LLM used to run the extraction prompt.
    llm: LLM
    # Prompt template formatted with `text` and `max_knowledge_triplets`.
    extract_prompt: PromptTemplate
    # Callable that turns the raw LLM response into (entities, relationships).
    parse_fn: Callable
    # Number of concurrent extraction jobs handed to `run_jobs`.
    num_workers: int
    # Value passed to the prompt as `max_knowledge_triplets`.
    max_paths_per_chunk: int

    def __init__(
        self,
        llm: Optional[LLM] = None,
        extract_prompt: Optional[Union[str, PromptTemplate]] = None,
        parse_fn: Callable = default_parse_triplets_fn,
        max_paths_per_chunk: int = 10,
        num_workers: int = 4,
    ) -> None:
        """Configure the extractor.

        Args:
            llm: LLM to use; falls back to ``Settings.llm`` when omitted.
            extract_prompt: Prompt as a string or ``PromptTemplate``; falls
                back to ``DEFAULT_KG_TRIPLET_EXTRACT_PROMPT`` when omitted.
            parse_fn: Parser applied to the LLM response. ``_aextract``
                unpacks its result as ``(entities, relationships)``.
                NOTE(review): confirm the default parser returns that shape;
                if it returns a flat list of triplets, pass a custom parser.
            max_paths_per_chunk: Upper bound given to the prompt as
                ``max_knowledge_triplets``.
            num_workers: Number of concurrent extraction jobs.
        """
        from llama_index.core import Settings

        if isinstance(extract_prompt, str):
            extract_prompt = PromptTemplate(extract_prompt)

        super().__init__(
            llm=llm or Settings.llm,
            extract_prompt=extract_prompt or DEFAULT_KG_TRIPLET_EXTRACT_PROMPT,
            parse_fn=parse_fn,
            num_workers=num_workers,
            max_paths_per_chunk=max_paths_per_chunk,
        )

    @classmethod
    def class_name(cls) -> str:
        """Return the component name string."""
        return "GraphExtractor"

    def __call__(
        self, nodes: List[BaseNode], show_progress: bool = False, **kwargs: Any
    ) -> List[BaseNode]:
        """Synchronous entry point: run ``acall`` to completion via ``asyncio.run``."""
        return asyncio.run(
            self.acall(nodes, show_progress=show_progress, **kwargs)
        )

    async def acall(
        self, nodes: List[BaseNode], show_progress: bool = False, **kwargs: Any
    ) -> List[BaseNode]:
        """Extract from every node concurrently, using ``num_workers`` workers.

        Args:
            nodes: Nodes to process.
            show_progress: Forwarded to ``run_jobs``.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The result of ``run_jobs`` over one ``_aextract`` job per node.
        """
        jobs = [self._aextract(node) for node in nodes]
        return await run_jobs(
            jobs,
            workers=self.num_workers,
            show_progress=show_progress,
            desc="Extracting paths from text",
        )

    async def _aextract(self, node: BaseNode) -> BaseNode:
        """Extract entities/relations from one node and store them in its metadata.

        A ``ValueError`` raised while prompting or parsing is treated as
        "nothing extracted" so a single bad response does not abort the batch.

        Returns:
            The same node, with ``KG_NODES_KEY`` and ``KG_RELATIONS_KEY`` set
            in ``node.metadata``.
        """
        assert hasattr(node, "text")

        text = node.get_content(metadata_mode="llm")
        try:
            llm_response = await self.llm.apredict(
                self.extract_prompt,
                text=text,
                max_knowledge_triplets=self.max_paths_per_chunk,
            )
            entities, entities_relationship = self.parse_fn(llm_response)
        except ValueError:
            entities = []
            entities_relationship = []

        # Keep anything an earlier extractor already attached to this node.
        existing_nodes = node.metadata.pop(KG_NODES_KEY, [])
        existing_relations = node.metadata.pop(KG_RELATIONS_KEY, [])
        base_metadata = node.metadata.copy()

        # Assumes each entity is (name, type, description) - TODO confirm
        # against the parse_fn in use.
        for entity, entity_type, description in entities:
            existing_nodes.append(
                EntityNode(
                    name=entity,
                    label=entity_type,
                    properties={**base_metadata, "entity_description": description},
                )
            )

        # Assumes each relationship is (subject, relation, object, description)
        # - TODO confirm against the parse_fn in use.
        for subj, rel, obj, description in entities_relationship:
            subj_node = EntityNode(name=subj, properties=base_metadata.copy())
            obj_node = EntityNode(name=obj, properties=base_metadata.copy())
            rel_node = Relation(
                label=rel,
                source_id=subj_node.id,
                target_id=obj_node.id,
                properties={
                    **base_metadata,
                    "relationship_description": description,
                },
            )
            existing_nodes.extend([subj_node, obj_node])
            existing_relations.append(rel_node)

        node.metadata[KG_NODES_KEY] = existing_nodes
        node.metadata[KG_RELATIONS_KEY] = existing_relations
        return node


    
    