In [1]:
import os
from typing import List

import instructor
from dotenv import load_dotenv
from nltk import sent_tokenize
from openai import OpenAI
from pydantic import BaseModel, Field

In [2]:
# Load environment variables before any client is configured:
# configure_system() reads OPENAI_API_KEY via os.getenv and raises if it is missing.
load_dotenv()

True

In [3]:
# Response schema for one extracted entity; configure_system requests
# List[Entity] as the structured output of its entity-extraction call.
# NOTE(review): documented with comments rather than a class docstring so the
# schema generated from this model stays exactly as before.
class Entity(BaseModel):
    # Required field: the (short) surface name of the entity.
    entity: str = Field(
        default=...,
        description=(
            "An entity in the text. Keep it short and simple. "
            "Pronouns should be resolved before adding it to the list of entities"
        ),
    )


# Response schema for one directed relation between two entities;
# configure_system requests List[EntityRelations] from its relation-extraction
# call, and KnowledgeGraphBuilder reads all three attributes when rendering.
# NOTE(review): documented with comments rather than a class docstring so the
# schema generated from this model stays exactly as before.
class EntityRelations(BaseModel):
    # Required: source side of the relation.
    first_entity: str = Field(
        default=...,
        description="The first entity in the relationship.",
    )
    # Required: the connecting verb.
    relationship: str = Field(
        default=...,
        description="The relationship between the two entities, should be a verb.",
    )
    # Required: target side of the relation.
    second_entity: str = Field(
        default=...,
        description="The second entity in the relationship.",
    )


class KnowledgeGraphBuilder:
    """Collect entity relations from text and render them as a Mermaid graph.

    Both extractors are injected callables:

    * ``entity_extractor(text=...)`` must return an iterable of objects with an
      ``entity`` attribute (a string).
    * ``relations_extractor(text=..., existing_entities=...)`` must return an
      iterable of objects with ``first_entity``, ``relationship`` and
      ``second_entity`` attributes. ``existing_entities`` is passed as a single
      comma-joined string of the known entity names.

    Accepted relations accumulate in ``self.relations`` across calls; create a
    new builder to start from an empty graph.
    """

    def __init__(self, entity_extractor, relations_extractor):
        self.entity_extractor = entity_extractor
        self.relations_extractor = relations_extractor
        self.relations = []

    def extract_entities(self, text):
        """Return whatever the injected entity extractor produces for ``text``."""
        return self.entity_extractor(text=text)

    def extract_and_add_relations(self, text, entities):
        """Extract relations sentence by sentence and keep the well-grounded ones.

        ``text`` is split with ``sent_tokenize`` and the relations extractor is
        called once per sentence. A relation is appended to ``self.relations``
        only when both of its endpoints are names present in ``entities``.
        """
        entity_names = [entity.entity for entity in entities]
        # Set for O(1) membership tests; the joined string is loop-invariant.
        known_entities = set(entity_names)
        existing_entities = ",".join(entity_names)

        for sentence in sent_tokenize(text):
            response = self.relations_extractor(
                text=sentence, existing_entities=existing_entities
            )
            for entity_rel in response:
                if (
                    entity_rel.first_entity in known_entities
                    and entity_rel.second_entity in known_entities
                ):
                    self.relations.append(entity_rel)

    @staticmethod
    def _mermaid_text(text):
        """Return ``text`` as a double-quoted Mermaid string on a single line."""
        # Collapse newlines/tabs (the diagram is line-based) and escape embedded
        # double quotes with Mermaid's entity code.
        single_line = " ".join(str(text).split())
        return '"' + single_line.replace('"', "#quot;") + '"'

    @classmethod
    def _mermaid_node(cls, name):
        """Return ``id["label"]`` for an entity: a safe id plus the readable name."""
        # Only letters and digits are kept in the id; everything else (spaces,
        # dots, brackets, quotes, ...) becomes "_" so it cannot be parsed as
        # Mermaid syntax. The original name is shown via the quoted label.
        node_id = "".join(ch if ch.isalnum() else "_" for ch in str(name)) or "_"
        return f"{node_id}[{cls._mermaid_text(name)}]"

    def generate_mermaid(self):
        """Render ``self.relations`` as a fenced Mermaid ``graph TD`` block.

        Each distinct relation becomes one ``source-->|"label"|target`` line;
        exact duplicates are emitted once, in first-seen order.
        """
        mermaid_code = ["```mermaid", "graph TD;"]
        emitted = set()
        for relation in self.relations:
            source = self._mermaid_node(relation.first_entity)
            target = self._mermaid_node(relation.second_entity)
            label = self._mermaid_text(relation.relationship)
            edge = f"    {source}-->|{label}|{target}"
            if edge not in emitted:
                emitted.add(edge)
                mermaid_code.append(edge)
        mermaid_code.append("```")
        return "\n".join(mermaid_code)

    def save_mermaid_to_file(self, filename="knowledge_graph.md"):
        """Write the Mermaid diagram to ``filename`` (overwriting) and report it."""
        mermaid_code = self.generate_mermaid()
        # Explicit UTF-8 so non-ASCII entity names do not depend on the
        # platform's default encoding.
        with open(filename, "w", encoding="utf-8") as f:
            f.write(mermaid_code)
        print(f"Mermaid diagram saved to {filename}")


def configure_system(model: str = "gpt-4o-mini"):
    """Build the two LLM-backed extractor callables used by KnowledgeGraphBuilder.

    Args:
        model: Name of the chat model passed to every completion request.

    Returns:
        A ``(entity_extractor, relations_extractor)`` tuple of functions.

    Raises:
        ValueError: If ``OPENAI_API_KEY`` is not set in the environment.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("API key for OpenAI is not set in environment variables.")
    # Hand the validated key to the client explicitly instead of relying on the
    # client to look it up again.
    llm = instructor.patch(OpenAI(api_key=api_key))

    def entity_extractor(text):
        """Ask the model for the entities in ``text``; returns ``List[Entity]``."""
        entities = llm.chat.completions.create(
            model=model,
            response_model=List[Entity],
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"Extract entities from the text: {text}"},
            ],
        )
        return entities

    def relations_extractor(text, existing_entities):
        """Ask the model for relations in ``text`` among ``existing_entities``.

        ``existing_entities`` may be an already comma-joined string (which is
        what KnowledgeGraphBuilder passes) or an iterable of entity names.
        Returns ``List[EntityRelations]``.
        """
        # Joining a str would iterate its characters and yield "D,a,r,i,o,...",
        # so only join when given a real collection of names.
        if isinstance(existing_entities, str):
            all_entities = existing_entities
        else:
            all_entities = ",".join(existing_entities)
        entities_relations = llm.chat.completions.create(
            model=model,
            response_model=List[EntityRelations],
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": f"Extract relationships from the text: {text} with entities: {all_entities}",
                },
            ],
        )
        return entities_relations

    return entity_extractor, relations_extractor

In [5]:
# Build fresh extractors and a new builder so this run starts with an empty
# relations list (KnowledgeGraphBuilder accumulates relations per instance).
entity_extractor, relations_extractor = configure_system()
builder = KnowledgeGraphBuilder(entity_extractor, relations_extractor)

# Input document for this run: a biography of Dario Amodei.
text = """
Dario Amodei

Born: 1983 (age 40-41)
Citizenship: USA
Alma mater: Caltech, Stanford University, Princeton University (PhD)
Known for: Co-founder / CEO of Anthropic

Scientific career:
Fields: Artificial intelligence
Institutions: Stanford University School of Medicine, OpenAI, Anthropic
Thesis: "Network-Scale Electrophysiology: Measuring and Understanding the Collective Behavior of Neural Circuits" (2011)
Doctoral advisors: Michael J. Berry, William Bialek

Dario Amodei (born 1983) is an Italian-American artificial intelligence researcher and entrepreneur. He is the co-founder and CEO of Anthropic, the company behind the large language model series Claude AI. He was previously the vice president of research at OpenAI.

Education:
Dario grew up in San Francisco and graduated from Lowell High School. Amodei began his undergraduate studies at Caltech, where he worked with Tom Tombrello as one of Tombrello's Physics 11 students. He later transferred to Stanford University, where he earned his undergraduate degree in physics. He also holds a PhD in physics from Princeton University, where he studied electrophysiology of neural circuits. He was a postdoctoral scholar at the Stanford University School of Medicine.

Career:
From November 2014 until October 2015, he worked at Baidu. After that, he worked at Google. In 2016, Amodei joined OpenAI.

In 2021, Amodei and his sister Daniela founded Anthropic along with other former senior members of OpenAI. The Amodei siblings were among those who left OpenAI due to directional differences, specifically regarding OpenAI's ventures with Microsoft in 2019.

In July 2023, Amodei warned a United States Senate judiciary panel of the dangers of AI, including the risks it poses in the development and control of weaponry.

In September 2023, Amodei and his sister Daniela were named as two of the TIME 100 Most Influential People in AI (TIME100 AI).

In November 2023, the board of directors of OpenAI approached Amodei about replacing Sam Altman and potentially merging the two startups. Amodei declined both offers.
"""

# Pipeline: extract entities from the whole text, extract relations sentence by
# sentence (kept only when both endpoints are known entities), then write the
# Mermaid diagram to "Dario Amodei.md" relative to the working directory.
entities = builder.extract_entities(text)
builder.extract_and_add_relations(text, entities)
builder.save_mermaid_to_file(filename="Dario Amodei.md")

Mermaid diagram saved to Dario Amodei


In [7]:
# Second run: rebuild the extractors and use a new builder so relations from the
# previous document are not carried over into this graph.
entity_extractor, relations_extractor = configure_system()
builder = KnowledgeGraphBuilder(entity_extractor, relations_extractor)
# Input document for this run: a summary of the TV series Breaking Bad.
text = """
Breaking Bad is a critically acclaimed American television drama series that aired from 2008 to 2013. Created by Vince Gilligan, the show follows the transformation of Walter White, a mild-mannered high school chemistry teacher, into a ruthless methamphetamine manufacturer and drug kingpin.
The series is set in Albuquerque, New Mexico, and begins when Walter is diagnosed with terminal lung cancer. Faced with mounting medical bills and a desire to secure his family's financial future before he dies, Walter teams up with a former student, Jesse Pinkman, to cook and sell high-quality methamphetamine. What starts as a desperate plan to make quick money evolves into a dark journey that pushes Walter to embrace his alter ego, "Heisenberg," and delve deeper into the criminal underworld.
Throughout its five-season run, Breaking Bad explores themes of morality, family, power, and the consequences of one's choices. The show is renowned for its complex characters, intense plotlines, and the gradual evolution of Walter White from protagonist to antagonist. It also features a talented ensemble cast, including Bryan Cranston as Walter White and Aaron Paul as Jesse Pinkman, whose performances earned them multiple awards and accolades.
Breaking Bad's impact on popular culture has been significant, inspiring spin-offs like Better Call Saul and El Camino, and setting a new standard for television storytelling. Its exploration of the blurred lines between good and evil, combined with its cinematic visuals and tightly crafted narrative, has solidified its place as one of the most influential TV series of the 21st century.
"""

# Same pipeline as the previous run; the diagram is written to "Breaking Bad.md"
# relative to the working directory.
entities = builder.extract_entities(text)
builder.extract_and_add_relations(text, entities)
builder.save_mermaid_to_file(filename="Breaking Bad.md")

Mermaid diagram saved to Breaking Bad.md
