# Knowledge Graph RAG

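In this notebook, we show how to build a Knowledge Graph RAG: we turn structured records into a graph of facts, store them in a fact memory, and clean them with a simple pipeline.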

In [1]:
import hybridagi.core.datatypes as dt

# Field names shared by every movie record, in the order they appear in each row below.
_MOVIE_FIELDS = ("Title", "Year Produced", "Actors", "Directors", "Genres", "Ratings")

# One row per movie: (title, year, actors, directors, genres, rating).
_MOVIE_ROWS = [
    (
        "The Shawshank Redemption", 1994,
        ["Tim Robbins", "Morgan Freeman"],
        ["Frank Darabont"],
        ["Drama", "Crime"],
        9.3,
    ),
    (
        "The Godfather", 1972,
        ["Marlon Brando", "Al Pacino", "Diane Keaton"],
        ["Francis Ford Coppola"],
        ["Crime", "Drama"],
        9.2,
    ),
    (
        "The Godfather: Part II", 1974,
        ["Al Pacino", "Robert De Niro", "Diane Keaton"],
        ["Francis Ford Coppola"],
        ["Crime", "Drama"],
        9.0,
    ),
    (
        "The Dark Knight", 2008,
        ["Christian Bale", "Heath Ledger", "Maggie Gyllenhaal"],
        ["Christopher Nolan"],
        ["Action", "Crime", "Drama"],
        9.0,
    ),
    (
        "Pulp Fiction", 1994,
        ["John Travolta", "Uma Thurman", "Samuel L. Jackson"],
        ["Quentin Tarantino"],
        ["Crime", "Drama"],
        8.9,
    ),
    (
        "Schindler's List", 1993,
        ["Liam Neeson", "Ralph Fiennes", "Embeth Davidtz"],
        ["Steven Spielberg"],
        ["Drama", "History", "War"],
        8.9,
    ),
    (
        "12 Angry Men", 1957,
        ["Henry Fonda", "Lee J. Cobb", "Ed Begley"],
        ["Sidney Lumet"],
        ["Drama"],
        8.9,
    ),
    (
        "The Lord of the Rings: The Return of the King", 2003,
        ["Elijah Wood", "Viggo Mortensen", "Cate Blanchett"],
        ["Peter Jackson"],
        ["Adventure", "Drama", "Fantasy"],
        8.9,
    ),
    (
        "Forrest Gump", 1994,
        ["Tom Hanks", "Robin Wright", "Sally Field"],
        ["Robert Zemeckis"],
        ["Drama", "Romance"],
        8.8,
    ),
    (
        "Inception", 2010,
        ["Leonardo DiCaprio", "Joseph Gordon-Levitt", "Ellen Page"],
        ["Christopher Nolan"],
        ["Action", "Adventure", "Sci-Fi"],
        8.8,
    ),
]

# Expand each row into a dict keyed by the field names (key order follows _MOVIE_FIELDS).
input_data = [dict(zip(_MOVIE_FIELDS, row)) for row in _MOVIE_ROWS]

# Turn every movie record into a set of (subject, relationship, object) facts.
input_facts = dt.FactList()

for data in input_data:
    # The movie entity is shared by all the facts generated from this record.
    movie = dt.Entity(name=data["Title"], label="Movie")

    # Movie -> production year (entity names are strings, hence the str()).
    year = dt.Entity(name=str(data["Year Produced"]), label="Year")
    input_facts.facts.append(dt.Fact(subj=movie, rel=dt.Relationship(name="Produced in"), obj=year))

    # Actor -> movie
    for actor in data["Actors"]:
        actor_entity = dt.Entity(name=actor, label="Actor")
        input_facts.facts.append(dt.Fact(subj=actor_entity, rel=dt.Relationship(name="Played in"), obj=movie))

    # Movie -> director
    for director in data["Directors"]:
        # Name the entity after the director being iterated; using the leftover
        # `actor` loop variable here would label the movie's last actor as its director.
        director_entity = dt.Entity(name=director, label="Director")
        input_facts.facts.append(dt.Fact(subj=movie, rel=dt.Relationship(name="Directed by"), obj=director_entity))

    # Movie -> genre
    for genre in data["Genres"]:
        genre_entity = dt.Entity(name=genre, label="Genre")
        input_facts.facts.append(dt.Fact(subj=movie, rel=dt.Relationship(name="Has genre"), obj=genre_entity))

    # Movie -> rating (stored as a string entity name, like the year).
    rating = dt.Entity(name=str(data["Ratings"]), label="Ratings")
    input_facts.facts.append(dt.Fact(subj=movie, rel=dt.Relationship(name="Has ratings of"), obj=rating))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from hybridagi.memory.integration.local import LocalFactMemory

# Create a local fact memory under the index name "test".
fact_memory = LocalFactMemory(index_name="test")
# Load the facts built from the movie records into the memory.
fact_memory.update(input_facts)

# Display the stored facts; the recorded output of this cell is "test_fact_memory.html"
# (presumably an HTML rendering of the graph named after the index — TODO confirm).
fact_memory.show()

test_fact_memory.html


#### A simple pipeline: deduplicating entities

You will notice that the graphs created from our input data are not connected to each other. This happens because the entities are duplicated: every record creates its own copy of a shared entity (for example, the same genre or actor). To solve this issue, we are going to apply a very simple pipeline with a single step that deduplicates the entities using their names and labels.

In [3]:
from hybridagi.core.pipeline import Pipeline
from hybridagi.modules.deduplicator import EntityDeduplicator
# TODO: an entity-embedding step was sketched for this pipeline but never finished.
# The dangling `from hybridagi.modules.embedders import` statement was a SyntaxError
# that prevented the whole cell from running, so it is left out until the embedder
# to use has been chosen.

# Single-step pipeline: merge entities that have exactly the same name and label.
pipeline = Pipeline()

pipeline.add("deduplicate_entities", EntityDeduplicator(method="exact"))

# Run the pipeline on the raw facts to obtain the deduplicated facts.
output_facts = pipeline(input_facts)

fact_memory.update(output_facts) # Update the fact memory with our cleaned data

fact_memory.show()

test_fact_memory.html
