In [1]:
from langchain_community.graphs import Neo4jGraph
import pandas as pd
from pyprojroot import here
import os
from dotenv import load_dotenv
# Load key=value pairs from a .env file into the process environment; the
# NEO4J_* connection settings are read from os.environ in a later cell.
load_dotenv()

True

In [2]:
# Neo4j connection settings, read from the environment in one pass.
# Indexing os.environ (rather than .get) raises KeyError for a missing
# variable, so a misconfigured environment fails here.
NEO4J_URL, NEO4J_USERNAME, NEO4J_PASSWORD, NEO4J_DATABASE = (
    os.environ[setting_name]
    for setting_name in (
        'NEO4J_URL',
        'NEO4J_USERNAME',
        'NEO4J_PASSWORD',
        'NEO4J_DATABASE',
    )
)

In [3]:
# Build the LangChain Neo4j wrapper from the settings read from the environment.
graph = Neo4jGraph(
    url=NEO4J_URL,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

Clear the Neo4j Database

In [9]:
# WARNING: destructive. Matches every node in the database and deletes it
# together with its relationships (DETACH DELETE), leaving an empty graph.
graph.query('MATCH (n) DETACH DELETE n')

[]

Nodes:

House: Represents a house. Each house node has properties such as: id (a unique identifier for the house, taken from the house_id column of the CSV), price (the price of the house), bedrooms (number of bedrooms), baths (number of bathrooms, taken from the bathrooms column of the CSV), sqft_living (square footage of living space), sqft_lot (square footage of the lot), floors (number of floors), condition (condition of the house), sqft_basement (square footage of the basement), yr_built (year the house was built), yr_renovated (year the house was renovated).

Street: Represents a street. Each street node has a single attribute: name (name of the street).

City: Represents a city. Each city node has a single attribute: name (name of the city).

Relationships:

LOCATED_ON: A directional relationship from a House node to a Street node, signifying that the house is located on that street.

LOCATED_IN: A directional relationship from a House node to a City node, signifying that the house is located in that city.

Instructions in the Script:

LOAD CSV WITH HEADERS: Loads a CSV file that contains the house data with headers indicating each column's purpose.

MERGE: Ensures that a node or relationship is created if it does not already exist; otherwise, it matches the existing node or relationship. This prevents duplication.

SET: Assigns properties to the nodes after they have been created or matched.

In [10]:
# Build the path to the housing CSV with pyprojroot's here(); the same path is
# later passed (as a string) to the Cypher LOAD CSV query.
home_app_csv_path = here("data/home_app_data.csv")
# Read the CSV into pandas; in the cells visible here the DataFrame is only
# used to preview the data, not to populate the graph.
home_app_df = pd.read_csv(home_app_csv_path)

In [11]:
# Preview the first rows with the rich DataFrame display, then report the
# dimensions once (the unlabeled duplicate shape print was removed).
display(home_app_df.head())
print("Data shape:", home_app_df.shape)

Unnamed: 0,house_id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,condition,sqft_basement,yr_built,yr_renovated,street,city
0,0,313000,3,2,1340,7912,1,3,0,1955,2005,18810 Densmore Ave N,Shoreline
1,1,2384000,5,3,3650,9050,2,5,280,1921,0,709 W Blaine St,Seattle
2,2,342000,3,2,1930,11947,1,4,0,1966,0,26206-26214 143rd Ave SE,Kent
3,3,420000,3,2,2000,8030,1,4,1000,1963,0,857 170th Pl NE,Bellevue
4,4,550000,4,3,1940,10500,1,4,800,1976,1992,9105 170th Ave NE,Redmond


(4600, 13)
Data shape: (4600, 13)


Construct the knowledge graph and store the data in the graph database

In [12]:
# Cypher load script. $home_app_directory is a query parameter supplied at
# execution time; it is appended to the 'file:///' prefix to form the CSV URL.
# Two node properties are named differently from their CSV columns: the
# house_id column is stored as `id` (not cast, so it keeps the string value
# LOAD CSV yields) and the bathrooms column is stored as `baths`.
# MERGE is used for nodes and relationships, so re-running the cell matches
# existing data instead of duplicating it.
query = """
LOAD CSV WITH HEADERS FROM 'file:///' + $home_app_directory AS row    // Load CSV data from file  

// Create a House Node and merge it with the id
MERGE (h:House {id: row.house_id})                           
// Set properties for House node:
SET h.price = toInteger(row.price),
    h.bedrooms = toInteger(row.bedrooms),
    h.baths = toInteger(row.bathrooms),
    h.sqft_living = toInteger(row.sqft_living),
    h.sqft_lot = toInteger(row.sqft_lot),
    h.floors = toFloat(row.floors),
    h.condition = toInteger(row.condition),
    h.sqft_basement = toInteger(row.sqft_basement),
    h.yr_built = toInteger(row.yr_built),
    h.yr_renovated = toInteger(row.yr_renovated)

// Create a Street Node
MERGE (s:Street {name: row.street})

// Create a City Node
MERGE (c:City {name: row.city})

// Create relationships
MERGE (h)-[:LOCATED_ON]->(s)
MERGE (h)-[:LOCATED_IN]->(c)
"""
# Run the load, binding the CSV path (converted to a string) to
# $home_app_directory.
graph.query(query, params={"home_app_directory": str(home_app_csv_path)})

[]

In [7]:
# Re-read the schema from the database so graph.schema reflects the current
# contents, then print it.
# NOTE(review): the recorded output for this cell lists the label `Home`,
# while the load query merges `:House`, and this cell's execution count (7)
# is lower than the load cell's (12) — the output looks stale; re-run the
# notebook top to bottom to confirm.
graph.refresh_schema()
print(graph.schema)

Node properties:
Home {condition: INTEGER, sqft_basement: INTEGER, yr_built: INTEGER, yr_renovated: INTEGER, sqft_living: INTEGER, sqft_lot: INTEGER, floors: FLOAT, id: STRING, price: INTEGER, bedrooms: INTEGER, baths: INTEGER}
Street {name: STRING}
City {name: STRING}
Relationship properties:

The relationships:
(:Home)-[:LOCATED_IN]->(:City)
(:Home)-[:LOCATED_ON]->(:Street)


Check the number of nodes that were created from the data

In [8]:
# Count every node in the graph regardless of label, as a sanity check on
# the load.
cypher = """
    MATCH (n)
    RETURN count(n)
"""
result = graph.query(cypher)
# Bare expression so the notebook displays the returned rows.
result

[{'count(n)': 9169}]