In [4]:
import os
import weaviate
# from weaviate.classes.init import Auth
from dotenv import load_dotenv
load_dotenv()

weaviate_url = os.getenv("WEAVIATE_URL") 
weaviate_api_key = os.getenv("WEAVIATE_API_KEY")
HF_TOKEN = os.getenv("HF_TOKEN")



client = weaviate.Client(
    url=weaviate_url , auth_client_secret=weaviate.AuthApiKey(weaviate_api_key),
    additional_headers={
         "X-HuggingFace-Api-Key": HF_TOKEN
    },
)

print(client.is_ready())


WeaviateStartUpError: Weaviate did not start up in 5 seconds. Either the Weaviate URL https://07g2uev0rwmp0dhmx39ivq.c0.asia-southeast1.gcp.weaviate.cloud is wrong or Weaviate did not start up in the interval given in 'startup_period'.

In [36]:

client.schema.get()

{'classes': []}

In [37]:

client.schema.delete_all()

In [38]:
schema = {
    "classes": [
        {
            "class": "RAG",
            "description": "Documents for RAG",
            "vectorizer": "text2vec-huggingface",
            "moduleConfig": {"text2vec-huggingface": {"model": "sentence-transformers/all-MiniLM-L6-v2", "type": "text"}},
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "The content of the paragraph",
                    "moduleConfig": {
                        "text2vec-huggingface": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "content",
                },
            ],
        },
    ]
}

     

client.schema.create(schema)
     

client.schema.get()

{'classes': [{'class': 'RAG',
   'description': 'Documents for RAG',
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'moduleConfig': {'text2vec-huggingface': {'model': 'sentence-transformers/all-MiniLM-L6-v2',
     'type': 'text',
     'useCache': True,
     'useGPU': False,
     'vectorizeClassName': True,
     'waitForModel': False}},
   'multiTenancyConfig': {'autoTenantActivation': False,
    'autoTenantCreation': False,
    'enabled': False},
   'properties': [{'dataType': ['text'],
     'description': 'The content of the paragraph',
     'indexFilterable': True,
     'indexRangeFilters': False,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-huggingface': {'skip': False,
       'vectorizePropertyName': False}},
     'name': 'content',
     'tokenization': 'word'}],
   'replicationConfig': {'asyncEnabled': False,
    'deletionStrategy': 'NoAutomate

In [39]:
from langchain.retrievers.weaviate_hybrid_search import WeaviateHybridSearchRetriever
     

retriever = WeaviateHybridSearchRetriever(
    alpha = 0.5,               # defaults to 0.5, which is equal weighting between keyword and semantic search
    client = client,           # keyword arguments to pass to the Weaviate client
    index_name = "RAG",  # The name of the index to use
    text_key = "content",         # The name of the text key to use
    attributes = [], # The attributes to return in the results
    create_schema_if_missing=True,
)

In [40]:
from langchain_community.document_loaders import PyPDFLoader
     

loader = PyPDFLoader("data.pdf")
docs = loader.load()
print(docs)

# from langchain.text_splitter import RecursiveCharacterTextSplitter
     

# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=100)
     

# docs = text_splitter.split_documents(docs)


print(len(docs))
     

[Document(metadata={'producer': 'Adobe PDF Library 10.0.1', 'creator': 'Adobe InDesign CS6 (Windows)', 'creationdate': '2023-08-03T16:08:03+05:30', 'moddate': '2023-08-03T16:08:53+05:30', 'source': 'data.pdf', 'total_pages': 18, 'page': 0, 'page_label': '1'}, page_content='UNIT 5 INTRODUCTION TO ONLINE ANALYTICAL  \nPROCESSING\nStructure         \n5.0 Introduction \n5.1 \n Objectives \n5.2 \n OLAP and its Need\n5.3 \n Characteristics of O\nLAP\n5.4 \n OLAP and Multidim\nensional Analysis\n 5.4.1\n Multidimens\nional Logical Data Modeling and its Users\n 5.4.2\n Multidimens\nional Structure\n 5.4.3\n Multidimens\nional Operations \n \n5.5 \n OLAP Functions\n5.6 \n Data Warehouse an\nd OLAP: Hypercube and Multicubes\n5.7 \n Applications of OL\nAP\n5.8 \n Steps in the OLAP Creation Process\n5.9 \n Advantages of OLAP\n5.10\n OLAP Architecture\ns - MOLAP, ROLAP, HOLAP, DOLAP \n5.11\n \nSummary\n \n5.12\n Solutions/Answers   \n \n5.13\n Further Readings\n \n5.0 INTRODUCTION \nIn the earlier 

In [41]:
retriever.add_documents(docs)

['34b85dfa-aaf2-4056-b10f-3e4bbd9d1b83',
 'f5831ab3-f60a-4d06-a358-b29ff9b6396a',
 '82f86943-df5a-4cdd-9472-881d79291e76',
 '459cfa5a-c961-4ec3-b824-02b456b14cf4',
 '6dfeb124-e42a-4b4c-84b0-92c21e652b90',
 '0f2b5b95-e332-4eed-b9dc-64e1836131fb',
 'f2f30967-74a2-47c8-a591-82b7271f8489',
 '8389ce49-443f-4e9d-9775-4c7fdb452802',
 '719d1801-5421-490d-a0e8-14fd9befe39e',
 'c40f15b3-3b41-4b7a-bf12-64bc595dd5a5',
 '06e838e1-2928-4a48-9e57-9e1958da8b59',
 'db367324-f83d-4ebb-96fa-bcc0f79ae21a',
 '74d460f8-dc43-4971-b404-cdc4721f2b06',
 '22b98786-5ec4-4198-9588-1c06f91ca760',
 '6469b026-f0e2-4f0c-8e94-66c30090a91f',
 '4b4ea56c-c0ae-4fcd-9d4e-f788af807059',
 '260f7b4d-8051-4994-b048-d13820da4aef',
 '5bcb4952-c488-4fec-b189-5603abc91e20']

In [42]:
print(retriever.invoke("what is the best OLAP practices for creating 3 4 tables ?")[0].page_content)

81
Introduction to Online 
Analytical Processing
5.9 ADVANTAGE S OF OLAP
The SQL functions like Group By, Aggregating functions are quite complex to 
operate in relational databases as compared to multidimensional databases. OLAP 
can pre-compute the queries can save in sub cubes. The hypercubes also make the 
computation task faster and saves time. OLAP has proved to an extremely scalable 
and user – friendly method which is able to perfectly cater to its entire customer 
needs ranging from small to large companies. 
Some listed benefits of using OLAP are as follows:
•	 Data
	
Processing
	
at
	
a
	
faster
	
speed
The speed of query execution has been tremendous since the use of OLAP 
technology and is now counted as one of the primary benefits for it. This prevents 
the customers from spending a lot of time and money on heavy calculations and 
creating complex reports.
•	 Accessibility
The cube enables the various kinds of data like – transactional data from various 
resources, inform

In [76]:
import os
from groq import Groq

# Initialize Groq client with API key
client = Groq(
    api_key="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",
)
from dotenv import load_dotenv
load_dotenv()

# Function to get OLAP best practices based on user query
def get_olap_best_practices(user_query):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"The user has asked: '{user_query}'. Based on this, provide the best OLAP (Online Analytical Processing) practices. you should answer just related to OLAP and dont include any user query info just give best OLAP practices based on user query "
                           "Consider data modeling, indexing, partitioning, query optimization, and performance tuning for large-scale analytical workloads.",
                           
            }
        ],
        model="llama-3.3-70b-versatile",
    )
    
    # Get response text
    response_text = chat_completion.choices[0].message.content

    # Clean up unnecessary formatting (removing ** and #)
    cleaned_response = response_text.replace("**", "").replace("#", "").replace("```","")

    return cleaned_response.strip() 

# Example user query
user_query = "Give me a database table schema for my student management system"
response = get_olap_best_practices(user_query)
import re

def clean_text(text):
    """Remove problematic escape sequences and extra formatting."""
    text = text.replace("**", "").replace("#", "")  # Remove markdown formatting
    text = text.replace("\\", "")  # Remove unnecessary backslashes
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize spaces
    return text

# Use the function before invoking the retriever
cleaned_response = clean_text(response)
print(retriever.invoke(cleaned_response)[0].page_content)


88
Etl , O l AP and t rends
Features ROLAP MOLAP HOLAP
Storage space 
requirement
Data is stored in 
relational tables. 
Comparatively 
Large storage 
space requirement 
Data is stored in 
multidimensional 
tables. Medium 
storage space 
requirements
It uses both 
ROLAP, 
MOLAP. Small 
storage space 
requirements. No 
duplicate of data
Latency Low latency High latency Medium latency 
Query response 
time
Slow query 
response time 
Fast query 
response time.
Medium query 
response time 
Volume of data Used for large 
volumes of data
Limited volume of 
data
Can be used in 
both scenarios
Retreival of data Complex SQL 
queries are used
Sparse Matrix is 
used
Both
Data View Static view of data Dynamic view of 
data
Both static and 
dynamic view of 
data
2)
 Limitations of OLA
P cube are:
	 •	 OLAP
	
requires
	
a
	
star/snowflake
	
schema:
	 •	 	There
	is
	a
	limited
	number
	of
	dimensions
	(fields)
	a
	single
	OLAP
	
cube.
	
•	 	It
	is
	nearly	impossible
	to
	access
	transactional
	data
	

In [80]:

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank

In [85]:
compressor = CohereRerank(cohere_api_key="nbDqU1hTVxWmXGbLYI6OnYhp4Cx40MZ5hOmO5oKX")
     

compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
    )

compressed_docs = compression_retriever.get_relevant_documents(user_query)
compressed_docs

[Document(metadata={'relevance_score': 0.12646118}, page_content='79\nIntroduction to Online \nAnalytical Processing\n5.6  Data Warehouse and OLAP: Hypercube and Multi   \nCubes\nThe OLAP cube is a data structure optimized for very quick data analysis. The OLAP \nCube consists of numeric facts called measures which are categorized by dimensions. \nOLAP Cube is also called the hypercube. So, we can say that multidimensional \nDatabases can we see hypercube and multi cube. Multidimensional cubes have \nsmaller multiple cubes and in hypercube it seems there is one cube as logically all \nthe data seems to be as one unit of cube.  Hypercube have multiple same dimensions \nlogically. The differences of Multi cube and Hyper cube are shown in Table 1 below:Table 1: Differences between Multi cube and Hyper cube\nMulti Cube Hyper Cube\nMetadata Each dimension can belong to \nmany cubes\nEach dimension belongs to one \ncube only\nDimension Not necessary all the dimensions \nshould belong to some

In [None]:
compressed_docs = compression_retriever.get_relevant_documents(user_query)
text_content = "\n\n".join(doc.page_content for doc in compressed_docs)
print(text_content)


79
Introduction to Online 
Analytical Processing
5.6  Data Warehouse and OLAP: Hypercube and Multi   
Cubes
The OLAP cube is a data structure optimized for very quick data analysis. The OLAP 
Cube consists of numeric facts called measures which are categorized by dimensions. 
OLAP Cube is also called the hypercube. So, we can say that multidimensional 
Databases can we see hypercube and multi cube. Multidimensional cubes have 
smaller multiple cubes and in hypercube it seems there is one cube as logically all 
the data seems to be as one unit of cube.  Hypercube have multiple same dimensions 
logically. The differences of Multi cube and Hyper cube are shown in Table 1 below:Table 1: Differences between Multi cube and Hyper cube
Multi Cube Hyper Cube
Metadata Each dimension can belong to 
many cubes
Each dimension belongs to one 
cube only
Dimension Not necessary all the dimensions 
should belong to some cube
Every dimension owned by a 
hypercube
Measure 
Computation
Complex, data can b

In [None]:
def generate_database_schema(user_query, olap_context, llm_res, client):
    chat_completion = client.chat.completions

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 2 files: 100%|██████████| 2/2 [07:42<00:00, 231.13s/it]
Loading checkpoint shards:   0%|          | 0/2 [00:24<?, ?it/s]


KeyboardInterrupt: 

In [17]:
def generate_database_schema(user_query, olap_context, llm_res, client):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"""
                    The user has asked: '{user_query}'.
                    Given the OLAP context: {olap_context}, and considering the following response: {llm_res}, 
                    generate a well-structured relational database schema having all possible tables related to it that includes:
                    
                    - Tables with their respective names
                    - Columns with appropriate data types
                    - Primary and foreign key constraints
                    - Relationships between tables (one-to-one, one-to-many, many-to-many)

                    
                    Ensure the schema is optimized for analytical workloads and adheres to best database design practices.
                """
            }
        ],
        model="llama-3.3-70b-versatile",
        max_tokens=5000,
    )

    # Get response text
    response_text = chat_completion.choices[0].message.content

    # Clean up unnecessary formatting
    cleaned_response = response_text.replace("", "").replace("#", "").replace("```", "")

    # Return the cleaned schema
    return cleaned_response.strip()


In [18]:
from groq import Groq
client = Groq(
    api_key="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",
)

olap_context = """
It uses both
ROLAP,
MOLAP. Small
storage space
requirements. No
duplicate of data
Latency Low latency High latency Medium latency
Query response
time
Slow query
response time
Fast query
response time.
Medium query
response time
Volume of data Used for large
volumes of data
Limited volume of
data
Can be used in
both scenarios
Retreival of data Complex SQL
queries are used
Sparse Matrix is
used
Both
Data View Static view of data Dynamic view of
data
Both static and
dynamic view of
data
2)
 Limitations of OLA
P cube are:
         •       OLAP

requires

a

star/snowflake

schema:
         •              There
        is
        a
        limited
        number
        of
        dimensions
        (fields)
        a
        single
        OLAP

cube.

•               It
        is
        nearly  impossible
        to
        access
        transactional
        data
        in
        the
        OLAP

cube.

•               Changes
        to
        an
        OLAP
        cube
        requires
        a
        full
        update
        of
        the
        cube
        –
        a
"""

llm_res = """
Final Response:
 OLAP Best Practices for Large-Scale Analytical Workloads

 Data Modeling

1. Star and Snowflake Schemas: Use star and snowflake schemas to optimize query performance by reducing the number of joins.
2. Fact and Dimension Tables: Separate fact tables (e.g., sales, inventory) from dimension tables (e.g., date, customer, product) to improve data organization and query efficiency.
3. Use Surrogate Keys: Employ surrogate keys to ensure data consistency and minimize data redundancy.

 Indexing

1. Bitmap Indexes: Utilize bitmap indexes for low-cardinality columns to improve query performance.
2. B-Tree Indexes: Use B-tree indexes for high-cardinality columns to optimize query performance.
3. Composite Indexes: Create composite indexes on frequently used columns to reduce query execution time.

 Partitioning

1. Range Partitioning: Use range partitioning to divide large fact tables into smaller, more manageable segments.
2. List Partitioning: Employ list partitioning to separate data into distinct categories (e.g., by region or product category).
3. Composite Partitioning: Combine range and list partitioning to further optimize query performance.

 Query Optimization

1. Pre-Aggregation: Pre-aggregate data to reduce query execution time and improve performance.
2. Materialized Views: Utilize materialized views to store pre-computed results and reduce query execution time.
3. Query Rewriting: Rewrite queries to optimize performance by reducing the number of joins and subqueries.

 Performance Tuning

1. Regular Maintenance: Regularly maintain the database by updating statistics, rebuilding indexes, and checking for data consistency.
2. Monitor Performance: Monitor query performance and adjust optimization strategies as needed.
3. Data Pruning: Prune unnecessary data to reduce storage costs and improve query performance.

 Additional Recommendations

1. Use Column-Store Indexes: Use column-store indexes to optimize query performance for large-scale analytical workloads.
2. Leverage Parallel Processing: Leverage parallel processing capabilities to improve query performance and reduce execution time.
3. Implement Data Caching: Implement data caching mechanisms to reduce query execution time and improve performance.

By following these OLAP best practices, you can optimize your database design and improve query performance for large-scale analytical workloads. This will enable you to make informed business decisions and drive growth for your online retail platform.

Example OLAP Schema
sql
-- Fact table: sales
CREATE TABLE sales (
  sale_id INT,
  date_key INT,
  customer_key INT,
  product_key INT,
  sales_amount DECIMAL(10, 2)
);

-- Dimension table: date
CREATE TABLE date (
  date_key INT,
  date_description VARCHAR(20)
);

-- Dimension table: customer
CREATE TABLE customer (
  customer_key INT,
  customer_name VARCHAR(50)
);

-- Dimension table: product
CREATE TABLE product (
  product_key INT,
  product_name VARCHAR(50)
);

-- Create star schema
CREATE VIEW sales_star AS
SELECT s.sale_id, s.date_key, s.customer_key, s.product_key, s.sales_amount,
       d.date_description, c.customer_name, p.product_name
FROM sales s
JOIN date d ON s.date_key = d.date_key
JOIN customer c ON s.customer_key = c.customer_key
JOIN product p ON s.product_key = p.product_key;

"""

user_query = "Design a database for an online retail platform tracking sales, inventory, and customer interactions"
ans = generate_database_schema(user_query,olap_context, llm_res, client)
print(ans)

**Online Retail Platform Database Schema**

The following schema is designed to support analytical workloads and is optimized for query performance. It includes tables for sales, inventory, customer interactions, and other relevant data.

 Table: **Customers**
sql
CREATE TABLE Customers (
  CustomerID INT PRIMARY KEY,
  FirstName VARCHAR(50) NOT NULL,
  LastName VARCHAR(50) NOT NULL,
  Email VARCHAR(100) UNIQUE NOT NULL,
  Phone VARCHAR(20),
  Address VARCHAR(200)
);


 Table: **Products**
sql
CREATE TABLE Products (
  ProductID INT PRIMARY KEY,
  ProductName VARCHAR(100) NOT NULL,
  Description TEXT,
  Price DECIMAL(10, 2) NOT NULL,
  Category VARCHAR(50) NOT NULL,
  Brand VARCHAR(50) NOT NULL
);


 Table: **Inventory**
sql
CREATE TABLE Inventory (
  InventoryID INT PRIMARY KEY,
  ProductID INT NOT NULL,
  Quantity INT NOT NULL,
  WarehouseLocation VARCHAR(100) NOT NULL,
  FOREIGN KEY (ProductID) REFERENCES Products(ProductID)
);


 Table: **Orders**
sql
CREATE TABLE Orders (
  OrderI

In [21]:
raw_documents = """ Tables

 1. **Customers**

| Column Name | Data Type | Description |
| --- | --- | --- |
| `customer_id` | `int` | Unique customer identifier (Primary Key) |
| `name` | `varchar(255)` | Customer name |
| `email` | `varchar(255)` | Customer email |
| `phone` | `varchar(20)` | Customer phone number |
| `address` | `text` | Customer address |

 2. **Products**

| Column Name | Data Type | Description |
| --- | --- | --- |
| `product_id` | `int` | Unique product identifier (Primary Key) |
| `name` | `varchar(255)` | Product name |
| `description` | `text` | Product description |
| `price` | `decimal(10, 2)` | Product price |
| `category_id` | `int` | Foreign key referencing the `Categories` table |

 3. **Categories**

| Column Name | Data Type | Description |
| --- | --- | --- |
| `category_id` | `int` | Unique category identifier (Primary Key) |
| `name` | `varchar(255)` | Category name |
| `description` | `text` | Category description |

 4. **Orders**

| Column Name | Data Type | Description |
| --- | --- | --- |
| `order_id` | `int` | Unique order identifier (Primary Key) |
| `customer_id` | `int` | Foreign key referencing the `Customers` table |
| `order_date` | `date` | Order date |
| `total` | `decimal(10, 2)` | Order total |

 5. **Order Items**

| Column Name | Data Type | Description |
| --- | --- | --- |
| `order_item_id` | `int` | Unique order item identifier (Primary Key) |
| `order_id` | `int` | Foreign key referencing the `Orders` table |
| `product_id` | `int` | Foreign key referencing the `Products` table |
| `quantity` | `int` | Quantity of the product ordered |
| `unit_price` | `decimal(10, 2)` | Unit price of the product |

 6. **Inventory**

| Column Name | Data Type | Description |
| --- | --- | --- |
| `product_id` | `int` | Foreign key referencing the `Products` table |
| `quantity` | `int` | Current quantity in stock |
| `warehouse_id` | `int` | Foreign key referencing the `Warehouses` table |

 7. **Warehouses**

| Column Name | Data Type | Description |
| --- | --- | --- |
| `warehouse_id` | `int` | Unique warehouse identifier (Primary Key) |
| `name` | `varchar(255)` | Warehouse name |
| `address` | `text` | Warehouse address |

 8. **Sales**

| Column Name | Data Type | Description |
| --- | --- | --- |
| `sale_id` | `int` | Unique sale identifier (Primary Key) |
| `order_id` | `int` | Foreign key referencing the `Orders` table |
| `product_id` | `int` | Foreign key referencing the `Products` table |
| `quantity` | `int` | Quantity sold |
| `revenue` | `decimal(10, 2)` | Revenue generated from the sale |

 Relationships

* A customer can place many orders (one-to-many).
* An order is associated with one customer (many-to-one).
* An order can have many order items (one-to-many).
* An order item is associated with one order (many-to-one).
* A product can be part of many order items (one-to-many).
* An order item is associated with one product (many-to-one).
* A product can have many sales (one-to-many).
* A sale is associated with one product (many-to-one).
* A warehouse can store many products (one-to-many).
* A product can be stored in many warehouses (many-to-many).

 Indexes

* Create indexes on the following columns:
        + `Customers`: `name`, `email`, `phone`
        + `Products`: `name`, `description`, `price`
        + `Orders`: `order_date`, `total`
        + `Order Items`: `quantity`, `unit_price`
        + `Inventory`: `quantity`
        + `Sales`: `quantity`, `revenue`

 Partitioning

* Partition the `Orders` table by date (e.g., monthly).
* Partition the `Sales` table by date (e.g., monthly)."""

In [None]:
from langchain_groq import ChatGroq
groq_api_key = "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"
llm = ChatGroq(groq_api_key=groq_api_key, model_name='llama-3.3-70b-versatile')
response = llm.invoke("hello")


AIMessage(content="Hello. It's nice to meet you. Is there something I can help you with or would you like to chat?", additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 25, 'prompt_tokens': 36, 'total_tokens': 61, 'completion_time': 0.090909091, 'prompt_time': 0.001931066, 'queue_time': 0.055213634, 'total_time': 0.092840157}, 'model_name': 'llama-3.3-70b-versatile', 'system_fingerprint': 'fp_41c250edc7', 'finish_reason': 'stop', 'logprobs': None}, id='run-291f33e9-b186-404e-85d1-faa1539713b3-0', usage_metadata={'input_tokens': 36, 'output_tokens': 25, 'total_tokens': 61})

In [None]:
from langchain_community.graphs import Neo4jGraph
from langchain.schema import Document
# Set your credentials
NEO4J_URI = "neo4j+s://791ec29e.databases.neo4j.io"
NEO4J_USERNAME = "neo4j"
NEO4J_PASSWORD = "sqJloNfV2JhYBEwkLVutmmv4kuKwOEnajD2qDkUgyBU"

# Initialize the graph with credentials
graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD
)
raw_document = [Document(page_content=raw_documents)]
from langchain.text_splitter import TokenTextSplitter
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
documents = text_splitter.split_documents(raw_document[:3])

from langchain_experimental.graph_transformers import LLMGraphTransformer
llm_transformer = LLMGraphTransformer(llm=llm)
graph_documents = llm_transformer.convert_to_graph_documents(documents)

print(graph_documents)


[GraphDocument(nodes=[Node(id='Customers', type='Table', properties={}), Node(id='Products', type='Table', properties={}), Node(id='Categories', type='Table', properties={}), Node(id='Orders', type='Table', properties={}), Node(id='Order Items', type='Table', properties={})], relationships=[Relationship(source=Node(id='Customers', type='Table', properties={}), target=Node(id='Orders', type='Table', properties={}), type='HAS_ORDERS', properties={}), Relationship(source=Node(id='Orders', type='Table', properties={}), target=Node(id='Order Items', type='Table', properties={}), type='CONTAINS_ORDER_ITEMS', properties={}), Relationship(source=Node(id='Order Items', type='Table', properties={}), target=Node(id='Products', type='Table', properties={}), type='CONTAINS_PRODUCTS', properties={}), Relationship(source=Node(id='Products', type='Table', properties={}), target=Node(id='Categories', type='Table', properties={}), type='BELONGS_TO_CATEGORY', properties={})], source=Document(metadata={},

In [28]:
graph.add_graph_documents(
    graph_documents,
    baseEntityLabel=True,
    include_source=True
)

In [29]:
default_cypher = "MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t LIMIT 50"

In [32]:
from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget
def showGraph(cypher: str = default_cypher):
    # Define Neo4j credentials directly
    NEO4J_URI = "neo4j+s://791ec29e.databases.neo4j.io"
    NEO4J_USERNAME = "neo4j"
    NEO4J_PASSWORD = "sqJloNfV2JhYBEwkLVutmmv4kuKwOEnajD2qDkUgyBU"

    # Create a Neo4j session to run queries
    driver = GraphDatabase.driver(
        uri=NEO4J_URI,
        auth=(NEO4J_USERNAME, NEO4J_PASSWORD)
    )
    session = driver.session()
    
    # Execute Cypher query and visualize results
    widget = GraphWidget(graph=session.run(cypher).graph())
    widget.node_label_mapping = 'id'
    display(widget)
    
    return widget

In [33]:

showGraph()

GraphWidget(layout=Layout(height='650px', width='100%'))

GraphWidget(layout=Layout(height='650px', width='100%'))