In [2]:
import sys
import os

# Resolve the project root dynamically: two directories above the current
# working directory (presumably where the `app` package lives — confirm).
project_root = os.path.abspath(os.path.join(os.getcwd(), '../..'))
# Register the path only once so that re-running this cell does not keep
# appending duplicate entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
import os
from app.services.neo4j_service import Neo4jService
from dotenv import load_dotenv

load_dotenv()

# Point both the graph store and the vector store at the TEST Neo4j instance.
# The credentials are read once and shared, so the two connections cannot
# drift apart (the vector store previously read the misspelled variable
# "TEST_NEO4J_USERNAMER" and therefore received username=None).
service = Neo4jService()
test_neo4j_credentials = {
    "url": os.getenv("TEST_NEO4J_URI"),
    "username": os.getenv("TEST_NEO4J_USERNAME"),
    "password": os.getenv("TEST_NEO4J_PASSWORD"),
}
service.reinit_graph(**test_neo4j_credentials)
service.reinit_vector(**test_neo4j_credentials)

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

# Show the effective model / tracing settings, one per line, using the
# listed fallback whenever the environment variable is not set.
for env_name, fallback in (
    ("CHAT_GPT_MODEL", "gpt-4o-mini"),
    ("NEO4J_GPT_MODEL", "gpt-4o-mini"),
    ("LANGCHAIN_PROJECT", ""),
):
    print(os.getenv(env_name, fallback))

In [None]:
from contextlib import ExitStack

# Collect file paths
files = [
    "./dataset/b0256024-4f01-4566-97e7-cd91b0500f47.pdf",
]

# Open every file inside an ExitStack so all handles are closed again once
# the service call finishes (or fails), instead of leaking them.
# NOTE(review): assumes read_PDFs_and_create_documents finishes reading the
# streams before it returns — confirm against the service implementation.
with ExitStack() as stack:
    pdf_streams = [stack.enter_context(open(path, "rb")) for path in files]
    doc = service.read_PDFs_and_create_documents(pdf_streams, files)

In [None]:
# Number of items returned by service.read_PDFs_and_create_documents.
len(doc)

In [None]:
from langchain.schema import Document
from typing import List

def merge_documents(docs: List[Document]) -> Document:
    """Concatenate the page contents of ``docs`` into one Document.

    Args:
        docs: Documents whose ``page_content`` is joined, in order.

    Returns:
        A new Document whose text is every ``page_content`` separated by a
        blank line and whose metadata is ``{'source': 'merged_documents'}``.
    """
    page_texts = (item.page_content for item in docs)
    return Document(
        page_content="\n\n".join(page_texts),
        metadata={'source': 'merged_documents'},
    )


# Collapse everything loaded from the PDFs into a single document.
merged_doc = merge_documents(doc)

In [None]:
# Hand the single merged document to the service's chunk splitter
# (a coroutine, awaited at notebook top level).
merge_split = await service.split_documents_into_chunks([merged_doc])
# Number of chunks produced by the split.
len(merge_split)

In [None]:
import json

# Cache the split chunks on disk so later cells can reload them from
# merged_split.json instead of splitting again.
with open("merged_split.json", "w") as out_file:
    chunk_records = [dict(chunk) for chunk in merge_split]
    out_file.write(json.dumps(chunk_records, indent=4))

In [None]:
import json
from langchain.schema import Document

# Reload the cached chunks and rebuild Document objects from the stored
# field dictionaries.
with open("merged_split.json", "r") as in_file:
    chunk_records = json.load(in_file)

merge_split = [Document(**record) for record in chunk_records]

len(merge_split)

1111

In [None]:
# Preview only the first few chunks: echoing the entire list floods the
# notebook output and bloats the saved file.
merge_split[:5]

In [None]:
# Run the service's OpenAI translation coroutine over the chunk list.
# NOTE(review): presumably returns one translated Document per input chunk — confirm.
merged_split_translate = await service.translate_documents_with_openai(merge_split)

In [None]:
import json

# Cache the translated chunks on disk so the translation step does not have
# to be repeated.
with open("merged_split_translate.json", "w") as out_file:
    translated_records = [dict(chunk) for chunk in merged_split_translate]
    out_file.write(json.dumps(translated_records, indent=4))

In [None]:
import json
from langchain.schema import Document

# Reload the cached translated chunks and rebuild Document objects from the
# stored field dictionaries.
with open("merged_split_translate.json", "r") as in_file:
    translated_records = json.load(in_file)

merged_split_translate = [Document(**record) for record in translated_records]

In [None]:
# Preview only the first few translated chunks: echoing the entire list
# floods the notebook output and bloats the saved file.
merged_split_translate[:5]

In [None]:
# Split the translated documents into 10 consecutive, near-equal groups.
print(len(merged_split_translate))
number_of_chunks = len(merged_split_translate)
number_of_target_chunks = 10

# Balanced split: every group gets `base_size` documents and the first
# `extra` groups get one more. Slicing with a ceil-sized window instead can
# leave trailing groups empty (e.g. 21 documents -> 7 groups of 3 followed
# by 3 empty groups).
base_size, extra = divmod(number_of_chunks, number_of_target_chunks)
# Size of the largest group (ceil division); name kept for later cells.
number_merge_chunks = base_size + (1 if extra else 0)

new_chunks = []
start = 0
for i in range(number_of_target_chunks):
    end = start + base_size + (1 if i < extra else 0)
    new_chunks.append(merged_split_translate[start:end])
    start = end

# Groups can only be empty when there are fewer documents than target groups.
len(new_chunks)

In [None]:
# Sanity check: print the size of every group, then the grand total.
chunk_sizes = [len(group) for group in new_chunks]
for size in chunk_sizes:
    print(size)
a = sum(chunk_sizes)
print(a)

In [None]:
# Serialise every group under a "step_<index>" key so the grouping can be
# reloaded from new_chunks.json.
chunks = {}
for position, group in enumerate(new_chunks):
    chunks[f"step_{position}"] = [item.model_dump() for item in group]

with open("new_chunks.json", "w") as out_file:
    out_file.write(json.dumps(chunks, indent=4))

In [None]:
import json
from langchain.schema import Document

# Reload the cached grouping. From here on `new_chunks` is a dict mapping
# "step_<index>" to a list of Documents (it was a list of lists before).
with open("new_chunks.json", "r") as f:
    chunks = json.load(f)

new_chunks = {k: [Document(**x) for x in v] for k, v in chunks.items()}

# Show per-step document counts instead of echoing every document, which
# would flood the notebook output.
{step: len(docs) for step, docs in new_chunks.items()}

In [None]:
import os
import tiktoken
from dotenv import load_dotenv
load_dotenv()

# Tokeniser matching the model configured for the Neo4j pipeline.
model_name = os.getenv("NEO4J_GPT_MODEL","gpt-4o-mini")
encoding = tiktoken.encoding_for_model(model_name)

# Total token count per step. Built with comprehensions so that no loop
# variable leaks into the notebook namespace: the previous `for doc in v`
# loop overwrote the global `doc` that holds the loaded PDF documents, which
# broke any re-run of the earlier cells.
chunk_token = {
    step: sum(len(encoding.encode(step_doc.page_content)) for step_doc in step_docs)
    for step, step_docs in new_chunks.items()
}

# Per-step totals plus the smallest and largest step.
chunk_token, min(chunk_token.values()), max(chunk_token.values())

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

def mask_secret(value, visible=4):
    """Return ``value`` with all but its last ``visible`` characters starred out.

    Args:
        value: The secret string; an empty string is returned unchanged.
        visible: How many trailing characters stay readable.

    Returns:
        A string of the same length as ``value``. Secrets that are not longer
        than ``visible`` are masked completely.
    """
    if not value:
        return ""
    if len(value) <= visible:
        return "*" * len(value)
    return "*" * (len(value) - visible) + value[-visible:]


print(os.getenv("LANGCHAIN_PROJECT",""))
# Never echo the raw API key: notebook outputs are saved and shared with the
# file. The masked form is enough to check that a key is configured.
print(mask_secret(os.getenv("LANGCHAIN_API_KEY","")))

In [None]:
all_graph_doc = []

for step, chunk in new_chunks.items():
    list_graph_doc = await service.convert_documents_to_graph(chunk)
    all_graph_doc.append(list_graph_doc)

In [None]:
# Serialise the graph documents of every step under a "step_<index>" key so
# they can be reloaded from graph_doc.json.
graph_doc = {}
for position, step_graph_docs in enumerate(all_graph_doc):
    graph_doc[f"step_{position}"] = [item.model_dump() for item in step_graph_docs]

with open("graph_doc.json", "w") as out_file:
    out_file.write(json.dumps(graph_doc, indent=4))

In [None]:
import json
from langchain_community.graphs.graph_document import GraphDocument

# Reload the cached graph documents and rebuild GraphDocument objects from
# the stored field dictionaries, keyed by step.
with open("graph_doc.json", "r") as f:
    graph_doc = json.load(f)

graph_doc = {k: [GraphDocument(**x) for x in v] for k, v in graph_doc.items()}

# Show per-step counts instead of echoing every graph document, which would
# flood the notebook output.
{step: len(docs) for step, docs in graph_doc.items()}

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

# Show which LangChain project is configured (empty string when unset).
print(os.environ.get("LANGCHAIN_PROJECT", ""))

In [None]:

query = "MATCH (n) DETACH DELETE n"
service.graph.query(query)
service.graph.query("CALL apoc.schema.assert({}, {})")
Question_list = [
    "What role does the Provincial Electricity Authority play in renewable energy projects?",
    "As a Renewable Energy Project Manager, how does the T-VER program ensure additionality in renewable energy projects?",
    "How does the Thailand Greenhouse Gas Management Organization contribute to the development of renewable energy projects in Thailand?",
    "What are the key conditions and applicability criteria for renewable energy projects under the T-VER program in Thailand?"
]

for step in graph_doc.keys():
    current_chunk = new_chunks[step]
    current_graph = graph_doc[step]

    # Add Document and Graph to the Neo4j
    service.add_graph_documents_to_neo4j(current_graph)
    await service.store_vector_embeddings(current_chunk)
        
    # Ask Questions
    for i in range(len(Question_list)):
        answer = await service.get_output(Question_list[i])

In [None]:
from langsmith import Client

# LangSmith client built with default arguments.
# NOTE(review): presumably picks up its API key / endpoint from the environment — confirm.
client = Client()

In [None]:
from datetime import datetime, timedelta

# Fetch every LLM run that the "test-scale" project recorded during the
# last seven days and materialise the result as a list.
start_time = datetime.now() - timedelta(days=7)

run_query = {
    "project_name": "test-scale",
    "run_type": "llm",
    "start_time": start_time,
}
runs = list(client.list_runs(**run_query))

In [None]:
import pandas as pd

def run_to_row(run):
    """Flatten one run into a dict of column values for the runs table.

    Keys are inserted in a fixed order: name, model, the run's inputs, the
    run's outputs, then error, latency and the three token counters.
    """
    row = {"name": run.name}
    # The parameters used when invoking the model are nested in the extra info
    row["model"] = run.extra["invocation_params"]["model"]
    row.update(run.inputs)
    row.update(run.outputs or {})
    row["error"] = run.error
    # Pending runs have no end time
    if run.end_time:
        row["latency"] = (run.end_time - run.start_time).total_seconds()
    else:
        row["latency"] = None
    row["prompt_tokens"] = run.prompt_tokens
    row["completion_tokens"] = run.completion_tokens
    row["total_tokens"] = run.total_tokens
    return row


# One row per run, indexed by run id.
df = pd.DataFrame(
    [run_to_row(run) for run in runs],
    index=[run.id for run in runs],
)

df.head(5)

In [None]:
# Persist the runs table to Run.csv so the analysis below can be repeated
# without querying the tracing backend again.
df.to_csv("Run.csv")

In [None]:
import pandas as pd

# Reload the cached runs table. The first, unnamed CSV column is renamed to
# "id" and used as the index.
runs_csv = pd.read_csv("Run.csv")
df = runs_csv.rename(columns={"Unnamed: 0": "id"}).set_index("id")
df.head()

In [None]:
# (rows, columns) of the runs table.
df.shape

In [None]:
# Rows are reversed before pairing.
# NOTE(review): presumably this puts the runs in chronological order — confirm.
df_values = df.values[::-1]

N_STEPS = 10       # number of ingestion steps being compared
RUNS_PER_STEP = 5  # runs averaged per step and per series

all_chunk = list(range(N_STEPS))


def step_means(column, first_row):
    """Average one metric per ingestion step for one of the two run series.

    Args:
        column: Name of the metric column in ``df`` ("total_tokens" or
            "latency"). The column is selected by name, so the result does
            not depend on where it sits in the frame.
        first_row: 0 or 1; which of the two interleaved run series to take
            from the reversed rows.

    Returns:
        A list of ``N_STEPS`` floats, each the mean of ``RUNS_PER_STEP`` runs.

    Raises:
        ValueError: If the series does not hold exactly
            ``N_STEPS * RUNS_PER_STEP`` rows (raised by ``reshape``).
    """
    series = df[column].to_numpy(dtype=float)[::-1][first_row::2]
    return series.reshape(N_STEPS, RUNS_PER_STEP).mean(axis=1).tolist()


# NOTE(review): even rows are treated as GraphCypherQa runs and odd rows as
# LLMChain runs — confirm against the "name" column.
GraphCypherQa_Token = step_means("total_tokens", 0)
LLMChain_Token = step_means("total_tokens", 1)

GraphCypherQa_Latency = step_means("latency", 0)
LLMChain_Latency = step_means("latency", 1)

print(GraphCypherQa_Token)
print(LLMChain_Token)
print(GraphCypherQa_Latency)
print(LLMChain_Latency)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def chunk_means(values, chunk_size):
    """Average consecutive groups of ``chunk_size`` values.

    Args:
        values: 1-D numeric NumPy array.
        chunk_size: Number of consecutive values per group.

    Returns:
        1-D array with one mean per complete group; values left over after
        the last complete group are dropped.
    """
    n_full = len(values) // chunk_size
    return values[: n_full * chunk_size].reshape(n_full, chunk_size).mean(axis=1)


# Take the first 50 rows of the runs table, in stored order.
# NOTE(review): nothing is reversed here even though this cell used to say
# so — confirm which row order is intended.
df_values = df.values[:50]

# Select the metrics by column name rather than by position, so the result
# does not change if the frame gains or loses columns.
token_values = df["total_tokens"].to_numpy(dtype=float)[:50]
latency_values = df["latency"].to_numpy(dtype=float)[:50]

# Set your desired chunk size
chunk_size = 5

# Every other row belongs to GraphCypherQa, the rows in between to LLMChain.
GraphCypherQa_Token = chunk_means(token_values[::2], chunk_size)
LLMChain_Token = chunk_means(token_values[1::2], chunk_size)

GraphCypherQa_Latency = chunk_means(latency_values[::2], chunk_size)
LLMChain_Latency = chunk_means(latency_values[1::2], chunk_size)

# Prepare x-axis values based on the number of chunks
num_chunks_gcq = len(GraphCypherQa_Token)
num_chunks_llm = len(LLMChain_Token)
all_chunk_gcq = list(range(num_chunks_gcq))
all_chunk_llm = list(range(num_chunks_llm))

# Plotting the Token averages
plt.plot(all_chunk_gcq, GraphCypherQa_Token, label="GraphCypherQa")
plt.plot(all_chunk_llm, LLMChain_Token, label="LLMChain")
plt.legend()
plt.title("Average Token per Chunk")
plt.xlabel("#Chunk")
plt.ylabel("Token")
plt.show()


In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Average GraphCypherQa token counts for chunks 1..10 (hard-coded values).
all_chunk = np.arange(1, 11)
GraphCypherQa_Token = np.array([6669.2, 11202.4, 16648.6, 24328.0, 32995.6, 40034.2, 46521.0, 53022.0, 61487.2, 74885.2])

# scikit-learn expects a 2-D feature matrix: one column, one row per chunk.
x = all_chunk[:, np.newaxis]

# Fit tokens = slope * chunk + intercept.
model = LinearRegression().fit(x, GraphCypherQa_Token)
slope, intercept = model.coef_[0], model.intercept_

# Token budget to extrapolate to.
upper_limit_token = 128000  # You can adjust this value as needed

# Invert the fitted line to find the chunk index at which the budget is hit.
upper_limit_chunk = (upper_limit_token - intercept) / slope

print(f"The chunk where the token value reaches {upper_limit_token} is approximately {upper_limit_chunk:.2f}.")

In [None]:
# Plot the average latency per chunk for both run series.
plt.plot(all_chunk_gcq, GraphCypherQa_Latency, label="GraphCypherQa")
plt.plot(all_chunk_llm, LLMChain_Latency, label="LLMChain")
plt.legend()
plt.title("Average Latency per Chunk")
plt.xlabel("#Chunk")
plt.ylabel("Latency (s)")
# Render the figure explicitly, like the token plot does, so the cell does
# not echo the Text(...) object returned by plt.ylabel.
plt.show()