In [None]:


import os
from getpass import getpass

# Tracing is a plain on/off flag, not a secret, so it can be set inline.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# API keys must never be written into the notebook: anything typed here ends
# up in version control and in shared copies. Reuse keys already exported in
# the environment and prompt (without echo) only for the ones that are missing.
# NOTE(review): a LangSmith key was previously committed in this cell and
# should be revoked/rotated.
for _secret_name in ("LANGCHAIN_API_KEY", "OPENAI_API_KEY"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"{_secret_name}: ")

In [None]:
import os
import json
import logging

from typing import Dict, List, TypedDict, Any

from langchain import hub
from langchain_core.documents import Document
from langchain_community.document_loaders import JSONLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.language_models.chat_models import BaseChatModel


# Directory that holds the schema files listed in JSON_SCHEMA_FILES.
JSON_SCHEMA_DIR = "../resources"

# One entry per schema file to load:
#   filename  - joined onto JSON_SCHEMA_DIR to locate the file
#   jq_schema - handed to JSONLoader as its jq_schema argument
#   name      - label for the schema (presumably for display; not read by the
#               code in this notebook section)
JSON_SCHEMA_FILES = [
    dict(
        name="Genomic Data Commons(GDC)",
        filename="better_gdc_schema.json",
        jq_schema=".",
    ),
]


# Define state for application
class State(TypedDict):
    """Data handed between the retrieval and generation steps of the RAG flow."""

    # The user's question; used as the similarity-search query and as the
    # "question" input of the prompt.
    question: str
    # Documents produced by retrieval; their page_content is joined into the
    # "context" input of the prompt.
    context: List[Document]
    # Slot for the generated answer (no code in this section writes to it).
    answer: str


class RAG:
    """Small retrieval-augmented-generation helper over the configured JSON schemas.

    Typical use: construct the object, call ``init_rag()`` once to index the
    schema documents, then call ``retrieve()`` followed by ``generate()``.
    """

    def __init__(
        self, embeddings_model: str = "sentence-transformers/all-MiniLM-L6-v2"
    ) -> None:
        """Load the schema documents and create the splitter, vector store and prompt.

        Args:
            embeddings_model: Model name passed to ``HuggingFaceEmbeddings``.
        """
        self.schemas = self.load_json_schemas()
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_overlap=200,
            chunk_size=1000,
        )
        self.vector_store = InMemoryVectorStore(
            HuggingFaceEmbeddings(model_name=embeddings_model)
        )

        # Question-answering prompt pulled from the LangChain hub.
        self.prompt = hub.pull("rlm/rag-prompt")

    def load_json_schemas(self) -> List[Document]:
        """Load every file listed in ``JSON_SCHEMA_FILES`` from ``JSON_SCHEMA_DIR``.

        Returns:
            The documents produced by ``JSONLoader`` for all entries, in list order.
        """
        documents = []
        for entry in JSON_SCHEMA_FILES:
            loader = JSONLoader(
                file_path=os.path.join(JSON_SCHEMA_DIR, entry["filename"]),
                jq_schema=entry["jq_schema"],
                text_content=False,
            )
            documents += loader.load()
        return documents

    def init_rag(self):
        """Split the loaded schema documents and add the pieces to the vector store.

        Raises:
            ValueError: If no schema documents were loaded.
        """
        if not self.schemas:
            raise ValueError("No schema data found.")

        # Index chunks
        chunks = self.text_splitter.split_documents(self.schemas)
        self.vector_store.add_documents(chunks)

    def retrieve(self, state: State):
        """Run the vector store's similarity search for ``state["question"]``.

        Returns:
            A dict with the search results under the ``"context"`` key.
        """
        return {"context": self.vector_store.similarity_search(state["question"])}

    def generate(self, state: State, llm: BaseChatModel):
        """Fill the prompt with the question and retrieved context, then call ``llm``.

        The context is the ``page_content`` of every document in
        ``state["context"]``, separated by blank lines.

        Returns:
            Whatever ``llm.invoke`` returns for the rendered prompt.
        """
        context_text = "\n\n".join(doc.page_content for doc in state["context"])
        prompt_input = {
            "question": state["question"],
            "context": context_text,
        }
        return llm.invoke(self.prompt.invoke(prompt_input))

In [None]:
# Sanity check: load the flattened GDC schema directly, using the same file and
# loader arguments that RAG.load_json_schemas ends up using, and display the
# documents the loader produces.
loader = JSONLoader(
    file_path="../resources/better_gdc_schema.json",
    jq_schema=".",
    text_content=False,
)
loader.load()

In [None]:
# Build the helper: this loads the schema documents and creates the text
# splitter, the embeddings-backed in-memory vector store and the prompt.
rag = RAG()

# Split the loaded documents and add the pieces to the vector store; must run
# before any retrieval.
rag.init_rag()

In [None]:
# Smoke-test retrieval only: context and answer start out empty, and the cell
# displays the {"context": [...]} dict returned for this question.
rag.retrieve(State(question="What is AJCC?", context=[], answer=""))

In [None]:
import os

from git import Repo  # pip install gitpython

git_url = "https://github.com/NCI-GDC/gdcdictionary.git"
gdc_clone_dir = "gdc/"

# Cloning into an existing, non-empty directory fails, so an unconditional
# clone made this cell raise on every re-run. Reuse the checkout when one is
# already present and clone only the first time.
if os.path.isdir(os.path.join(gdc_clone_dir, ".git")):
    gdc_repo = Repo(gdc_clone_dir)
else:
    gdc_repo = Repo.clone_from(git_url, gdc_clone_dir)

gdc_repo

In [None]:
import yaml
import os

# Schema directory inside the gdcdictionary checkout.
gdc_dictionary_path = "gdc/src/gdcdictionary/schemas"
files = os.listdir(gdc_dictionary_path)

# Names in that directory that end in ".yaml".
yamls = list(filter(lambda name: name.endswith(".yaml"), files))

# Parse a single schema (diagnosis) and display it to inspect its structure.
with open(os.path.join(gdc_dictionary_path, "diagnosis.yaml"), mode="r") as file:
    prime_service = yaml.safe_load(file)

prime_service

In [None]:
import json


def parse_gdc_yaml(file_name):
    """Flatten a GDC dictionary dump into a list of per-attribute records.

    Despite the name, the file is read with ``json.load``. Its top level is
    expected to be a mapping whose values are sub-schema objects; each
    sub-schema's ``"properties"`` mapping is walked and one record is emitted
    per property. A record always has ``"column_name"``, ``"type"`` and
    ``"description"``, plus whatever constraint keys
    ``extract_level_one_type`` reports for the property (e.g. ``"enum"``,
    ``"pattern"``, ``"minimum"``, ``"items"``).

    Handling per property:
      * ``"$ref"`` entries are skipped.
      * ``oneOf``: enum members are merged into a single ``"enum"`` list;
        every other member except ``"null"`` contributes its type/constraints.
      * ``anyOf``: only ``"array"`` members contribute.
      * otherwise the property itself is inspected, and it is skipped when no
        type can be determined.
    For ``oneOf``/``anyOf`` properties where no member contributes, the record
    is still emitted with ``"type"`` left as ``""``.

    Sub-schemas that are not mappings or have no ``"properties"`` contribute
    nothing, and members without a ``"type"`` key are tolerated instead of
    raising ``KeyError``.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        The list of records, or ``None`` when the file contains JSON ``null``.
    """
    # JSON is UTF-8 by specification; don't depend on the platform default.
    with open(file_name, "r", encoding="utf-8") as f:
        raw = json.load(f)

    if raw is None:
        return None

    attrs_ret = []

    for subschema_obj in raw.values():
        if not isinstance(subschema_obj, dict):
            continue
        properties = subschema_obj.get("properties") or {}

        for attr_name, attr_obj in properties.items():
            if attr_name == "$ref":
                continue
            attr_json = {
                "column_name": attr_name,
                "type": "",
                "description": extract_description(attr_obj),
            }

            if "oneOf" in attr_obj:
                for subtype_obj in attr_obj["oneOf"]:
                    if "enum" in subtype_obj:
                        # Several enum alternatives collapse into one list.
                        attr_json["type"] = "enum"
                        attr_json.setdefault("enum", []).extend(subtype_obj["enum"])
                    elif subtype_obj.get("type") != "null":
                        _merge_type_info(attr_json, subtype_obj)
            elif "anyOf" in attr_obj:
                for subtype_obj in attr_obj["anyOf"]:
                    if subtype_obj.get("type") == "array":
                        _merge_type_info(attr_json, subtype_obj)
            elif _merge_type_info(attr_json, attr_obj) is None:
                # A plain property whose type cannot be determined is dropped.
                continue
            attrs_ret.append(attr_json)
    return attrs_ret


def _merge_type_info(record, schema_obj):
    """Copy the type and constraints of ``schema_obj`` into ``record``.

    Returns the detected type name, or ``None`` (leaving ``record`` untouched)
    when the type is not recognised.
    """
    # Also tolerate a helper that returns None outright for unknown types.
    type_name, constraints = extract_level_one_type(schema_obj) or (None, {})
    if type_name is not None:
        record["type"] = type_name
        record.update(constraints)
    return type_name


def extract_level_one_type(attr_obj):
    """Classify a schema object and collect its top-level constraints.

    Args:
        attr_obj: Mapping describing one property (or one ``oneOf``/``anyOf``
            member).

    Returns:
        A ``(type_name, constraints)`` tuple:
          * ``("enum", {"enum": [...]})`` when an ``"enum"`` key is present
            (checked first, regardless of ``"type"``);
          * ``("string", {...})`` with ``"pattern"`` / ``"format"`` if present;
          * ``("integer" | "number", {...})`` with ``"minimum"`` / ``"maximum"``
            if present;
          * ``("boolean", {})``;
          * ``("array", {...})`` with ``"items"`` if present;
          * ``(None, {})`` when ``"type"`` is missing or not one of the above,
            so callers can always unpack the result.
    """
    if "enum" in attr_obj:
        return ("enum", {"enum": attr_obj["enum"]})

    schema_type = attr_obj.get("type")
    if schema_type == "string":
        constraint_keys = ("pattern", "format")
    elif schema_type in ("integer", "number"):
        # The upper bound used to be dropped: "minimum" was checked twice.
        constraint_keys = ("minimum", "maximum")
    elif schema_type == "boolean":
        constraint_keys = ()
    elif schema_type == "array":
        constraint_keys = ("items",)
    else:
        # Missing, unsupported ("object", "null", ...) or list-valued type.
        return (None, {})

    constraints = {key: attr_obj[key] for key in constraint_keys if key in attr_obj}
    return (schema_type, constraints)


def extract_description(attr_obj):
    """Return the description of a property, or ``""`` when it has none.

    Lookup order:
      1. ``attr_obj["description"]``
      2. ``attr_obj["common"]["description"]`` (only when ``"common"`` is a
         mapping)

    Args:
        attr_obj: Mapping describing one property.

    Returns:
        The first description found, otherwise an empty string.
    """
    if "description" in attr_obj:
        return attr_obj["description"]

    common = attr_obj.get("common")
    if isinstance(common, dict) and "description" in common:
        return common["description"]

    # NOTE(review): an "enumDef" branch used to collect per-value descriptions
    # into a dict that was never returned (and raised KeyError when a value had
    # no "description"). The result was always "", so that dead code is gone;
    # surfacing enum value descriptions would need a separate output field.
    return ""


# Flatten the raw GDC schema dump and save the result. The output file is the
# one named in JSON_SCHEMA_FILES and read by the JSONLoader cells, so it has to
# exist before those cells can run.
parsed_gdc_json = parse_gdc_yaml("../resources/gdc_schema.json")

with open("../resources/better_gdc_schema.json", mode="w") as f:
    f.write(json.dumps(parsed_gdc_json, indent=4))