In [1]:
import os
import argparse
import logging
import pandas as pd
import scanpy as sc
import sys
from dotenv import load_dotenv

from mdvtools.mdvproject import MDVProject
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.schema.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import Language
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import langchain_experimental.agents.agent_toolkits.pandas.base as lp

# packages for custom langchain agent
from langchain_core.prompts import ChatPromptTemplate
from langchain_experimental.tools.python.tool import PythonAstREPLTool
from langchain.agents import create_openai_functions_agent, AgentExecutor

from mdvtools.llm.local_files_utils import crawl_local_repo, extract_python_code_from_py, extract_python_code_from_ipynb
from mdvtools.llm.templates import prompt_data, get_createproject_prompt_RAG
from mdvtools.llm.code_manipulation import prepare_code
from mdvtools.llm.code_execution import execute_code

# packages for history
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI  # Use appropriate model
from langchain.schema import HumanMessage


from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.messages import HumanMessage

In [4]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_experimental.tools.python.tool import PythonAstREPLTool
from langchain.agents import create_openai_functions_agent, AgentExecutor


from langchain.chains import LLMChain


def create_custom_pandas_agent(llm, dfs: dict, prompt_data, verbose=False):
    """
    Creates a LangChain agent that can interact with Pandas DataFrames using a Python REPL tool.
    """
    
    # Step 1: Initialize Memory with Chat History
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    
    # Step 2: Create the Python REPL Tool
    python_tool = PythonAstREPLTool()
    if python_tool.globals is None:
        python_tool.globals = {}
    
    python_tool.globals.update(dfs)
    python_tool.globals["list_globals"] = lambda: list(python_tool.globals.keys())
    
    # Step 3: Define Contextualization Chain
    contextualize_q_system_prompt = """Given a chat history and the latest user question \
    which might reference context in the chat history, formulate a standalone question \
    which can be understood without the chat history. Do NOT answer the question, \
    just reformulate it if needed and otherwise return it as is."""
    
    contextualize_prompt = ChatPromptTemplate.from_messages([
        ("system", contextualize_q_system_prompt),
        ("human", "Chat History:\n{chat_history}\n\nUser Question:\n{input}"),
    ])
    
    contextualize_chain = LLMChain(llm=llm, prompt=contextualize_prompt, memory=memory)
    
    # Step 4: Define the Agent Prompt
    prompt_data = f"""You have access to the following Pandas DataFrames: 
    {', '.join(dfs.keys())}. These are preloaded, so do not redefine them.
    If you need to check their structure, use `df.info()` or `df.head()`.
    Before running any code, check available variables using `list_globals()`.""" + prompt_data
    
    prompt = ChatPromptTemplate.from_messages([
        ("system", prompt_data),
        ("human", "{input}"),
        ("ai", "{agent_scratchpad}"),
    ])
    
    # Step 5: Create the Pandas Agent
    agent = create_openai_functions_agent(llm, [python_tool], prompt)
    
    # Step 6: Wrap in an Agent Executor
    agent_executor = AgentExecutor(agent=agent, tools=[python_tool], memory=memory, verbose=verbose)
    
    # Step 7: Wrapper Function to Use Contextualization and Preserve Memory
    def agent_with_contextualization(question):
        standalone_question = contextualize_chain.run(input=question)
        response = agent_executor.invoke({"input": standalone_question})
        memory.save_context({"input": question}, {"output": response.get("output", str(response))})
        return response
    
    return agent_with_contextualization

In [7]:
# Load environment variables
load_dotenv()

# Define paths
project_path = os.path.expanduser("../../../mdv/automation9/")
dataset_path = "../../../mdv/automation9/ilc_viz_ready_revised.h5ad"

# Load dataset
adata = sc.read_h5ad(dataset_path)
cells_df = pd.DataFrame(adata.obs)
genes_df = pd.DataFrame(adata.var).reset_index()

# Initialize project and add datasources
project = MDVProject(project_path, delete_existing=False)
project.add_datasource("datasource_name", cells_df)
project.add_datasource("datasource_name2", genes_df)

# Retrieve data sources
datasource_names = [ds['name'] for ds in project.datasources[:2]]
df_list = [project.get_datasource_as_dataframe(ds['name']) for ds in project.datasources[:2]]

# Initialize LLMs
code_llm = ChatOpenAI(temperature=0.1, model="gpt-4o")
dataframe_llm = ChatOpenAI(temperature=0.1, model="gpt-4o")

# Crawl repository for code
code_files_urls = crawl_local_repo()
code_strings = [Document(page_content=extract_python_code_from_py(file), metadata={"url": file})
                if file.endswith(".py") else
                Document(page_content=extract_python_code_from_ipynb(file), metadata={"url": file})
                for file in code_files_urls if file.endswith(('.py', '.ipynb'))]

# Process code for RAG
text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=20000, chunk_overlap=2000
)
texts = text_splitter.split_documents(code_strings)
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
db = FAISS.from_documents(texts, embeddings)
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# Create agent
agent_executor = create_custom_pandas_agent(
    dataframe_llm, {"df1": df_list[0], "df2": df_list[1]}, prompt_data, verbose=True
)

# Setup RAG prompts
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

history_aware_retriever = create_history_aware_retriever(code_llm, retriever, contextualize_q_prompt)

# Question loop with sequential processing
chat_history = []
questions = ["Can you plot a scatter plot?", "Can you change the params?"]

for question in questions:
    # Agent processing
    response = agent_executor(question)#{"input": question})
    print(f"Agent Response: {response['output']}")
    
    # Update RAG prompt with the latest agent response
    prompt_RAG = get_createproject_prompt_RAG(project, dataset_path, datasource_names[0], response['output'])
    
    qa_prompt = ChatPromptTemplate.from_messages([
        ("system", prompt_RAG),
        MessagesPlaceholder("chat_history"),
        ("human", "{context}\n\n{input}\n\n{question}"),
    ])
    
    question_answer_chain = create_stuff_documents_chain(code_llm, qa_prompt)
    rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
    
    # RAG processing
    ai_msg = rag_chain.invoke({"chat_history": chat_history, "input": question, "question": question})
    chat_history.extend([HumanMessage(content=question), ai_msg["answer"]])
    
    print(f"RAG Response: {ai_msg['answer']}")


starting add_datasource
is ds None? None
got passed the ds check
created h5 group without error
- adding column 'sample_id' to datasource 'datasource_name'
- adding column 'doublet_scores' to datasource 'datasource_name'
- adding column 'predicted_doublets' to datasource 'datasource_name'
- adding column 'n_genes_by_counts' to datasource 'datasource_name'
- adding column 'total_counts' to datasource 'datasource_name'
- adding column 'total_counts_mt' to datasource 'datasource_name'
- adding column 'pct_counts_mt' to datasource 'datasource_name'
- adding column 'total_counts_rp' to datasource 'datasource_name'
- adding column 'pct_counts_rp' to datasource 'datasource_name'
- adding column 'total_counts_hb' to datasource 'datasource_name'
- adding column 'pct_counts_hb' to datasource 'datasource_name'
- adding column 'total_counts_ig' to datasource 'datasource_name'
- adding column 'pct_counts_ig' to datasource 'datasource_name'
- adding column 'S_score' to datasource 'datasource_name'
-

  code_llm = ChatOpenAI(temperature=0.1, model="gpt-4o")
  contextualize_chain = LLMChain(llm=llm, prompt=contextualize_prompt, memory=memory)
  standalone_question = contextualize_chain.run(input=question)
Error in StdOutCallbackHandler.on_chain_start callback: AttributeError("'NoneType' object has no attribute 'get'")


[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': 'df1.info()'}`


[0m[36;1m[1;3m<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3680 entries, 0 to 3679
Data columns (total 25 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   sample_id           3680 non-null   object 
 1   doublet_scores      3680 non-null   float32
 2   predicted_doublets  3680 non-null   object 
 3   n_genes_by_counts   3680 non-null   float32
 4   total_counts        3680 non-null   float32
 5   total_counts_mt     3680 non-null   float32
 6   pct_counts_mt       3680 non-null   float32
 7   total_counts_rp     3680 non-null   float32
 8   pct_counts_rp       3680 non-null   float32
 9   total_counts_hb     3680 non-null   float32
 10  pct_counts_hb       3680 non-null   float32
 11  total_counts_ig     3680 non-null   float32
 12  pct_counts_ig       3680 non-null   float32
 13  S_score             3680 non-null   float32
 14  G2M_score   

Error in StdOutCallbackHandler.on_chain_start callback: AttributeError("'NoneType' object has no attribute 'get'")


[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': 'list_globals()'}`


[0m[36;1m[1;3m['df1', 'df2', 'list_globals', '__builtins__'][0m[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': 'df1.info()'}`


[0m[36;1m[1;3m<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3680 entries, 0 to 3679
Data columns (total 25 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   sample_id           3680 non-null   object 
 1   doublet_scores      3680 non-null   float32
 2   predicted_doublets  3680 non-null   object 
 3   n_genes_by_counts   3680 non-null   float32
 4   total_counts        3680 non-null   float32
 5   total_counts_mt     3680 non-null   float32
 6   pct_counts_mt       3680 non-null   float32
 7   total_counts_rp     3680 non-null   float32
 8   pct_counts_rp       3680 non-null   float32
 9   total_counts_hb     3680 non-null   float32
 10  pct_counts_hb       3680 non-null   float32
 11  total_counts_