In [2]:
import os
from dotenv import load_dotenv
from pathlib import Path
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain import hub
from langchain_core.output_parsers import StrOutputParser

load_dotenv()

# Confirm the configuration was picked up WITHOUT echoing the secret itself:
# cell output is stored inside the notebook file, so printing the key leaks it
# to anyone the notebook is shared with or committed for.
print("Google API Key set:", bool(os.getenv("GOOGLE_API_KEY")))
print("Chroma Telemetry:", os.getenv("CHROMA_TELEMETRY"))



Google API Key: [REDACTED - this key was exposed in saved cell output; revoke and rotate it]
Chroma Telemetry: FALSE


In [3]:


# Point at a small local file from your repo to test
# sample_file = Path.cwd() / "../aso-enrichment-tool" / "CLAUDE.md"
# docs = TextLoader(str(sample_file), autodetect_encoding=True).load()

# # Use a language-aware splitter; Markdown here to match the CLAUDE.md sample file
# splitter = RecursiveCharacterTextSplitter.from_language(
#     language=Language.MARKDOWN, chunk_size=1200, chunk_overlap=150
# )
# chunks = splitter.split_documents(docs)
# print(chunks[0].page_content[:1000])  # Print first 1000 characters of the first chunk




In [4]:
from pathlib import Path
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language

# 1) Repository to index: a sibling folder of the current working directory.
repo_folder = Path.cwd().parent / "aso-enrichment-tool"

# 2) Recursively collect every Python file. `loader_kwargs` is forwarded to
#    TextLoader so it auto-detects each file's encoding.
loader = DirectoryLoader(
    path=str(repo_folder),
    glob="**/*.py",
    loader_cls=TextLoader,
    loader_kwargs={"autodetect_encoding": True},
)

# 3) Load one document per matched file.
docs = loader.load()
print(f"Loaded {len(docs)} Python files as documents")

# 4) Split the code into overlapping chunks using Python-aware separators.
splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=800,
    chunk_overlap=100,
)
chunks = splitter.split_documents(docs)
print(f"Created {len(chunks)} chunks in total")

# Fail fast with an actionable message instead of a bare IndexError on
# `chunks[0]` when the glob matched nothing (or only empty files).
if not chunks:
    raise ValueError(
        f"No indexable Python code found under {repo_folder}; "
        "check the repository path and the glob pattern."
    )

print(chunks[0].page_content[:1000])  # preview first chunk


Loaded 75 Python files as documents
Created 1952 chunks in total
import os
import json
import logging
from logging.handlers import RotatingFileHandler
import pandas as pd
from langchain.schema import HumanMessage
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
from datetime import datetime
from colorama import Fore, Style
import sys
from langchain_google_genai import ChatGoogleGenerativeAI
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from config import (
    AGENT_PROMPT_MD_GET_CONFIDENCE_SCORE,
)


In [None]:
# Embedding model used to vectorise every chunk.
emb = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

# Deterministic ids keep this cell idempotent. Without them every run gets
# fresh random ids, so re-running the cell in the same kernel adds all chunks
# to "my_vector_store" a second time and retrieval starts returning duplicates.
# With stable ids the store overwrites matching entries instead.
# The position suffix keeps ids unique even if two chunks share a source path.
chunk_ids = [
    f"{chunk.metadata.get('source', 'unknown')}::{position}"
    for position, chunk in enumerate(chunks)
]

vectorstore = Chroma.from_documents(
    chunks,
    embedding=emb,
    ids=chunk_ids,
    collection_name="my_vector_store",
)

# Sanity check: number of stored entries (should equal len(chunks)).
print(vectorstore._collection.count())

# Retriever returning the 10 most similar chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
print("Vectorstore created and retriever ready.", retriever)


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


1952
Vectorstore created and retriever ready. tags=['Chroma', 'GoogleGenerativeAIEmbeddings'] vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x11ad30590> search_kwargs={'k': 10}


In [6]:


# -------------------------------------------------------------
# Gemini chat model: the generator used by the chains below
# -------------------------------------------------------------
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
)

def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in input order, separated by a blank line.
    """
    page_texts = []
    for document in docs:
        page_texts.append(document.page_content)
    return "\n\n".join(page_texts)


In [11]:


from langchain_core.prompts import PromptTemplate

# Hand-written RAG prompt, used in place of the hub's "rlm/rag-prompt".
# `{question}` and `{context}` are the two template variables.
prompt = "".join(
    [
        "You are an expert in code analysis and can help users understand the code snippets.\n",
        "You can also compare data across files to identify similarities and differences.\n",
        "Use the following pieces of retrieved context to answer the question. Use below format for interaction.\n",
        "Question: {question}\n",
        "Context: {context}\n",
        "Answer:",
    ]
)
prompt_template = PromptTemplate.from_template(prompt)

In [12]:

# RAG pipeline: the incoming question is (a) sent to the retriever, whose hits
# are flattened by format_docs into the context, and (b) passed through
# unchanged; both fill the prompt, which goes to the LLM and is parsed to str.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_template
    | llm
    | StrOutputParser()
)

In [13]:
# Smoke-test the chain; the returned answer string is displayed as the cell output.
rag_chain.invoke(
    "How many scripts are there in utils folder?"
)

Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


'Answer:\n\nThere are five scripts in the `utils` folder based on the provided code snippets:\n\n1. `format_script.py` (inferred from `from utils.format_script import CSVFormatProcessor`)\n2. `merge_formatted_csv.py` (inferred from `from utils.merge_formatted_csv import merge_formatted_file`)\n3. `find_missing_rows.py` (inferred from `from utils.find_missing_rows import main as find_missing_rows_main`)\n4. `update_category.py` (inferred from `from utils.update_category import main as update_category_main`)\n5. `merged_output_input_values.py` (inferred from `from utils.merged_output_input_values import main as merge_output_input_values`)\n6. `format_script_distribution.py` (inferred from `from utils.format_script_distribution import main as format_script_distribution`)'

In [22]:
import gradio as gr
from langchain.chains import ConversationalRetrievalChain
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.vectorstores import Chroma

# Prerequisites defined in earlier cells:
#   - retriever: built from the Chroma vectorstore
#   - llm:       ChatGoogleGenerativeAI(model="gemini-1.5-flash")

# Conversational retrieval chain; results also carry the retrieved documents
# under "source_documents" because return_source_documents is enabled.
chatbot_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

# Separator between the model's answer and the appended source preview.
_SOURCE_MARKER = "\n\nSource snippet:\n"

# Mirror of the most recent conversation as (user, assistant) tuples. Kept as a
# module-level name so other cells can still inspect it; it is no longer the
# source of truth for what the chain sees.
chat_history = []


def _history_to_pairs(history):
    """Convert Gradio chat history into (user, assistant) tuples for LangChain.

    Accepts both "messages"-style entries (dicts with ``role``/``content``) and
    pair-style entries (``[user, assistant]``). Non-text content is skipped, and
    the source preview that ``chat_fn`` appends to replies is stripped so only
    the model's actual answer is replayed to the chain.
    """
    pairs = []
    pending_user = None
    for entry in history or []:
        if isinstance(entry, dict):
            role = entry.get("role")
            content = entry.get("content")
            if not isinstance(content, str):
                continue  # e.g. file/component payloads cannot be replayed as text
            if role == "user":
                pending_user = content
            elif role == "assistant" and pending_user is not None:
                pairs.append((pending_user, content.split(_SOURCE_MARKER, 1)[0]))
                pending_user = None
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            user_text, bot_text = entry
            if isinstance(user_text, str) and isinstance(bot_text, str):
                pairs.append((user_text, bot_text.split(_SOURCE_MARKER, 1)[0]))
    return pairs


def chat_fn(user_input, history):
    """Answer one chat message; called by Gradio for every user turn.

    The conversation context is rebuilt from ``history`` (the transcript Gradio
    passes in for this session) rather than from a process-wide list. A shared
    global would mix the context of different browser sessions and would keep
    old turns after the user clears the chat.

    Args:
        user_input: The user's latest message.
        history: Prior turns of this chat as supplied by Gradio.

    Returns:
        The chain's answer, followed by up to 300 characters of the first
        retrieved source document when one is available.
    """
    global chat_history

    prior_turns = _history_to_pairs(history)

    # Pass (user, assistant) tuples to LangChain.
    result = chatbot_chain.invoke({
        "question": user_input,
        "chat_history": prior_turns
    })

    answer = result["answer"]

    # Keep the inspectable mirror in sync with the conversation just served.
    chat_history = prior_turns + [(user_input, answer)]

    # Optionally include the first source doc for reference.
    sources = ""
    if result.get("source_documents"):
        sources = _SOURCE_MARKER + result["source_documents"][0].page_content[:300]

    return answer + sources

# Build the Gradio chat UI and keep a handle to the interface itself.
# launch() returns (app, local_url, share_url); chaining it onto the
# constructor bound `iface` to that tuple, so the running server could not be
# stopped with `iface.close()` or relaunched with other options.
iface = gr.ChatInterface(
    fn=chat_fn,
    type="messages",
    title="Gemini RAG Chatbot",
    description="Ask questions about your repo code/docs.",
)
iface.launch()


* Running on local URL:  http://127.0.0.1:7866
* To create a public link, set `share=True` in `launch()`.
