In [1]:
%pwd

'e:\\LangChain project\\DroneScripts-RAG\\research'

In [2]:
import os
os.chdir("../")

In [3]:
%pwd

'e:\\LangChain project\\DroneScripts-RAG'

In [4]:
from langchain.document_loaders import PythonLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [43]:
file_path = "Data/client.py"

In [44]:
loader = PythonLoader(file_path)

documents = loader.load()
print(documents)



In [45]:
len(documents[0].page_content)

27175

In [46]:
documents[0].page_content



In [47]:
import pprint

# Basic chunking

In [48]:
def text_split(extracted_data):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks=text_splitter.split_documents(extracted_data)
    return text_chunks

text_chunks=text_split(documents)

In [49]:
pprint.pprint(text_chunks[60].page_content)

('def moveOnPathAsync(self, path, velocity, timeout_sec = 3e+38, drivetrain = '
 'DrivetrainType.MaxDegreeOfFreedom, yaw_mode = YawMode(),\n'
 "        lookahead = -1, adaptive_lookahead = 1, vehicle_name = ''):\n"
 "        return self.client.call_async('moveOnPath', path, velocity, "
 'timeout_sec, drivetrain, yaw_mode, lookahead, adaptive_lookahead, '
 'vehicle_name)')


# ATS chunking

In [110]:
import ast
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import uuid

def extract_segments(source_code: str, node: ast.AST, metadata: dict) -> list[dict]:
    """
    Recursive extract code segments từ node, thêm metadata.
    """
    segments = []
    
    if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
        code_segment = ast.get_source_segment(source_code, node)
        if code_segment:
            segments.append({
                'code': code_segment,
                'metadata': {
                    **metadata,
                    'chunk_type': 'function',
                    'name': node.name
                }
            })
    
    # Recursive vào body của class
    if isinstance(node, ast.ClassDef):
        code_segment = ast.get_source_segment(source_code, node)
        # Chỉ thêm class header (không methods) nếu ngắn
        if code_segment and len(code_segment) <= 200:  # Giới hạn nhỏ để tránh class lớn
            segments.append({
                'code': code_segment.split('\n\n')[0],  # Chỉ lấy header
                'metadata': {
                    **metadata,
                    'chunk_type': 'class',
                    'name': node.name
                }
            })
        for subnode in node.body:
            segments.extend(extract_segments(source_code, subnode, metadata))
    
    return segments

def chunk_python_code(doc: Document, max_chunk_len: int = 500) -> list[Document]:
    source_code = doc.page_content
    try:
        tree = ast.parse(source_code)
    except SyntaxError:
        # Fallback nếu code lỗi
        splitter = RecursiveCharacterTextSplitter(chunk_size=max_chunk_len, chunk_overlap=50, separators=["\n\n", "\n", " ", ""])
        return splitter.split_documents([doc])
    
    chunks = []
    
    # Chunk imports riêng
    imports = [ast.get_source_segment(source_code, n) for n in tree.body if isinstance(n, (ast.Import, ast.ImportFrom))]
    if imports:
        import_str = "\n".join(filter(None, imports))
        chunks.append(Document(
            page_content=import_str,
            metadata={**doc.metadata, 'chunk_type': 'imports'},
            artifact_id=str(uuid.uuid4())
        ))
    
    # Extract segments recursive
    all_segments = []
    for node in tree.body:
        if not isinstance(node, (ast.Import, ast.ImportFrom)):
            all_segments.extend(extract_segments(source_code, node, doc.metadata))
    
    # Tích lũy segments vào chunks
    current_chunk = []
    current_len = 0
    for segment in all_segments:
        segment_len = len(segment['code'])
        
        # Flush nếu vượt max
        if current_len + segment_len > max_chunk_len and current_chunk:
            chunk_content = "\n\n".join(c['code'] for c in current_chunk)
            chunks.append(Document(
                page_content=chunk_content,
                metadata={**current_chunk[0]['metadata'], 'chunk_type': 'code_group'},
                artifact_id=str(uuid.uuid4())
            ))
            current_chunk = []
            current_len = 0
        
        current_chunk.append(segment)
        current_len += segment_len
        
        # Split nếu segment đơn lẻ quá dài
        if segment_len > max_chunk_len:
            splitter = RecursiveCharacterTextSplitter(chunk_size=max_chunk_len, chunk_overlap=50, separators=["\n\n", "\n", " ", ""])
            sub_docs = splitter.split_text(segment['code'])
            for sub in sub_docs:
                chunks.append(Document(
                    page_content=sub,
                    metadata={**segment['metadata'], 'chunk_type': 'split_segment'},
                    artifact_id=str(uuid.uuid4())
                ))
            current_chunk = []
            current_len = 0
    
    # Flush cuối
    if current_chunk:
        chunk_content = "\n\n".join(c['code'] for c in current_chunk)
        chunks.append(Document(
            page_content=chunk_content,
            metadata={**current_chunk[0]['metadata'], 'chunk_type': 'code_group'},
            artifact_id=str(uuid.uuid4())
        ))
    
    return chunks

def chunk_all_documents(docs: list[Document], max_chunk_len: int = 1500) -> list[Document]:
    """Apply chunking cho toàn bộ list Document"""
    all_chunks = []
    for doc in docs:
        all_chunks.extend(chunk_python_code(doc, max_chunk_len))
    return all_chunks

In [111]:
chunks = chunk_all_documents(documents)

In [112]:
len(chunks)

22

In [132]:
chunks[7]

Document(metadata={'source': 'Data/client.py', 'chunk_type': 'code_group', 'name': 'simGetImage'}, page_content='def simGetImage(self, camera_name, image_type, vehicle_name = \'\', external = False):\n        """\n        Get a single image\n\n        Returns bytes of png format image which can be dumped into abinary file to create .png image\n        `string_to_uint8_array()` can be used to convert into Numpy unit8 array\n        See https://microsoft.github.io/AirSim/image_apis/ for details\n\n        Args:\n            camera_name (str): Name of the camera, for backwards compatibility, ID numbers such as 0,1,etc. can also be used\n            image_type (ImageType): Type of image required\n            vehicle_name (str, optional): Name of the vehicle with the camera\n            external (bool, optional): Whether the camera is an External Camera\n\n        Returns:\n            Binary string literal of compressed png image\n        """\n#todo : in future remove below, it\'s only for

In [118]:
print("Length of Chunks", len(chunks))

Length of Chunks 22


In [119]:
chunks[0].page_content

'from __future__ import print_function\nfrom .utils import *\nfrom .types import *\nimport msgpackrpc\nimport numpy as np\nimport msgpack\nimport time\nimport math\nimport logging'

In [63]:
from langchain.embeddings import HuggingFaceEmbeddings

#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    embeddings=HuggingFaceEmbeddings(model_name='jinaai/jina-embeddings-v2-base-code')
    return embeddings

In [64]:
embeddings = download_hugging_face_embeddings()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertModel were not initialized from the model checkpoint at jinaai/jina-embeddings-v2-base-code and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.10.intermediate.dense.bias', 'encoder.layer.10.

In [26]:
from langchain.embeddings import HuggingFaceEmbeddings
#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    embeddings=HuggingFaceEmbeddings(model_name='microsoft/codebert-base')
    return embeddings

embeddings2 = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='microsoft/codebert-base')
No sentence-transformers model found with name microsoft/codebert-base. Creating a new one with mean pooling.


In [120]:
#test the embeddings
query_result = embeddings2.embed_query("Hello world")
print("Length", len(query_result))

Length 768


In [29]:
from dotenv import load_dotenv
load_dotenv()

True

In [30]:
PINECONE_API_KEY=os.environ.get('PINECONE_API_KEY')

In [134]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "drone-rag-index-codebert"

pc.create_index(
    name=index_name,
    dimension=768, 
    metric="cosine", 
    spec=ServerlessSpec(
        cloud="aws", 
        region="us-east-1"
    ) 
) 

{
    "name": "drone-rag-index-codebert",
    "metric": "cosine",
    "host": "drone-rag-index-codebert-1uz26v7.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 768,
    "deletion_protection": "disabled",
    "tags": null
}

In [135]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=chunks,
    index_name=index_name,
    embedding=embeddings2, 
)

In [136]:
# Load Existing index 

from langchain_pinecone import PineconeVectorStore
# Embed each chunk and upsert the embeddings into your Pinecone index.
docsearch = PineconeVectorStore.from_existing_index(
    index_name="drone-rag-index-codebert",
    embedding=embeddings2
)

In [137]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x240852f3320>

In [138]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":15})

In [141]:
retrieved_docs = retriever.invoke("takeoff")

In [142]:
retrieved_docs

[Document(id='23ef557e-e8ec-491c-ae44-afd0530a143d', metadata={'chunk_type': 'imports', 'source': 'Data/client.py'}, page_content='from __future__ import print_function\nfrom .utils import *\nfrom .types import *\nimport msgpackrpc\nimport numpy as np\nimport msgpack\nimport time\nimport math\nimport logging'),
 Document(id='793b38d3-3680-4d47-8832-d3547600f392', metadata={'chunk_type': 'code_group', 'name': 'simGetImage', 'source': 'Data/client.py'}, page_content='def simGetImage(self, camera_name, image_type, vehicle_name = \'\', external = False):\n        """\n        Get a single image\n\n        Returns bytes of png format image which can be dumped into abinary file to create .png image\n        `string_to_uint8_array()` can be used to convert into Numpy unit8 array\n        See https://microsoft.github.io/AirSim/image_apis/ for details\n\n        Args:\n            camera_name (str): Name of the camera, for backwards compatibility, ID numbers such as 0,1,etc. can also be used\n 

In [143]:
from langchain.retrievers import EnsembleRetriever
from langchain.vectorstores import Pinecone
from langchain_community.retrievers import BM25Retriever

# Vector retriever
vector_retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 15}
)

# BM25 retriever 
bm25_retriever = BM25Retriever.from_documents(documents)  
bm25_retriever.k = 15

# Hybrid retriever
hybrid_retriever = EnsembleRetriever(
    retrievers=[vector_retriever, bm25_retriever],
    weights=[0.7, 0.3]   # weights can be adjusted as needed
)

retrieved_docs = hybrid_retriever.invoke("takeoff")


In [144]:
retrieved_docs

[Document(id='23ef557e-e8ec-491c-ae44-afd0530a143d', metadata={'chunk_type': 'imports', 'source': 'Data/client.py'}, page_content='from __future__ import print_function\nfrom .utils import *\nfrom .types import *\nimport msgpackrpc\nimport numpy as np\nimport msgpack\nimport time\nimport math\nimport logging'),
 Document(id='793b38d3-3680-4d47-8832-d3547600f392', metadata={'chunk_type': 'code_group', 'name': 'simGetImage', 'source': 'Data/client.py'}, page_content='def simGetImage(self, camera_name, image_type, vehicle_name = \'\', external = False):\n        """\n        Get a single image\n\n        Returns bytes of png format image which can be dumped into abinary file to create .png image\n        `string_to_uint8_array()` can be used to convert into Numpy unit8 array\n        See https://microsoft.github.io/AirSim/image_apis/ for details\n\n        Args:\n            camera_name (str): Name of the camera, for backwards compatibility, ID numbers such as 0,1,etc. can also be used\n 

In [42]:
retrieved_docs[0].page_content

def get_context(retrieved_docs):
    context = ""
    for doc in retrieved_docs:
        context += doc.page_content + "\n"
    return context

s = get_context(retrieved_docs)
print(s)


def moveByRC(self, rcdata = RCData(), vehicle_name = ''):
        return self.client.call('moveByRC', rcdata, vehicle_name)

#low - level control API
    def moveByMotorPWMsAsync(self, front_right_pwm, rear_left_pwm, front_left_pwm, rear_right_pwm, duration, vehicle_name = ''):
        """
        - Directly control the motors using PWM values
Returns:
            msgpackrpc.future.Future: future. call .join() to wait for method to finish. Example: client.METHOD().join()
        """
        return self.client.call_async('moveByAngleRatesZ', roll_rate, -pitch_rate, -yaw_rate, z, duration, vehicle_name)
- This function should only be called if the default angle rate control PID gains need to be modified.
Returns:
            msgpackrpc.future.Future: future. call .join() to wait for method to finish. Example: client.METHOD().join()
        """
        return self.client.call_async('moveByRollPitchYawrateZ', roll, -pitch, -yaw_rate, z, duration, vehicle_name)
Returns:
            msgpackr

In [82]:
GEMINI_API_KEY=os.environ.get('GEMINI_API_KEY')
os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY

In [96]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash-lite",
    google_api_key=GEMINI_API_KEY,
    temperature=0.2,
    max_output_tokens=2048
)

In [97]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are a mission planner for an autonomous drone. "
    "Your task is to generate executable mission scripts in Python "
    "using only the provided drone actions. "
    "Always follow this template:\n\n"
    "from drone_actions import ...\n\n"
    "objects_list = [....]\n\n"
    "def mission():\n"
    "    # Step 0: ...\n"
    "    ...\n"
    "    # Step 1: ...\n"
    "   ...\n"
    "    # Final step: ...\n"
    "    ...\n"
    "    ...\n\n"
    "Constraints:\n"
    "- Use ONLY the available primitives.\n"
    "- Always structure code with numbered comments (# Step 1, # Step 2,...).\n"
    "- If mission is unclear, still generate a skeleton with TODO comments.\n\n"
    "{context}"
)


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [98]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

rag_chain = (
    {"context": lambda x: retriever.invoke(x["input"]), "input": RunnablePassthrough() }
    | prompt
    | llm
    | StrOutputParser()
)

In [99]:
response = rag_chain.invoke({"input": "write a mission script to take off, fly to 10 meters, and land"})
print(response)

```python
from drone_actions import takeoff, moveByRollPitchYawThrottleAsync, land

objects_list = []

def mission():
    # Step 0: Arm the drone (implicitly done by takeoff)
    # Step 1: Take off
    takeoff(timeout_sec=5)
    # Step 2: Fly to 10 meters
    moveByRollPitchYawThrottleAsync(roll=0, pitch=0, yaw=0, throttle=0.5, duration=5) # Assuming throttle 0.5 roughly corresponds to climbing
    # Step 3: Land
    land()
```


In [100]:
response

'```python\nfrom drone_actions import takeoff, moveByRollPitchYawThrottleAsync, land\n\nobjects_list = []\n\ndef mission():\n    # Step 0: Arm the drone (implicitly done by takeoff)\n    # Step 1: Take off\n    takeoff(timeout_sec=5)\n    # Step 2: Fly to 10 meters\n    moveByRollPitchYawThrottleAsync(roll=0, pitch=0, yaw=0, throttle=0.5, duration=5) # Assuming throttle 0.5 roughly corresponds to climbing\n    # Step 3: Land\n    land()\n```'