![Img](https://app.theheadstarter.com/static/hs-logo-opengraph.png)

# Headstarter Codebase RAG Project

![Screenshot 2024-11-25 at 7 12 58 PM](https://github.com/user-attachments/assets/48dd9de1-b4d2-4318-8f52-85ec209d8ebc)

# Install Necessary Libraries

In [1]:
! pip install pygithub langchain langchain-community openai tiktoken pinecone-client langchain_pinecone sentence-transformers

Collecting pygithub
  Downloading PyGithub-2.6.1-py3-none-any.whl.metadata (3.9 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.22-py3-none-any.whl.metadata (2.4 kB)
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting pinecone-client
  Downloading pinecone_client-6.0.0-py3-none-any.whl.metadata (3.4 kB)
Collecting langchain_pinecone
  Downloading langchain_pinecone-0.2.5-py3-none-any.whl.metadata (1.3 kB)
Collecting pynacl>=1.4.0 (from pygithub)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 

In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from google.colab import userdata
from pinecone import Pinecone
import os
import tempfile
from github import Github, Repository
from git import Repo
from openai import OpenAI
from pathlib import Path
from langchain.schema import Document
from pinecone import Pinecone
import numpy as np

# Clone a GitHub Repo locally

In [3]:
github_repo = "https://github.com/CoderAgent/SecureAgent"

In [5]:
def clone_repository(repo_url):
    """Clone a GitHub repository under ``/content`` and return its local path.

    The target directory is named after the last path segment of ``repo_url``
    (a trailing ``/`` or ``.git`` suffix is ignored), e.g.
    ``https://github.com/CoderAgent/SecureAgent`` -> ``/content/SecureAgent``.

    Args:
        repo_url: The URL of the GitHub repository.

    Returns:
        The path to the cloned repository.
    """
    # Derive the directory name from the URL that was actually passed in,
    # not from the notebook-level ``github_repo`` global.
    repo_name = repo_url.rstrip("/").split("/")[-1]
    if repo_name.endswith(".git"):
        repo_name = repo_name[:-len(".git")]

    repo_path = f"/content/{repo_name}"
    Repo.clone_from(repo_url, repo_path)
    return repo_path

In [6]:
path = clone_repository(github_repo)

# Define which types of files to parse and which files / folders to ignore

In [7]:
# File extensions (including the leading dot) that are treated as source code
# and therefore read and indexed.
SUPPORTED_EXTENSIONS = {
    '.c', '.cpp', '.go', '.h', '.ipynb', '.java', '.js', '.jsx',
    '.py', '.rs', '.swift', '.ts', '.tsx', '.vue',
}

# Directories whose contents are skipped when collecting files: installed
# dependencies, virtual environments, build output, VCS and editor metadata.
IGNORED_DIRS = {
    '.git', '.next', '.vscode', '__pycache__', 'build', 'dist',
    'env', 'node_modules', 'vendor', 'venv',
}

In [9]:
def get_file_content(file_path, repo_path):
    """
    Read one file as UTF-8 text and label it with its repository-relative path.

    Args:
        file_path (str): Path to the file to read
        repo_path (str): Root of the repository; the returned name is
            ``file_path`` expressed relative to this directory

    Returns:
        Optional[Dict[str, str]]: ``{"name": <relative path>, "content": <text>}``,
        or ``None`` if anything went wrong (the error is printed, not raised)
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            text = handle.read()
        relative_name = os.path.relpath(file_path, repo_path)
    except Exception as e:
        # Best effort: unreadable or non-UTF-8 files are reported and skipped.
        print(f"Error processing file {file_path}: {str(e)}")
        return None

    return {"name": relative_name, "content": text}



def get_main_files_content(repo_path: str):
    """
    Get content of supported code files from the local repository.

    Walks ``repo_path`` recursively, never descending into a directory whose
    name is in ``IGNORED_DIRS``, and reads every file whose extension is in
    ``SUPPORTED_EXTENSIONS``. Files that cannot be read are skipped.

    Args:
        repo_path: Path to the local repository

    Returns:
        List of dictionaries containing file names and contents
    """

    files_content = []

    try:

        for root, dirs, files in os.walk(repo_path):
            # Prune in place so os.walk does not descend into ignored
            # directories at all. Matching on the exact directory name (rather
            # than a substring of the full path) keeps folders such as
            # "environment" or "builder" from being dropped by accident.
            dirs[:] = [d for d in dirs if d not in IGNORED_DIRS]

            for file in files:
                if os.path.splitext(file)[1] not in SUPPORTED_EXTENSIONS:
                    continue

                file_content = get_file_content(os.path.join(root, file), repo_path)

                # get_file_content returns None for files it could not read.
                if file_content:
                    files_content.append(file_content)

    except Exception as e:
        # Best effort: report the problem and return whatever was collected.
        print(e)

    return files_content

In [10]:
files_content = get_main_files_content(path)

In [11]:
files_content

[{'name': 'src/app.ts',
  'content': 'import { Octokit } from "@octokit/rest";\nimport { createNodeMiddleware } from "@octokit/webhooks";\nimport { WebhookEventMap } from "@octokit/webhooks-definitions/schema";\nimport * as http from "http";\nimport { App } from "octokit";\nimport { Review } from "./constants";\nimport { env } from "./env";\nimport { processPullRequest } from "./review-agent";\nimport { applyReview } from "./reviews";\n\n// This creates a new instance of the Octokit App class.\nconst reviewApp = new App({\n  appId: env.GITHUB_APP_ID,\n  privateKey: env.GITHUB_PRIVATE_KEY,\n  webhooks: {\n    secret: env.GITHUB_WEBHOOK_SECRET,\n  },\n});\n\nconst getChangesPerFile = async (payload: WebhookEventMap["pull_request"]) => {\n  try {\n    const octokit = await reviewApp.getInstallationOctokit(\n      payload.installation.id\n    );\n    const { data: files } = await octokit.rest.pulls.listFiles({\n      owner: payload.repository.owner.login,\n      repo: payload.repository.na

# Embeddings

In [12]:
# Loaded SentenceTransformer models, keyed by model name, so repeated calls
# do not re-load the same model from scratch.
_EMBEDDING_MODELS = {}


def get_huggingface_embeddings(text, model_name="sentence-transformers/all-mpnet-base-v2"):
    """Embed ``text`` with a sentence-transformers model.

    Args:
        text: The input passed straight to ``SentenceTransformer.encode``.
        model_name: Name of the sentence-transformers model to use.

    Returns:
        Whatever ``model.encode(text)`` returns (a NumPy array for a single
        string, as shown by the notebook output).
    """
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        # Instantiating the model is the expensive step; do it once per name.
        model = _EMBEDDING_MODELS[model_name] = SentenceTransformer(model_name)
    return model.encode(text)

In [27]:
text = "I like running"

embedding = get_huggingface_embeddings(text)

In [28]:
embedding

array([-7.04788268e-02,  1.94333866e-02, -1.86341424e-02,  4.69544753e-02,
        3.61323655e-02,  5.14774658e-02, -9.41505358e-02,  1.51251704e-02,
       -2.15133047e-03, -1.39002576e-02,  1.19065330e-03,  9.69313737e-03,
        1.96244307e-02,  1.96997877e-02,  1.16340872e-02, -5.16801290e-02,
       -4.76255380e-02, -4.00664844e-03,  9.41014290e-03, -1.29923569e-02,
        8.50989018e-05,  7.24124210e-03, -8.99291970e-03, -6.52526096e-02,
       -1.28775183e-02, -1.80495866e-02,  3.14601250e-02, -4.64244150e-02,
        1.11613853e-03,  6.29375875e-02, -3.89754698e-02, -5.87115176e-02,
        2.55934969e-02, -5.96262142e-03,  1.46683931e-06,  1.16431098e-02,
       -2.85385642e-03,  2.02579331e-02,  2.59376634e-02, -1.00705689e-02,
        6.89758211e-02, -3.88326054e-03, -3.99554819e-02,  7.85138924e-03,
       -6.83246106e-02,  5.25738969e-02,  5.47029823e-02, -4.41773087e-02,
       -2.81809531e-02,  1.19776474e-02, -1.52425775e-02,  2.51070913e-02,
       -7.15806559e-02, -

In [29]:
text2 = "I like to run"

embedding2 = get_huggingface_embeddings(text2)

In [30]:
def cosine_similarity_embeddings(embedding1, embedding2):
    """Return the cosine of the angle between two embedding vectors.

    Args:
      embedding1: The first embedding as a NumPy array.
      embedding2: The second embedding as a NumPy array.

    Returns:
      The dot product divided by the product of the two Euclidean norms,
      or 0 when either vector has zero length.
    """
    numerator = np.dot(embedding1, embedding2)
    norms = (np.linalg.norm(embedding1), np.linalg.norm(embedding2))

    # A zero-length vector has no direction; avoid dividing by zero.
    if not all(norms):
        return 0

    return numerator / (norms[0] * norms[1])

In [31]:
cosine_similarity = cosine_similarity_embeddings(embedding, embedding2)

In [32]:
cosine_similarity

np.float32(0.93396336)

# Setting up Pinecone
**1. Create an account on [Pinecone.io](https://app.pinecone.io/)**

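**2. Create a new index called "codebase-rag" and set the dimensions to 768 (this must match the size of the embeddings produced by the `sentence-transformers/all-mpnet-base-v2` model used above). Leave the rest of the settings as they are.**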

![Screenshot 2024-11-24 at 10 58 50 PM](https://github.com/user-attachments/assets/f5fda046-4087-432a-a8c2-86e061005238)



**3. Create an API Key for Pinecone**

![Screenshot 2024-11-24 at 10 44 37 PM](https://github.com/user-attachments/assets/e7feacc6-2bd1-472a-82e5-659f65624a88)


**4. Store your Pinecone API Key within Google Colab's secrets section, and then enable access to it (see the blue checkmark)**

![Screenshot 2024-11-24 at 10 45 25 PM](https://github.com/user-attachments/assets/eaf73083-0b5f-4d17-9e0c-eab84f91b0bc)



In [33]:
# Read the Pinecone API key from Colab secrets once and reuse it below.
pinecone_api_key = userdata.get("PINECONE_API_KEY")

# Also expose it as an environment variable
# (presumably so libraries that read the key from the environment can find it).
os.environ['PINECONE_API_KEY'] = pinecone_api_key

# Initialize Pinecone
pc = Pinecone(api_key=pinecone_api_key)

# Connect to your Pinecone index
pinecone_index = pc.Index("codebase-rag")

In [34]:
vectorstore = PineconeVectorStore(index_name="codebase-rag", embedding=HuggingFaceEmbeddings())

  vectorstore = PineconeVectorStore(index_name="codebase-rag", embedding=HuggingFaceEmbeddings())
  vectorstore = PineconeVectorStore(index_name="codebase-rag", embedding=HuggingFaceEmbeddings())


In [37]:
# Insert the codebase embeddings into Pinecone

# One Document per source file. The relative path is prepended to the text so
# the file name is part of what gets embedded, and is also kept as metadata.
documents = [
    Document(
        page_content=f"{file['name']}\n\n{file['content']}",
        metadata={"source": file['name']},
    )
    for file in files_content
]

# Embed the documents and store them in the "codebase-rag" index, under a
# namespace named after the repository URL.
vectorstore = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=HuggingFaceEmbeddings(),
    index_name="codebase-rag",
    namespace="https://github.com/CoderAgent/SecureAgent",
)



  embedding=HuggingFaceEmbeddings(),


# Perform RAG

1. Get your OpenRouter API Key [here](https://openrouter.ai/settings/keys)

2. Paste your OpenRouter Key into your Google Colab secrets, and make sure to enable permissions for it

![Image](https://github.com/user-attachments/assets/bd64c5aa-952e-4a1e-9ac0-01d8fe93aaa1)


In [38]:
# Point the OpenAI client at OpenRouter's endpoint and authenticate with the
# OpenRouter key stored in Colab secrets.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=userdata.get("OPENROUTER_API_KEY")
)

In [39]:
query = "How is the javascript parser used?"

In [40]:
query_embedding = get_huggingface_embeddings(query)

In [41]:
query_embedding

array([ 5.71991131e-02, -3.48081291e-02, -3.27215418e-02,  5.29010817e-02,
       -3.88054959e-02,  2.21033078e-02,  1.60938818e-02, -1.00735975e-02,
        3.04608904e-02, -6.25874922e-02,  2.71743089e-02,  3.67565900e-02,
        5.69119453e-02,  5.45354113e-02,  4.02487814e-02, -4.73744869e-02,
        3.59910820e-03,  6.65521948e-03,  1.47536704e-02,  3.57538760e-02,
        1.81228686e-02,  1.24748386e-02, -2.07926128e-02,  6.99328259e-02,
       -1.78905539e-02, -1.98267922e-02, -8.77424423e-03, -4.04769043e-03,
       -4.82026935e-02, -1.55368652e-02, -6.26485124e-02, -6.66373223e-03,
        1.43068153e-02, -4.92968187e-02,  1.30648721e-06, -2.02893210e-03,
       -4.47639599e-02,  2.07317546e-02, -2.80540297e-03,  1.37846796e-02,
        4.11506603e-03,  6.87661488e-03, -2.91272085e-02, -6.68382505e-03,
        2.94112526e-02, -4.13797721e-02,  3.90248671e-02, -5.73173016e-02,
        3.29415090e-02,  1.95522374e-03, -7.05714687e-04, -2.74959207e-02,
        8.47589783e-03,  

In [42]:
# Fetch the 5 closest vectors to the query embedding from the namespace the
# codebase was uploaded under. include_metadata=True is needed because the
# file text is read back from each match's metadata.
top_matches = pinecone_index.query(vector=query_embedding.tolist(),
                                   top_k=5,
                                   include_metadata=True,
                                   namespace="https://github.com/CoderAgent/SecureAgent")

In [43]:
top_matches

{'matches': [{'id': 'b9f222f1-c02e-4537-acc5-028de03e1b35',
              'metadata': {'source': 'src/context/language/javascript-parser.ts',
                           'text': 'src/context/language/javascript-parser.ts\n'
                                   '\n'
                                   'import { AbstractParser, EnclosingContext '
                                   '} from "../../constants";\n'
                                   'import * as parser from "@babel/parser";\n'
                                   'import traverse, { NodePath, Node } from '
                                   '"@babel/traverse";\n'
                                   '\n'
                                   'const processNode = (\n'
                                   '  path: NodePath<Node>,\n'
                                   '  lineStart: number,\n'
                                   '  lineEnd: number,\n'
                                   '  largestSize: number,\n'
                               

In [44]:
context = [item['metadata']['text'] for item in top_matches['matches']]

In [45]:
context

['src/context/language/javascript-parser.ts\n\nimport { AbstractParser, EnclosingContext } from "../../constants";\nimport * as parser from "@babel/parser";\nimport traverse, { NodePath, Node } from "@babel/traverse";\n\nconst processNode = (\n  path: NodePath<Node>,\n  lineStart: number,\n  lineEnd: number,\n  largestSize: number,\n  largestEnclosingContext: Node | null\n) => {\n  const { start, end } = path.node.loc;\n  if (start.line <= lineStart && lineEnd <= end.line) {\n    const size = end.line - start.line;\n    if (size > largestSize) {\n      largestSize = size;\n      largestEnclosingContext = path.node;\n    }\n  }\n  return { largestSize, largestEnclosingContext };\n};\n\nexport class JavascriptParser implements AbstractParser {\n  findEnclosingContext(\n    file: string,\n    lineStart: number,\n    lineEnd: number\n  ): EnclosingContext {\n    const ast = parser.parse(file, {\n      sourceType: "module",\n      plugins: ["jsx", "typescript"], // To allow JSX and TypeScri

In [46]:
augumented_query = "<CONTEXT>\n" + "\n\n-----------------\n\n".join(context) + "\n--------------\n</CONTEXT>\n\n\n\nMY QUESTION:\n" + query

In [48]:
print(augumented_query)

<CONTEXT>
src/context/language/javascript-parser.ts

import { AbstractParser, EnclosingContext } from "../../constants";
import * as parser from "@babel/parser";
import traverse, { NodePath, Node } from "@babel/traverse";

const processNode = (
  path: NodePath<Node>,
  lineStart: number,
  lineEnd: number,
  largestSize: number,
  largestEnclosingContext: Node | null
) => {
  const { start, end } = path.node.loc;
  if (start.line <= lineStart && lineEnd <= end.line) {
    const size = end.line - start.line;
    if (size > largestSize) {
      largestSize = size;
      largestEnclosingContext = path.node;
    }
  }
  return { largestSize, largestEnclosingContext };
};

export class JavascriptParser implements AbstractParser {
  findEnclosingContext(
    file: string,
    lineStart: number,
    lineEnd: number
  ): EnclosingContext {
    const ast = parser.parse(file, {
      sourceType: "module",
      plugins: ["jsx", "typescript"], // To allow JSX and TypeScript
    });
    let large

In [54]:
# System prompt: sets the persona and tells the model to answer from the
# retrieved context.
system_prompt = """You are a Senior Software Engineer, who is an expert in TypeScript.

Answer the question I have about the codebase based on the context provided.
Always consider all of the context provided to answer my question"""


# Send the context-augmented question to the model through the OpenRouter client.
llm_response = client.chat.completions.create(
    model="qwen/qwen2.5-vl-32b-instruct:free",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": augumented_query}
    ]
)

# Keep only the text of the first returned choice.
response = llm_response.choices[0].message.content


In [55]:
print(response)

The JavaScript parser, defined in `src/context/language/javascript-parser.ts`, is used to analyze and extract meaningful context from JavaScript (and TypeScript) files based on the changes made in a pull request. Here's a detailed breakdown of how it is used:

### 1. **Implementation of `AbstractParser` Interface**
The `JavascriptParser` class implements the `AbstractParser` interface, which defines two methods:
- `findEnclosingContext(file: string, lineStart: number, lineEnd: number): EnclosingContext`
- `dryRun(file: string): { valid: boolean; error: string }`

#### `findEnclosingContext`:
This method is responsible for finding the largest enclosing context (e.g., a function or interface) that contains the lines of interest (`lineStart` to `lineEnd`). It uses the Babel parser and traverser to analyze the Abstract Syntax Tree (AST) of the file.

- **Steps**:
  1. Parses the file into an AST using `@babel/parser`.
  2. Traverses the AST to find nodes (e.g., `Function` or `TSInterfaceDe

In [56]:
# Same system prompt as the previous request; only the model differs below,
# so the two answers can be compared.
system_prompt = """You are a Senior Software Engineer, who is an expert in TypeScript.

Answer the question I have about the codebase based on the context provided.
Always consider all of the context provided to answer my question"""


# Re-run the same context-augmented question against a different model.
llm_response = client.chat.completions.create(
    model="openai/o4-mini-high",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": augumented_query}
    ]
)

# Keep only the text of the first returned choice.
response = llm_response.choices[0].message.content


In [57]:
print(response)

In our code-review pipeline the JavascriptParser is really just a little wrapper around Babel that lets us carve your diff up by “what function (or interface) did I actually change?” instead of dumping you the whole  file or a raw git hunk.

Here’s the high-level flow of how it’s used:

1.  getParserForExtension (in constants.ts)  
    we register one single JavascriptParser instance for the extensions  
    `.js, .jsx, .ts, .tsx`.  

2.  smarterContextPatchStrategy (in src/context/review.ts)  
    when you ask for a “smart” diff context, we call  
    getParserForExtension(filename) → your JavascriptParser.  

3.  diffContextPerHunk(file, parser)  
    • we apply the patch to get the “updated” file text  
    • pull out each git hunk via `diff.parsePatch`  
    • for each hunk, compute the range of the *new* lines that were inserted  
    • **call** `parser.findEnclosingContext(updatedFile, lineStart, lineEnd)`  

4.  JavascriptParser.findEnclosingContext  
    • does a `@babel/parser

In [59]:
def perform_rag(query, model="qwen/qwen2.5-vl-32b-instruct:free",
                namespace="https://github.com/CoderAgent/SecureAgent", top_k=5):
    """Answer a question about the indexed codebase and print the answer.

    Embeds ``query``, retrieves the closest stored files from Pinecone, wraps
    them in a ``<CONTEXT>`` block followed by the question, and sends that to
    the chat model.

    Args:
        query: The question to ask about the codebase.
        model: Model identifier passed to the chat-completions endpoint.
        namespace: Pinecone namespace to search; defaults to the namespace the
            SecureAgent repository was uploaded under.
        top_k: Number of nearest matches to include as context.

    Returns:
        None. The model's answer is printed.
    """
    query_embedding = get_huggingface_embeddings(query)

    top_matches = pinecone_index.query(
        vector=query_embedding.tolist(),
        top_k=top_k,
        include_metadata=True,
        namespace=namespace,
    )

    # The uploaded file text is stored under the "text" metadata key.
    context = [item['metadata']['text'] for item in top_matches['matches']]

    augmented_query = (
        "<CONTEXT>\n"
        + "\n\n-----------------\n\n".join(context)
        + "\n--------------\n</CONTEXT>\n\n\n\nMY QUESTION:\n"
        + query
    )

    # Built from adjacent literals so the function's indentation does not leak
    # into the prompt text (a triple-quoted string here would send leading
    # spaces on every line after the first).
    system_prompt = (
        "You are a Senior Software Engineer, who is an expert in TypeScript.\n"
        "\n"
        "Answer the question I have about the codebase based on the context provided.\n"
        "Always consider all of the context provided to answer my question"
    )

    llm_response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": augmented_query}
        ]
    )

    response = llm_response.choices[0].message.content

    print(response)


In [60]:
perform_rag("How is the javascript parser used?")

The JavaScript parser (`JavascriptParser`) is used to analyze and extract meaningful context from JavaScript files (including TypeScript and JSX files) based on the changes made in a pull request. Here's a detailed breakdown of how it is used:

### 1. **Implementation of `AbstractParser` Interface**
The `JavascriptParser` class implements the `AbstractParser` interface, which defines two methods:
- `findEnclosingContext(file: string, lineStart: number, lineEnd: number): EnclosingContext`
- `dryRun(file: string): { valid: boolean; error: string }`

#### `findEnclosingContext`:
This method is responsible for finding the largest enclosing context (e.g., a function or interface) that contains the lines of interest (`lineStart` to `lineEnd`). It uses the Babel parser and traverser to analyze the Abstract Syntax Tree (AST) of the file.

- **Steps**:
  1. Parse the file into an AST using `@babel/parser`.
  2. Traverse the AST using `@babel/traverse` to find nodes (e.g., `Function` or `TSInter