![Img](https://app.theheadstarter.com/static/hs-logo-opengraph.png)

# Headstarter Codebase RAG Project

![Screenshot 2024-11-25 at 7 12 58 PM](https://github.com/user-attachments/assets/0bd67cf0-43d5-46d2-879c-a752cae4c8e3)

# Install Necessary Libraries

In [1]:
! pip install pygithub langchain langchain-community openai tiktoken pinecone-client langchain_pinecone sentence-transformers

Collecting pygithub
  Downloading PyGithub-2.5.0-py3-none-any.whl.metadata (3.9 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.9-py3-none-any.whl.metadata (2.9 kB)
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting langchain_pinecone
  Downloading langchain_pinecone-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting pynacl>=1.4.0 (from pygithub)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain
  Downloading langchain-0.3.9-py

In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from google.colab import userdata
from pinecone import Pinecone
import os
import tempfile
from github import Github, Repository
from git import Repo
from openai import OpenAI
from pathlib import Path
from langchain.schema import Document
from pinecone import Pinecone

  from tqdm.autonotebook import tqdm, trange


# Clone a GitHub Repo locally

In [3]:
# Function to clone repo into env
def clone_repo(repo_url):
    """Clone a Git repository into the Colab ``/content`` directory.

    Args:
        repo_url (str): URL of the repository to clone, e.g.
            ``"https://github.com/owner/project"``. A trailing slash or a
            ``.git`` suffix is tolerated.

    Returns:
        str: The repository name (last path segment of the URL), which is
        also the name of the directory created under ``/content``.

    Any error raised by ``Repo.clone_from`` propagates to the caller.
    """
    # Split on "/" and keep the last segment. Strip trailing slashes first so
    # ".../project/" does not yield an empty name (which would make the clone
    # target "/content/" itself).
    repo_name = repo_url.rstrip("/").split("/")[-1]
    # Drop a ".git" suffix so the directory is named after the project.
    if repo_name.endswith(".git"):
        repo_name = repo_name[: -len(".git")]
    repo_path = f"/content/{repo_name}"
    Repo.clone_from(repo_url, repo_path)
    return repo_name

In [4]:
clone_repo("https://www.github.com/CoderAgent/SecureAgent")

'SecureAgent'

In [5]:
path = '/content/SecureAgent'

In [6]:
# File extensions (with the leading dot, as returned by os.path.splitext)
# of the source files whose content is collected from the repository
SUPPORTED_EXTENSIONS = ['.py', '.java', '.cpp', '.tsx', '.ts', '.jsx', '.js']
# Names used to skip directories while walking the repository
IGNORED_DIRS = ['.git', 'node_modules', 'dist', '__pycache__', '.next', '.vscode', 'env', 'venv']

In [19]:
# function to extract content
def get_file_content(file_path, repo_path):
    """
    Read one file and pair its text with its repository-relative path.

    Args:
        file_path (str): Path to the file to read; it is decoded as UTF-8.
        repo_path (str): Repository root that the returned name is made
            relative to.

    Returns:
        Optional[Dict[str, str]]: ``{"name": <relative path>, "content": <text>}``
        on success, or ``None`` if any exception occurred (the error is
        printed rather than raised).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as source:
            text = source.read()
        # Name the file by its location inside the repository, not by its
        # absolute path on this machine
        return {"name": os.path.relpath(file_path, repo_path), "content": text}
    except Exception as err:
        # Best effort: report the problem and let the caller skip this file
        print(f"Error processing file {file_path}: {err}")
        return None

In [20]:
def get_main_files_content(repo_path: str):
    """
    Collect the contents of all supported source files in a repository.

    Walks ``repo_path`` recursively, never descending into a directory whose
    name is listed in ``IGNORED_DIRS``, and reads every file whose extension
    is listed in ``SUPPORTED_EXTENSIONS``.

    Args:
        repo_path (str): Root directory of the repository on disk.

    Returns:
        list: One dictionary per file, as returned by ``get_file_content``
        (``{"name": ..., "content": ...}``). Files for which
        ``get_file_content`` returns ``None`` are left out. If an exception
        is raised during the walk it is printed and whatever was collected
        so far is returned.
    """
    files_content = []

    try:
        for root, dirs, files in os.walk(repo_path):
            # Prune ignored directories in place so os.walk never descends
            # into them. Names are compared exactly: a substring test on the
            # whole path would also skip e.g. "environment/" (contains "env")
            # or every file when repo_path itself contains an ignored name.
            dirs[:] = [d for d in dirs if d not in IGNORED_DIRS]

            # Process each supported file in the current directory
            for file in files:
                if os.path.splitext(file)[1] not in SUPPORTED_EXTENSIONS:
                    continue
                file_content = get_file_content(os.path.join(root, file), repo_path)
                if file_content:
                    files_content.append(file_content)

    except Exception as e:
        print(f"Error reading repository: {str(e)}")

    return files_content

In [21]:
file_content = get_main_files_content(path)
file_content

[{'name': 'src/env.ts',
  'content': 'import * as dotenv from "dotenv";\nimport { createPrivateKey } from "crypto";\nimport chalk from "chalk";\n\ndotenv.config();\n\nexport const env = {\n  GITHUB_APP_ID: process.env.GITHUB_APP_ID,\n  GITHUB_PRIVATE_KEY: process.env.GITHUB_PRIVATE_KEY,\n  GITHUB_WEBHOOK_SECRET: process.env.GITHUB_WEBHOOK_SECRET,\n  GROQ_API_KEY: process.env.GROQ_API_KEY,\n} as const;\n\nlet valid = true;\n\nfor (const key in env) {\n  if (!env[key as keyof typeof env]) {\n    console.log(\n      chalk.red("✖") +\n        chalk.gray(" Missing required env var: ") +\n        chalk.bold(`process.env.${key}`)\n    );\n    valid = false;\n  }\n}\n\ntry {\n  createPrivateKey(env.GITHUB_PRIVATE_KEY);\n} catch (error) {\n  console.log(\n    chalk.red(\n      "\\n✖ Invalid GitHub private key format for " +\n        chalk.bold(`process.env.GITHUB_PRIVATE_KEY`) +\n        "\\n"\n    ) +\n      chalk.gray("  • Must start with: ") +\n      chalk.bold("-----BEGIN RSA PRIVATE KEY---

# Embeddings

In [22]:
from functools import lru_cache


@lru_cache(maxsize=4)
def _load_sentence_transformer(model_name):
    """Build the SentenceTransformer for ``model_name`` once and reuse it."""
    return SentenceTransformer(model_name)


def get_huggingface_embeddings(text, model_name="sentence-transformers/all-mpnet-base-v2"):
    """Embed ``text`` with a sentence-transformers model.

    The model object is cached per ``model_name`` so that repeated calls
    (e.g. one per user query) do not construct the model again each time.

    Args:
        text: Input passed straight to ``SentenceTransformer.encode``.
        model_name (str): Name of the sentence-transformers model to use.

    Returns:
        The result of ``model.encode(text)``; for a single string the
        notebook output shows a NumPy array of floats.
    """
    return _load_sentence_transformer(model_name).encode(text)

In [24]:
text = 'I am a programmer'
get_huggingface_embeddings(text)

array([ 1.81737728e-02, -3.02659534e-03, -4.77465056e-02,  1.86379589e-02,
        3.14538032e-02,  1.87255573e-02, -1.52534526e-02, -6.77293763e-02,
       -1.26904100e-02,  1.28427539e-02,  5.80701083e-02,  4.00234982e-02,
        3.27073336e-02,  7.12998286e-02,  5.56373261e-02,  1.68628842e-02,
        6.97604120e-02, -5.02619967e-02,  6.13144785e-03, -1.46559263e-02,
       -4.51954128e-03,  4.82934080e-02, -2.53051482e-02, -1.97865185e-03,
       -4.36902903e-02, -2.41507702e-02,  1.29505172e-02, -3.78603698e-03,
       -2.05719229e-02,  1.09819129e-01,  3.07674590e-03, -2.80443188e-02,
       -1.55807380e-02, -1.24790529e-02,  1.75239245e-06, -2.93755950e-03,
       -1.43048624e-02,  4.88386266e-02, -6.21115230e-02,  2.95061693e-02,
       -1.40470909e-02,  2.20707953e-02,  1.13067841e-02,  4.70892675e-02,
        7.58306822e-03, -8.30540375e-05,  6.67821839e-02, -1.21320244e-02,
        4.39395290e-03,  2.47454122e-02,  1.02529097e-02, -6.54437998e-03,
       -5.53141721e-03, -

# Setting up Pinecone
**1. Create an account on [Pinecone.io](https://app.pinecone.io/)**

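**2. Create a new index called "codebase-rag" and set the dimensions to 768 (the length of the vectors produced by the `all-mpnet-base-v2` embedding model used in this notebook). Leave the rest of the settings as they are.**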

![Screenshot 2024-11-24 at 10 58 50 PM](https://github.com/user-attachments/assets/f5fda046-4087-432a-a8c2-86e061005238)



**3. Create an API Key for Pinecone**

![Screenshot 2024-11-24 at 10 44 37 PM](https://github.com/user-attachments/assets/e7feacc6-2bd1-472a-82e5-659f65624a88)


**4. Store your Pinecone API Key within Google Colab's secrets section, and then enable access to it (see the blue checkmark)**

![Screenshot 2024-11-24 at 10 45 25 PM](https://github.com/user-attachments/assets/eaf73083-0b5f-4d17-9e0c-eab84f91b0bc)



In [25]:
# Read the Pinecone API key once from Colab secrets and also expose it as an
# environment variable.
# NOTE(review): presumably PineconeVectorStore reads PINECONE_API_KEY from the
# environment — confirm against the langchain_pinecone docs
pinecone_api_key = userdata.get("PINECONE_API_KEY")
os.environ['PINECONE_API_KEY'] = pinecone_api_key

# Initialize Pinecone with the key fetched above (no second secret lookup)
pc = Pinecone(api_key=pinecone_api_key)

# Connect to your Pinecone index
pinecone_index = pc.Index("codebase-rag")

In [26]:
# Wrap the existing "codebase-rag" index as a LangChain vector store.
# NOTE(review): HuggingFaceEmbeddings() is created with its default model —
# presumably the same 768-dimensional model used by
# get_huggingface_embeddings; confirm
vectorstore = PineconeVectorStore(index_name="codebase-rag", embedding=HuggingFaceEmbeddings())

  vectorstore = PineconeVectorStore(index_name="codebase-rag", embedding=HuggingFaceEmbeddings())
  vectorstore = PineconeVectorStore(index_name="codebase-rag", embedding=HuggingFaceEmbeddings())


In [31]:
# Insert the codebase embeddings into Pinecone
# Each file becomes one Document whose text is "<relative path>\n<content>";
# the relative path is also attached as "source" metadata so that it is
# stored alongside the vector (previously the metadata dict was built but
# never passed to the Document).
documents = []
for file in file_content:
    metadata = {"source": file['name']}
    document = Document(
        page_content=f"{file['name']}\n{file['content']}",
        metadata=metadata,
    )
    documents.append(document)

# Embed the documents and upsert them under a namespace named after the repo
vectorstore = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=HuggingFaceEmbeddings(),
    index_name="codebase-rag",
    namespace="https://github.com/CoderAgent/SecureAgent"
)


  embedding=HuggingFaceEmbeddings(),


# Perform RAG

1. Get your Groq API Key [here](https://console.groq.com/keys)

2. Paste your Groq API Key into your Google Colab secrets, and make sure to enable permissions for it

![Screenshot 2024-11-25 at 12 00 16 AM](https://github.com/user-attachments/assets/e5525d29-bca6-4dbd-892b-cc770a6b281d)


In [32]:
# The OpenAI client is pointed at Groq's OpenAI-style endpoint and
# authenticated with the Groq API key stored in Colab secrets
client = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=userdata.get("GROQ_API_KEY")
)

In [33]:
query = "How is the Javascript parser used?"

In [34]:
query_embedding = get_huggingface_embeddings(query)

In [37]:
# search for relevant vectors
# Ask the index for the 5 stored vectors closest to the query embedding,
# restricted to the namespace the documents were inserted under.
# include_metadata=True returns each match's metadata (which holds the
# stored text) along with its id.
top_matches = pinecone_index.query(
    vector=query_embedding.tolist(),  # convert the embedding array to a plain list
    top_k=5,
    include_metadata=True,
    namespace = "https://github.com/CoderAgent/SecureAgent"
)

In [38]:
top_matches

{'matches': [{'id': '0f2afeab-1528-4d6c-9e66-3d54a3734679',
              'metadata': {'text': 'src/context/language/javascript-parser.ts\n'
                                   'import { AbstractParser, EnclosingContext '
                                   '} from "../../constants";\n'
                                   'import * as parser from "@babel/parser";\n'
                                   'import traverse, { NodePath, Node } from '
                                   '"@babel/traverse";\n'
                                   '\n'
                                   'const processNode = (\n'
                                   '  path: NodePath<Node>,\n'
                                   '  lineStart: number,\n'
                                   '  lineEnd: number,\n'
                                   '  largestSize: number,\n'
                                   '  largestEnclosingContext: Node | null\n'
                                   ') => {\n'
                             

In [39]:
# Pull the stored document text out of each match's metadata
contexts = [item['metadata']['text'] for item in top_matches['matches']]
# Bare expression: shows the list as the cell output
contexts

['src/context/language/javascript-parser.ts\nimport { AbstractParser, EnclosingContext } from "../../constants";\nimport * as parser from "@babel/parser";\nimport traverse, { NodePath, Node } from "@babel/traverse";\n\nconst processNode = (\n  path: NodePath<Node>,\n  lineStart: number,\n  lineEnd: number,\n  largestSize: number,\n  largestEnclosingContext: Node | null\n) => {\n  const { start, end } = path.node.loc;\n  if (start.line <= lineStart && lineEnd <= end.line) {\n    const size = end.line - start.line;\n    if (size > largestSize) {\n      largestSize = size;\n      largestEnclosingContext = path.node;\n    }\n  }\n  return { largestSize, largestEnclosingContext };\n};\n\nexport class JavascriptParser implements AbstractParser {\n  findEnclosingContext(\n    file: string,\n    lineStart: number,\n    lineEnd: number\n  ): EnclosingContext {\n    const ast = parser.parse(file, {\n      sourceType: "module",\n      plugins: ["jsx", "typescript"], // To allow JSX and TypeScript

In [43]:
# Build the prompt: the retrieved contexts (at most 10), separated by dashed
# rules and wrapped in <CONTEXT> tags, followed by the user's question
augmented_query = "".join([
    "<CONTEXT>\n",
    "\n\n-------\n\n".join(contexts[:10]),
    "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n",
    query,
])
print(augmented_query)

<CONTEXT>
src/context/language/javascript-parser.ts
import { AbstractParser, EnclosingContext } from "../../constants";
import * as parser from "@babel/parser";
import traverse, { NodePath, Node } from "@babel/traverse";

const processNode = (
  path: NodePath<Node>,
  lineStart: number,
  lineEnd: number,
  largestSize: number,
  largestEnclosingContext: Node | null
) => {
  const { start, end } = path.node.loc;
  if (start.line <= lineStart && lineEnd <= end.line) {
    const size = end.line - start.line;
    if (size > largestSize) {
      largestSize = size;
      largestEnclosingContext = path.node;
    }
  }
  return { largestSize, largestEnclosingContext };
};

export class JavascriptParser implements AbstractParser {
  findEnclosingContext(
    file: string,
    lineStart: number,
    lineEnd: number
  ): EnclosingContext {
    const ast = parser.parse(file, {
      sourceType: "module",
      plugins: ["jsx", "typescript"], // To allow JSX and TypeScript
    });
    let larges

In [48]:
# Plain (non-f) string: the prompt has no placeholders, and a regular string
# stays valid if literal braces (e.g. code snippets) are added to it later
system_prompt = """
  You are a senior software engineer, specializing in Typescript.
  Answer any questions I have on the codebase, based on the context provided.
  Always consider all of the provided context when forming a response.
  Let's think through each step. Verify each step.
"""


# Send the system prompt plus the context-augmented question to the model
llm_response = client.chat.completions.create(
    model="llama-3.1-8b-instant",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": augmented_query}
    ]
)

# Keep only the text of the first returned choice
response = llm_response.choices[0].message.content

In [49]:
print(response)

The Javascript parser is used in the `src/context/review.ts` file.

Specifically, it's used in the `smarterContextPatchStrategy` function. This function attempts to determine the best patch strategy based on the file extension of the PR file. 

If the file's extension is known to be one of the extensions supported by the Javascript parser (js, ts, jsx, or tsx), it uses the `functionContextPatchStrategy` function, which in turn uses the `JavascriptParser` class to identify the largest enclosing function context associated with a set of edit lines in the file. It then uses this information to construct a patch strategy that expands the context around the function scope.

Here is the relevant code snippet:

```typescript
export const smarterContextPatchStrategy = (file: PRFile) => {
  const parser: AbstractParser = getParserForExtension(file.filename);
  if (parser != null) {
    return functionContextPatchStrategy(file, parser);
  } else {
    return expandedPatchStrategy(file);
  }
};
`