<a href="https://colab.research.google.com/github/TannaPrasanthkumar/CodeGenerator-Using-RAG/blob/main/CodeGenerator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain langchain_core langchain_chroma langchain_google_genai langchain_huggingface

Collecting langchain
  Downloading langchain-0.2.14-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_core
  Downloading langchain_core-0.2.32-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain_chroma
  Downloading langchain_chroma-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting langchain_google_genai
  Downloading langchain_google_genai-1.0.8-py3-none-any.whl.metadata (3.8 kB)
Collecting langchain_huggingface
  Downloading langchain_huggingface-0.0.3-py3-none-any.whl.metadata (1.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.99-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain_core)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.me

In [None]:
import os
import langchain

In [None]:
from google.colab import userdata

In [None]:
# Repository to crawl, read from the Colab secret store; crawl_github_repo
# interpolates it into the GitHub API path.
GITHUB_REPO  = userdata.get("GITHUB_REPO")

# GitHub token, read from the Colab secret store; sent as a Bearer credential
# on the GitHub API requests.
GITHUB_ACCESS_TOKEN = userdata.get("GITHUB_ACCESS_TOKEN")

In [None]:
import requests
import time

def crawl_github_repo(repo_url, github_token, extensions=('.py', '.ipynb'),
                      ignore_list=('__init__.py',), timeout=30):
    """Recursively collect the GitHub page URLs of code files in a repository.

    Starting from the repository root, each directory is listed through the
    GitHub REST "contents" endpoint and sub-directories are walked depth-first.

    Args:
        repo_url: Repository path segment placed after ``/repos/`` in the API
            URL (for example ``"owner/name"``).
        github_token: Token sent as a ``Bearer`` credential. When empty or
            ``None`` the ``Authorization`` header is omitted instead of
            sending an invalid credential.
        extensions: File-name suffixes to keep.
        ignore_list: Exact file names to skip.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        list[str]: ``html_url`` of every matching file, in listing order, with
        each sub-directory expanded in place.

    Raises:
        requests.HTTPError: If a listing request returns an error status.
    """
    api_url = f"https://api.github.com/repos/{repo_url}/contents"
    headers = {"Accept": "application/vnd.github.v3+json"}
    if github_token:
        headers["Authorization"] = f"Bearer {github_token}"

    suffixes = tuple(extensions)  # str.endswith accepts a tuple, not a list
    ignored = set(ignore_list)

    def get_files(url):
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        files = []
        for item in response.json():
            name = item['name']
            if item['type'] == 'file':
                if name not in ignored and name.endswith(suffixes):
                    files.append(item['html_url'])
            elif item['type'] == 'dir':
                files.extend(get_files(item['url']))
                time.sleep(0.1)  # Avoid hitting rate limits
        return files

    return get_files(api_url)

# Crawl the configured repository once; the resulting URL list is used by the
# cells that follow.
code_files_urls = crawl_github_repo(GITHUB_REPO, GITHUB_ACCESS_TOKEN)


In [None]:
# Show only the first few URLs; printing the whole list produces one very
# long line and the next cell already lists every URL on its own line.
print(code_files_urls[:5])

['https://github.com/GoogleCloudPlatform/generative-ai/blob/main/.github/workflows/issue_assigner/assign_issue.py', 'https://github.com/GoogleCloudPlatform/generative-ai/blob/main/conversation/data-store-status-checker/data_store_checker.ipynb', 'https://github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/embedding-similarity-visualization.ipynb', 'https://github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/intro-textemb-vectorsearch.ipynb', 'https://github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/intro_embeddings_tuning.ipynb', 'https://github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/intro_multimodal_embeddings.ipynb', 'https://github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/use-cases/outlier-detection/bq-vector-search-log-outlier-detection.ipynb', 'https://github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/vector-search-quickstart.ipynb', 'https://github.com/GoogleCloudPlatform/generative

In [None]:
# List every crawled URL on its own line.
for url in code_files_urls:
  print(url)

https://github.com/GoogleCloudPlatform/generative-ai/blob/main/.github/workflows/issue_assigner/assign_issue.py
https://github.com/GoogleCloudPlatform/generative-ai/blob/main/conversation/data-store-status-checker/data_store_checker.ipynb
https://github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/embedding-similarity-visualization.ipynb
https://github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/intro-textemb-vectorsearch.ipynb
https://github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/intro_embeddings_tuning.ipynb
https://github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/intro_multimodal_embeddings.ipynb
https://github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/use-cases/outlier-detection/bq-vector-search-log-outlier-detection.ipynb
https://github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/vector-search-quickstart.ipynb
https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/chat-

In [None]:
# Number of code-file URLs returned by the crawl.
print(len(code_files_urls))

220


In [None]:
# Persist the crawled URLs to a text file, one URL per line.
with open('code_files_urls.txt', 'w') as urls_file:
    for url in code_files_urls:
        print(url, file=urls_file)

In [None]:
import requests
import nbformat
import json

In [None]:
import requests
import nbformat
from langchain.schema import Document

def extract_python_code_from_ipynb(github_url, cell_type="code", timeout=30):
    """Download a notebook from GitHub and concatenate the source of its cells.

    Args:
        github_url: GitHub page URL of the notebook
            (``https://github.com/<owner>/<repo>/blob/<ref>/<path>``).
        cell_type: Only cells whose ``cell_type`` equals this value are kept.
        timeout: Seconds to wait for the download.

    Returns:
        str: Source of every selected cell, each followed by a newline; an
        empty string when no cell matches.

    Raises:
        requests.HTTPError: If the download returns an error status.
    """
    # Rewrite only the first occurrence of each marker so a file path that
    # itself contains "github.com" or "/blob/" is left intact.
    raw_url = (
        github_url
        .replace('github.com', 'raw.githubusercontent.com', 1)
        .replace('/blob/', '/', 1)
    )

    # Without a timeout a stalled download would block the whole ingestion.
    response = requests.get(raw_url, timeout=timeout)
    response.raise_for_status()

    # Convert to nbformat v4 so `.cells` exists even when the notebook was
    # saved in an older schema (NO_CONVERT would return it unchanged).
    notebook = nbformat.reads(response.text, as_version=4)

    return "".join(
        cell.source + "\n"
        for cell in notebook.cells
        if cell.cell_type == cell_type
    )


# Build one Document per crawled notebook. `file_index` is the position of the
# notebook's URL inside `code_files_urls`.
code_strings = []
skipped_urls = []

for i, file_url in enumerate(code_files_urls):
    # Only notebooks are ingested here; other crawled files are left out.
    if not file_url.endswith(".ipynb"):
        continue

    try:
        content = extract_python_code_from_ipynb(file_url, "code")
    except requests.RequestException as exc:
        # One unreachable notebook should not discard the rest of the batch;
        # report it and carry on.
        print(f"Skipping {file_url}: {exc}")
        skipped_urls.append(file_url)
        continue

    doc = Document(page_content=content, metadata={"url": file_url, "file_index": i})
    code_strings.append(doc)

print(f"Loaded {len(code_strings)} notebooks, skipped {len(skipped_urls)}")

# Preview the first document; guarded so an empty result does not raise IndexError.
if code_strings:
    print(code_strings[0])

  validate(nb)


page_content='%pip install --upgrade google-cloud-discoveryengine humanize
import sys

if "google.colab" in sys.modules:
    from google.auth import default
    from google.colab import auth

    auth.authenticate_user()
    creds, _ = default()
else:
    # Otherwise, attempt to discover local credentials as described on https://cloud.google.com/docs/authentication/application-default-credentials
    pass
import humanize
import time
import re
from typing import List, Optional

from google.api_core.client_options import ClientOptions
from google.cloud import discoveryengine_v1beta as discoveryengine


def _call_list_documents(
    project_id: str, location: str, datastore_id: str, page_token: Optional[str] = None
) -> discoveryengine.ListDocumentsResponse:
    """Build the List Docs Request payload."""
    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )
    client = discoveryengin

In [None]:
# NOTE(review): this repeats the URL count printed earlier; if the number of
# loaded notebook documents was intended, that would be len(code_strings) — confirm.
print(len(code_files_urls))

220


In [None]:
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter

# The documents hold Python source, so use the Python-aware separator list
# (class/def boundaries are tried before blank lines and single newlines).
# The default separators are tuned for prose and tend to cut through function bodies.
text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=2000,     # maximum characters per chunk
    chunk_overlap=200,   # characters shared between neighbouring chunks
)

In [None]:
# Split every Document in code_strings with the splitter configured above.
chunks = text_splitter.split_documents(code_strings)

In [None]:
# Google API key read from the Colab secret store; passed to the embedding
# client and the LLM client below.
key = userdata.get("GOOGLE_API_KEY")

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Embedding client; later passed to Chroma as its embedding function.
embeddings = GoogleGenerativeAIEmbeddings(
    model = "models/embedding-001",
    google_api_key = key
)

In [None]:
!pip install langchain_chroma



In [None]:
from langchain_chroma import Chroma

In [None]:
import os
from datetime import datetime

# Create a unique directory name
# (timestamped to the second, so a re-run does not reuse the directory of a
# previous run; the /content prefix is a hard-coded absolute path)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
persistent_directory = f"/content/chroma_db_{timestamp}"

# Ensure the directory exists
os.makedirs(persistent_directory, exist_ok=True)

# Initialize Chroma
# The collection is created with the embedding client defined earlier and is
# persisted under the directory created above.
db = Chroma(
    collection_name="code_files",
    embedding_function=embeddings,
    persist_directory=persistent_directory
)

print(f"Chroma DB initialized at: {persistent_directory}")

Chroma DB initialized at: /content/chroma_db_20240816_060615


In [None]:
# Embed the chunks and store them in the collection. The call returns one id
# per stored chunk; keep them in a variable and report the count instead of
# echoing the whole id list as cell output.
chunk_ids = db.add_documents(chunks)
print(f"Indexed {len(chunk_ids)} chunks")

['fa2a2e71-70dd-4a75-95f1-87ee69c5c99b',
 'c9d2a00d-eaf8-40e7-bc94-d6c608cd5b50',
 'd01309a0-6b1e-48bb-9fae-78597cbe54a5',
 '73156ae8-29ae-4de0-a4dd-8377767e3421',
 'a0ffc1a2-e813-4f7c-b33d-e27e3083e15f',
 '81886bb1-b8e7-49f2-99ae-9e03aefccaf2',
 'bbfec423-02ac-4c81-a191-233c56b84402',
 '2ae62090-7633-421f-908f-f2693761c682',
 '4c9cada4-d7c1-4a44-bf80-b6764429ca56',
 '9648dade-4dd1-4b32-9e55-2f14e0b301c7',
 'c41f54e7-dcee-4dcd-ab26-4ae7ec4853f9',
 '8d15c150-22f6-4e49-a946-1a4ce711013d',
 '76168c0c-ea9a-42cd-933b-91d244547574',
 'aa9bc262-7fac-4d55-98f5-3e318a86c0bf',
 '97dbd7f2-249d-4870-8da5-8a8375d4f043',
 '85d955b8-04c2-4d79-867a-9991aa9f1801',
 'fd601af3-2f42-4889-aa3f-35e9db8339b7',
 'c03c090f-806c-43a8-b9e0-f6fa5927e041',
 'd451b584-d950-4a05-aeea-b15b88f28746',
 'cb793295-43c2-42e0-8535-b323f65eb978',
 '7e691688-bd5b-48e6-9bf7-7e3ad1ee062c',
 '95001cc9-9cba-4757-a2a9-4b002de42791',
 '086db96a-0501-448e-9131-4396502e374b',
 '34a721ea-cc2f-4f78-bf3e-a29aa75cffe3',
 '67ae925b-a078-

In [None]:
# Return the 2 most similar chunks per query. The result limit must be passed
# as `search_kwargs={"k": ...}` (lowercase "k"); a `kwargs=` argument is not a
# retriever option, so the limit would never be applied.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 2})

In [None]:
# Sample request used to exercise the chain.
# NOTE(review): "fucntion" looks like a typo of "function"; the string is sent as written.
question = "write a python fucntion that will take two numbers as parameters and return the sum"

In [None]:
from langchain_core.prompts import PromptTemplate

In [None]:
# Zero-shot prompt text: the only placeholder is {question}, so the model
# answers without any retrieved context.
prompt_zero_shot = """
    You are a proficient python developer. Respond with the syntactically correct & concise code for to the question below.

    Question:
    {question}

    Output Code :
    """

# Wrap the text in a PromptTemplate that expects a single "question" input.
prompt_prompt_zero_shot = PromptTemplate(
    template=prompt_zero_shot,
    input_variables=["question"],
)

In [None]:
from langchain_google_genai import GoogleGenerativeAI

In [None]:
from langchain.chains import RetrievalQA

In [None]:
# Retrieval-augmented prompt text. It has two placeholders, {question} and
# {context}, and lists four rules the generated code must follow.
prompt_RAG = """
    You are a proficient python developer. Respond with the syntactically correct code for to the question below. Make sure you follow these rules:
    1. Use context to understand the APIs and how to use it & apply.
    2. Do not add license information to the output code.
    3. Do not include colab code in the output.
    4. Ensure all the requirements in the question are met.

    Question:
    {question}

    Context:
    {context}

    Helpful Response :
    """

In [None]:
from langchain_core.output_parsers import StrOutputParser

# Last step of the chain; StrOutputParser passes the model output on as a string.
output_parser = StrOutputParser()

In [None]:
from langchain_google_genai import GoogleGenerativeAI

# LLM used for code generation, authenticated with the same API key as the
# embedding client; temperature is set to 0.2.
model = GoogleGenerativeAI(
    model = "models/gemini-1.5-flash",
    temperature = 0.2,
    google_api_key = key
)

In [None]:
from langchain_core.runnables import RunnablePassthrough


def _format_docs(docs):
    """Join the text of the retrieved chunks into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# Template built from the RAG prompt text, which has {question} and {context}.
prompt_template_rag = PromptTemplate(
    input_variables=["question", "context"],
    template=prompt_RAG,
)

# Retrieval-augmented chain. The retrieved chunks must flow into a template
# that actually contains {context}; the zero-shot template has no such slot,
# so piping the retriever into it silently discards everything retrieved.
chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt_template_rag
    | model
    | output_parser
)

# Baseline without retrieval, kept for side-by-side comparison.
zero_shot_chain = (
    {"question": RunnablePassthrough()}
    | prompt_prompt_zero_shot
    | model
    | output_parser
)

In [None]:
# Same question string as assigned earlier in the notebook, set again right
# before the chain is invoked.
# NOTE(review): "fucntion" looks like a typo of "function"; the string is sent as written.
question = "write a python fucntion that will take two numbers as parameters and return the sum"

In [None]:
# Run the chain on the question; the final step is the string output parser.
response = chain.invoke(question)

In [None]:
# Show the generated answer.
print(response)

```python
def sum_two_numbers(num1, num2):
  """Returns the sum of two numbers."""
  return num1 + num2
```
