<a href="https://colab.research.google.com/github/Sruthij93/Codebase-RAG/blob/main/SJ_Codebase_RAG_Tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

![Img](https://app.theheadstarter.com/static/hs-logo-opengraph.png)

# Headstarter Codebase RAG Project

![Screenshot 2024-11-25 at 7 12 58 PM](https://github.com/user-attachments/assets/0bd67cf0-43d5-46d2-879c-a752cae4c8e3)

# Install Necessary Libraries

In [1]:
! pip install pygithub langchain langchain-community openai tiktoken pinecone-client langchain_pinecone sentence-transformers

Collecting pygithub
  Downloading PyGithub-2.5.0-py3-none-any.whl.metadata (3.9 kB)
Collecting langchain
  Downloading langchain-0.3.9-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.8-py3-none-any.whl.metadata (2.9 kB)
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-macosx_10_9_x86_64.whl.metadata (6.6 kB)
Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting langchain_pinecone
  Downloading langchain_pinecone-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting pynacl>=1.4.0 (from pygithub)
  Downloading PyNaCl-1.5.0-cp36-abi3-macosx_10_10_universal2.whl.metadata (8.7 kB)
Collecting pyjwt>=2.4.0 (from pyjwt[crypto]>=2.4.0->pygithub)
  Downloading PyJWT-2.10.1-py3-none-any.whl.metadata (4.0 kB)
Collecting Deprecated (from pygithub)
  Downloading Deprecated-1

In [3]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
# from google.colab import userdata
from pinecone import Pinecone
import os
import tempfile
from github import Github, Repository
from git import Repo
from openai import OpenAI
from pathlib import Path
from langchain.schema import Document
from pinecone import Pinecone

# Clone a GitHub Repo locally

In [7]:
def clone_repo(repo_url):
  repo_name = repo_url.split("/")[-1]
  repo_path = f"/Users/sruthi/Documents/My projects/HEADSTARTER/Codebase_RAG/Codebase-RAG/{repo_name}"
  Repo.clone_from(repo_url, str(repo_path))
  return str(repo_name)

In [50]:
path = clone_repo("https://github.com/Sruthij93/Movie-Recommendation.git")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [59]:
path = "/Users/sruthi/Documents/My projects/HEADSTARTER/Codebase_RAG/Codebase-RAG/Movie-Recommendation"

In [60]:
SUPPORTED_EXTENSIONS = [".py", ".js", ".tsx", ".ts", ".java",".cpp"]

#directories to be ignored. do not commit node_modules, .git for eg
IGNORED_DIRS = [".git", "node_modules", "dist", "__pycache__", ".next", ".vscode", ".env", "venv"]

In [62]:
def get_file_content(file_path, repo_path):
  #error handling
  try:

    with open(file_path, "r", encoding = "utf-8") as f:
      content = f.read()

      rel_path = os.path.relpath(file_path, repo_path)

      return {
          "name": rel_path,
          "content": content
      }

  except Exception as e:
    print(f"Error reading file {file_path} : {e}")
    return None


In [63]:
def get_main_files_content(repo_path: str):
   """
   Get content of supported code files from the local repository.


   Args:
       repo_path: Path to the local repository


   Returns:
       List of dictionaries containing file names and contents
   """
   files_content = []


   try:
       for root, _, files in os.walk(repo_path):
           # Skip if current directory is in ignored directories
           if any(ignored_dir in root for ignored_dir in IGNORED_DIRS):
               continue


           # Process each file in current directory
           for file in files:
               file_path = os.path.join(root, file)
               if os.path.splitext(file)[1] in SUPPORTED_EXTENSIONS:
                   file_content = get_file_content(file_path, repo_path)
                   if file_content:
                       files_content.append(file_content)


   except Exception as e:
       print(f"Error reading repository: {str(e)}")


   return files_content

In [64]:
file_content = get_main_files_content(path)

In [65]:
file_content

[{'name': 'app.py',
  'content': 'import streamlit as st\nimport pickle\nimport requests\n\ndef recommend(movie):\n    movie_index = movies[movies[\'title\'] == movie].index[0]\n    distances = similarity[movie_index]\n    rec_movies_list = sorted(list(enumerate(distances)), reverse = True, key=lambda x:x[1])[1:6]\n\n    recommended_movies = []\n    recommended_movie_posters = []\n    for i in rec_movies_list:\n        movie_id = movies.iloc[i[0]].movie_id\n        recommended_movie_posters.append(fetch_poster(movie_id))\n        recommended_movies.append(movies.iloc[i[0]].title)\n    return recommended_movies, recommended_movie_posters    \n\n\ndef fetch_poster(movie_id):\n    response = requests.get(\'https://api.themoviedb.org/3/movie/{}?api_key=464b2949746d666c96b76e01946ecd3c&language=en-US\'.format(movie_id))\n    data = response.json()\n    poster = "https://image.tmdb.org/t/p/w500/" + data[\'poster_path\']\n    return poster\n\n\nmovies= pickle.load(open(\'movies.pkl\', \'rb\')

# Embeddings

In [16]:
def get_huggingface_embeddings(text, model_name="sentence-transformers/all-mpnet-base-v2"):
    model = SentenceTransformer(model_name)
    return model.encode(text)

In [17]:
text = "This is a sample sentence"
embeddings = get_huggingface_embeddings(text)

In [19]:
embeddings

array([ 3.38363498e-02, -8.39074925e-02, -3.01863756e-02, -1.50362533e-02,
       -8.36342573e-02,  2.67099682e-02, -9.17350501e-03,  3.30602913e-03,
       -4.73221242e-02, -1.27413543e-02,  8.20041969e-02,  2.05611363e-02,
        1.98337436e-02, -3.27860676e-02,  1.76552739e-02, -6.64511546e-02,
        5.29538579e-02,  2.12427564e-02, -4.35398258e-02,  2.42592096e-02,
        3.84507887e-03,  1.64461117e-02,  3.46779972e-02,  2.50417646e-02,
        2.40206830e-02, -3.28407474e-02,  5.68538811e-03, -2.87025087e-02,
       -6.85416535e-03, -3.71620618e-02,  1.50978519e-02, -1.02681667e-02,
       -3.70138395e-03, -1.02162920e-01,  1.71727072e-06, -3.04961437e-03,
       -8.88463017e-03, -1.37526803e-02, -5.74572831e-02,  3.62290516e-02,
        4.53163497e-03,  6.52434677e-02, -1.65316102e-03,  5.79803251e-02,
       -1.97080281e-02,  2.01019887e-02,  5.05361110e-02,  3.05137690e-02,
       -3.38397026e-02,  5.72957732e-02, -4.07249387e-03, -1.28480140e-02,
       -4.37915809e-02, -

# Setting up Pinecone
**1. Create an account on [Pinecone.io](https://app.pinecone.io/)**

**2. Create a new index called "codebase-rag" and set the dimensions to 768. Leave the rest of the settings as they are.**

![Screenshot 2024-11-24 at 10 58 50 PM](https://github.com/user-attachments/assets/f5fda046-4087-432a-a8c2-86e061005238)



**3. Create an API Key for Pinecone**

![Screenshot 2024-11-24 at 10 44 37 PM](https://github.com/user-attachments/assets/e7feacc6-2bd1-472a-82e5-659f65624a88)


**4. Store your Pinecone API Key within Google Colab's secrets section, and then enable access to it (see the blue checkmark)**

![Screenshot 2024-11-24 at 10 45 25 PM](https://github.com/user-attachments/assets/eaf73083-0b5f-4d17-9e0c-eab84f91b0bc)



In [20]:
! pip install python-dotenv
! pip install --upgrade pip

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pip
  Downloading pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-24.3.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.0
    Uninstalling pip-24.0:
      Successfully uninstalled pip-24.0
Successfully installed pip-24.3.1


In [21]:
from dotenv import load_dotenv
load_dotenv()
pinecone_api_key= os.getenv("PINECONE_API_KEY")

In [23]:
# Set the PINECONE_API_KEY as an environment variable
# pinecone_api_key = userdata.get("PINECONE_API_KEY")
# os.environ['PINECONE_API_KEY'] = pinecone_api_key

# Initialize Pinecone
pc = Pinecone(api_key=pinecone_api_key,)

# Connect to your Pinecone index
pinecone_index = pc.Index("codebase-rag")

In [24]:
vectorstore = PineconeVectorStore(index_name="codebase-rag", embedding=HuggingFaceEmbeddings())

  vectorstore = PineconeVectorStore(index_name="codebase-rag", embedding=HuggingFaceEmbeddings())
  vectorstore = PineconeVectorStore(index_name="codebase-rag", embedding=HuggingFaceEmbeddings())


In [66]:
# Insert the codebase embeddings into Pinecone
documents = []

for file in file_content:
  doc = Document(
      page_content = f"{file['name']}\n{file['content']}",
      metadata = {"source" : file['name']} # keep track of the path of the file if it is a large codespace
  )
  documents.append(doc)

vectorstore = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=HuggingFaceEmbeddings(),
    index_name="codebase-rag",
    namespace="https://github.com/Sruthij93/Movie-Recommendation"
) # insert into pinecone directly









  embedding=HuggingFaceEmbeddings(),


# Perform RAG

1. Get your Groq API Key [here](https://console.groq.com/keys)

2. Paste your Groq API Key into your Google Colab secrets, and make sure to enable permissions for it

![Screenshot 2024-11-25 at 12 00 16 AM](https://github.com/user-attachments/assets/e5525d29-bca6-4dbd-892b-cc770a6b281d)


In [26]:
client = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=os.getenv("GROQ_API_KEY")
)

In [67]:
query = "explain what this app does."

In [68]:
query_embedding = get_huggingface_embeddings(query)

In [37]:
query_embedding

array([ 5.39445318e-02, -4.25215438e-02,  1.45429121e-02,  6.92990329e-03,
        1.09351715e-02,  3.79858129e-02,  8.64060316e-03, -9.36198141e-03,
        9.22756456e-03, -6.79123998e-02,  4.73060198e-02,  5.35087800e-03,
        1.45903891e-02,  5.59657067e-02, -3.11565325e-02, -3.09478492e-02,
        3.46395113e-02,  7.22381100e-03,  7.62414485e-02,  2.02651452e-02,
       -5.29880114e-02,  3.53080519e-02,  1.19579410e-04,  3.22537012e-02,
        9.25817178e-04,  5.09723974e-03, -6.42280281e-03,  4.94619831e-03,
       -3.84476520e-02, -1.22849517e-01, -2.87414491e-02, -2.99173091e-02,
        4.25344370e-02,  4.63509699e-03,  1.25876124e-06, -2.70571709e-02,
       -5.37696816e-02,  1.60054059e-03, -1.80131085e-02,  6.11494482e-03,
        6.74171969e-02, -5.70848063e-02,  4.09752764e-02,  2.27653328e-03,
       -6.61606714e-02, -3.05511560e-02,  3.03801447e-02, -4.50815074e-03,
        7.45328441e-02,  8.30167383e-02, -8.39436054e-03, -5.58911785e-02,
        1.50573412e-02,  

In [69]:
top_matches = pinecone_index.query(
    vector=query_embedding.tolist(),
    top_k=5,
    include_metadata=True,
    namespace="https://github.com/Sruthij93/Movie-Recommendation"
)

In [70]:
top_matches

{'matches': [{'id': '1440025e-2f9f-49e1-98bc-d716c427687e',
              'metadata': {'source': 'app.py',
                           'text': 'app.py\n'
                                   'import streamlit as st\n'
                                   'import pickle\n'
                                   'import requests\n'
                                   '\n'
                                   'def recommend(movie):\n'
                                   "    movie_index = movies[movies['title'] "
                                   '== movie].index[0]\n'
                                   '    distances = similarity[movie_index]\n'
                                   '    rec_movies_list = '
                                   'sorted(list(enumerate(distances)), reverse '
                                   '= True, key=lambda x:x[1])[1:6]\n'
                                   '\n'
                                   '    recommended_movies = []\n'
                                   '    r

In [71]:
contexts = [item["metadata"]["text"] for item in top_matches["matches"]]

In [73]:
augmented_query = "<CONTEXT>\n" + "\n\n-------\n\n".join(contexts[:10]) + "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n" + query

In [74]:
print(augmented_query)

<CONTEXT>
app.py
import streamlit as st
import pickle
import requests

def recommend(movie):
    movie_index = movies[movies['title'] == movie].index[0]
    distances = similarity[movie_index]
    rec_movies_list = sorted(list(enumerate(distances)), reverse = True, key=lambda x:x[1])[1:6]

    recommended_movies = []
    recommended_movie_posters = []
    for i in rec_movies_list:
        movie_id = movies.iloc[i[0]].movie_id
        recommended_movie_posters.append(fetch_poster(movie_id))
        recommended_movies.append(movies.iloc[i[0]].title)
    return recommended_movies, recommended_movie_posters    


def fetch_poster(movie_id):
    response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key=464b2949746d666c96b76e01946ecd3c&language=en-US'.format(movie_id))
    data = response.json()
    poster = "https://image.tmdb.org/t/p/w500/" + data['poster_path']
    return poster


movies= pickle.load(open('movies.pkl', 'rb'))
movies_list = movies['title'].values
similarity = 

In [75]:
system_prompt = f"""you are a senior software engineer, specializing in coding of python, streamlit and typescript apps.

Answer any questions I have about the codebase, based on all the context provided.
Always consider all of the context provided when forming the response.

Let's think step by step."""



llm_response = client.chat.completions.create(
  model="llama-3.1-8b-instant",
  messages=[
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": augmented_query}
  ]
)

response = llm_response.choices[0].message.content



In [76]:
print(response)

Based on the provided code, this app is a simple Movie Recommender System built using Streamlit.

Here's a breakdown of what the app does:

1. **User Input**: The app presents a dropdown list of movie titles using the `selectbox` widget from Streamlit. The list is populated from a file called `movies.pkl`, which is loaded using the `pickle` library. This file likely contains a Pandas DataFrame with movie information, including the title and id.

2. **Calculation of Recommendations**: When the user clicks the "Recommend" button, the app calls the `recommend(movie)` function with the selected movie title as an argument. This function:
	* Finds the index of the selected movie in the `movies` DataFrame.
	* Retrieves the similarity scores between the selected movie and other movies in the dataset.
	* Returns the titles and poster URLs of the top 5 most similar movies (excluding the original selected movie).

3. **Displaying Recommendations**: The app uses Streamlit's grid functionality to d

In [78]:
def perform_rag(query):
   raw_query_embedding = get_huggingface_embeddings(query)


   top_matches = pinecone_index.query(vector=raw_query_embedding.tolist(), top_k=5, include_metadata=True, namespace="https://github.com/Sruthij93/Movie-Recommendation")


   # Get the list of retrieved texts
   contexts = [item['metadata']['text'] for item in top_matches['matches']]


   augmented_query = "<CONTEXT>\n" + "\n\n-------\n\n".join(contexts[ : 10]) + "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n" + query


   # Modify the prompt below as need to improve the response quality
   system_prompt = f"""You are a Senior Software Engineer, specializing in TypeScript.


   Answer any questions I have about the codebase, based on the code provided. Always consider all of the context provided when forming a response.
   """


   llm_response = client.chat.completions.create(
       model="llama-3.1-8b-instant",
       messages=[
           {"role": "system", "content": system_prompt},
           {"role": "user", "content": augmented_query}
       ]
   )


   return llm_response.choices[0].message.content

In [79]:
response = perform_rag("how can i improve this app?")

print(response)

There are several improvements that can be suggested for the provided app:

1. **Data Storage and Loading:**
   - Instead of using `pickle` to load the movie data and similarity matrix, consider using a more robust and efficient way to store and load the data, such as a SQLite database or a Cloud Storage like Google Drive.
   - Use a clear and consistent naming convention, e.g., `movie_data.pkl` for the data and `movie_similarity_matrix.pkl` for the matrix.

2. **Error Handling:**
   - Add try-except blocks to handle potential exceptions, such as:
     - When the API request to fetch movie posters fails.
     - When the similarity matrix or movie data is not loaded correctly.
     - When the selected movie is not found in the data.

3. **Code Organization:**
   - Move the data loading and similarity calculation code to a separate function or module, so it's not part of the main app code.
   - Consider refactoring the `recommend` function to use more descriptive variable names and to fo

## Streamlit App with Chatbot


In [80]:
import streamlit as st
import os
from github import Github
from git import Repo
from sentence_transformers import SentenceTransformer
from langchain_pinecone import PineconeVectorStore
from langchain_community.embeddings import HuggingFaceEmbeddings
from pinecone import Pinecone
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI

In [None]:
%% writefile /Users/sruthi/Documents/My projects/HEADSTARTER/Codebase_RAG/Codebase-RAG/app.py

# Initialize Pinecone
pc = Pinecone(api_key=st.secrets["PINECONE_API_KEY"])
pinecone_index = pc.Index("codebase-rag")

# Initialize OpenAI client
client = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=st.secrets["GROQ_API_KEY"]
)

# perform rag
def perform_rag(query, namespace):
   raw_query_embedding = get_huggingface_embeddings(query)


   top_matches = pinecone_index.query(vector=raw_query_embedding.tolist(), top_k=5, include_metadata=True, namespace=namespace)


   # Get the list of retrieved texts
   contexts = [item['metadata']['text'] for item in top_matches['matches']]


   augmented_query = "<CONTEXT>\n" + "\n\n-------\n\n".join(contexts[ : 10]) + "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n" + query


   # Modify the prompt below as need to improve the response quality
   system_prompt = f"""You are a Senior Software Engineer, specializing in TypeScript.


   Answer any questions I have about the codebase, based on the code provided. Always consider all of the context provided when forming a response.
   """


   llm_response = client.chat.completions.create(
       model="llama-3.1-8b-instant",
       messages=[
           {"role": "system", "content": system_prompt},
           {"role": "user", "content": augmented_query}
       ]
   )


   return llm_response.choices[0].message.content


# List of embedded GitHub repos
repos = [
    "https://github.com/CoderAgent/SecureAgent",
    "https://github.com/Sruthij93/Movie-Recommendation"
]

# Main UI
st.title("🤖 CodeSage 🤖")