# RAGit

### Let's create a function to load the repo

In [None]:
import os
import shutil

from git import Repo
from git.exc import GitCommandError

In [None]:
def clone_repo(repo_url, clone_dir="repo_temp"):
    """Clone a Git repository into ``clone_dir``, replacing any existing copy.

    Args:
        repo_url: Location of the repository, passed to ``Repo.clone_from``.
        clone_dir: Destination directory. If it already exists it is deleted
            before cloning so the clone starts from a clean directory.

    Returns:
        ``clone_dir`` on success, or ``None`` when removing the old directory
        or cloning fails. Failures are printed, not raised.
    """
    try:
        if os.path.exists(clone_dir):
            shutil.rmtree(clone_dir)
        print("Cloning repositry...")
        Repo.clone_from(repo_url, clone_dir)
        print(f"✅ Repo cloned to: {clone_dir}")
        return clone_dir
    # GitCommandError must be imported at module level: an unresolved name in
    # an except clause raises NameError while an exception is being handled,
    # which would hide the real failure.
    except GitCommandError as e:
        print(f"❌ Git error: {e}")
        return None
    except Exception as e:
        # Best-effort boundary: report anything else (e.g. a filesystem error
        # from rmtree) and signal failure through the return value.
        print(f"❌ Unexpected error: {e}")
        return None

In [None]:
# Repository to clone.
repo_url = "https://github.com/Sohan-Choudhary05/RAGit"
# Local destination directory; clone_repo deletes it first if it already exists.
repo_temp = r"C:\Users\Ram Choudhary\OneDrive\Desktop\RAGit\rpo_tmp"
# Clone the repository into repo_temp; the return value is not stored.
clone_repo(repo_url, repo_temp)

### Extract the files from that Repo

In [None]:
def load_repo_files(clone_dir):
    """Collect the text of every supported file found under ``clone_dir``.

    Walks the directory tree recursively and reads each file whose name ends
    with one of the supported extensions. Files are decoded as UTF-8 and
    undecodable bytes are dropped. A file that cannot be read is reported
    with a printed message and skipped.

    Args:
        clone_dir: Root directory to scan.

    Returns:
        A list of ``{"path": ..., "content": ...}`` dicts, one per file read.
    """
    wanted_suffixes = ('.py', '.md', '.js', '.ts', '.html', '.txt')
    collected = []

    for folder, _subdirs, names in os.walk(clone_dir):
        for name in names:
            # str.endswith accepts a tuple and matches any of its members.
            if not name.endswith(wanted_suffixes):
                continue
            full_path = os.path.join(folder, name)
            try:
                with open(full_path, 'r', encoding="utf-8", errors='ignore') as handle:
                    text = handle.read()
                collected.append({"path": full_path, "content": text})
            except Exception as err:
                print(f"[!] Failed to read file:{full_path} — {str(err)} ")
    return collected

In [11]:
# Read the supported files from the same directory the clone cell targets.
loaded_files = load_repo_files(r"C:\Users\Ram Choudhary\OneDrive\Desktop\RAGit\rpo_tmp")

### Let's chunk the documents/code files

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_documents(file_data, chunk_size=500, chunk_overlap=50):
    """Split each loaded file into text chunks tagged with their origin.

    Args:
        file_data: Iterable of dicts with ``"path"`` and ``"content"`` keys.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        A flat list of ``{"text": ..., "metadata": {"source": ...,
        "chunk_id": ...}}`` dicts. ``source`` is the file's path and
        ``chunk_id`` is the chunk's zero-based position within that file.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""],
    )

    # One entry per split, in file order then chunk order.
    return [
        {
            "text": piece,
            "metadata": {
                "source": doc["path"],
                "chunk_id": position,
            },
        }
        for doc in file_data
        for position, piece in enumerate(splitter.split_text(doc["content"]))
    ]

In [None]:
# Chunk the loaded files; the result is not assigned to a variable here.
chunk_documents(loaded_files,chunk_size=500,chunk_overlap=50)