# Imports

In [3]:
# Install required packages
!pip install PyGithub python-dotenv
!pip install openai langchain faiss-cpu
!pip install -U langchain-openai
!pip install -U langchain_chroma
!pip install -U langchain-community
!pip install anytree
!pip install yaspin
!pip install rank_bm25



# Fetch GitHub repo

In [4]:
import os
import json
import base64
from github import Github, Auth, UnknownObjectException
import tiktoken
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
import json
from anytree import Node, RenderTree

# Use Colab's Secrets feature for secure authentication
from google.colab import userdata

class GitHubRepoFetcher:
    def __init__(self):
        # Get GitHub PAT from Colab Secrets
        self.pat = userdata.get("GITHUB_PAT")
        self.auth = Auth.Token(self.pat)
        self.g = Github(auth=self.auth)
        self.output = {
            "repo_name": "",
            "files": [],
            "submodules": []
        }
        # Patterns to exclude from results
        self.excluded_patterns = {
            '.DS_Store', '___MACOSX', '.o', '.swp', '.swiftpm',
            'bwspblob', 'dsclbool', 'lg1Scomp', 'moDDblob',
            'ph1Scomp', 'vSrnlong', 'Ilocblob', 'modDblob'
        }

    def _get_default_branch(self, repo):
        try:
            return repo.default_branch
        except:
            return "main"

    def _should_exclude(self, path):
        """Check if file path matches exclusion patterns"""
        return any(pattern in path for pattern in self.excluded_patterns)

    def _clean_content(self, content, path):
        """Clean and format file contents"""
        # Skip binary files
        if b'\x00' in content[:1024]:
            print(f"🚫 Skipped binary file: {path}")
            return None

        # Format C++ files
        if path.endswith(('.cpp', '.h', '.hpp')):
            try:
                return '\n'.join([line.rstrip() for line in content.decode('utf-8').split('\n')])
            except UnicodeDecodeError:
                return None

        return content.decode('utf-8', errors='replace')

    def _get_file_content(self, repo, path, ref):
        try:
            content = repo.get_contents(path, ref=ref)
            if content.encoding == "base64":
                decoded = base64.b64decode(content.content)
                return self._clean_content(decoded, path)
            return None
        except Exception as e:
            print(f"⚠️ Skipped {path}: {str(e)}")
            return None

    def _process_repo(self, repo_name, path="", prefix=""):
        try:
            repo = self.g.get_repo(repo_name)
            default_branch = self._get_default_branch(repo)
            files = []
            submodules = []

            try:
                contents = repo.get_contents(path, ref=default_branch)
            except UnknownObjectException:
                return {"files": [], "submodules": []}

            for item in contents:
                if self._should_exclude(item.path):
                    continue

                full_path = os.path.join(prefix, item.path)
                if item.type == "file":
                    content = self._get_file_content(repo, item.path, default_branch)
                    if content:
                        files.append({
                            "file_path": full_path,
                            "content": content
                        })
                elif item.type == "dir":
                    subdir_data = self._process_repo(
                        repo_name,
                        path=item.path,
                        prefix=os.path.join(prefix, item.path)
                    )
                    files.extend(subdir_data["files"])
                    submodules.extend(subdir_data["submodules"])
                elif item.type == "submodule":
                    submodule_data = self._process_submodule(item, prefix)
                    if submodule_data:
                        submodules.append(submodule_data)

            return {
                "files": files,
                "submodules": submodules
            }
        except Exception as e:
            print(f"💥 Error in {repo_name}: {str(e)}")
            return {"files": [], "submodules": []}

    def _process_submodule(self, submodule_item, prefix):
        try:
            sub_url = submodule_item.html_url
            repo_path = sub_url.replace("https://github.com/", "")
            sub_repo = self.g.get_repo(repo_path)
            commit_sha = submodule_item.sha or sub_repo.get_branch(
                self._get_default_branch(sub_repo)
            ).commit.sha

            sub_data = self._process_repo(
                repo_path,
                prefix=os.path.join(prefix, submodule_item.path)
            )

            return {
                "name": submodule_item.path,
                "url": sub_url,
                "commit": commit_sha,
                "files": sub_data["files"],
                "submodules": sub_data["submodules"]
            }
        except Exception as e:
            print(f"❌ Submodule failed: {str(e)}")
            return None

    def fetch_repo_structure(self, repo_name):
        try:
            result = self._process_repo(repo_name)
            self.output.update({
                "repo_name": repo_name,
                **result
            })
            return self.output
        except Exception as e:
            print(f"💥 Critical error: {str(e)}")
            return None

    def save_as_json(self, data, filename="repo_contents.json"):
        """Save with cleaned structure and sorted entries"""
        cleaned_data = {
            "repo_name": data["repo_name"],
            "files": sorted(data["files"], key=lambda x: x["file_path"]),
            "submodules": sorted(data["submodules"], key=lambda x: x["name"])
        }

        with open(filename, "w") as f:
            json.dump(cleaned_data, f, indent=2, ensure_ascii=False)
        print(f"✅ Cleaned structure saved to {filename}")

# Map file extensions to LangChain Language enums
LANGUAGE_SPLITTER_MAP = {
    ".py": Language.PYTHON,
    ".js": Language.JS,
    ".cpp": Language.CPP,
    ".java": Language.JAVA,
    ".php": Language.PHP,
    ".go": Language.GO,
    ".rst": Language.RST,
    ".scala": Language.SCALA,
    ".swift": Language.SWIFT,
    ".md": Language.MARKDOWN,
    ".tex": Language.LATEX,
    ".html": Language.HTML,
    ".sol": Language.SOL,
    ".proto": Language.PROTO,}

# Token-based chunking for unsupported file types
def chunk_by_tokens(encoded_tokens, chunk_size, overlap_size):
  chunks = []
  start_idx = 0
  while start_idx < len(encoded_tokens):
    end_idx = start_idx + chunk_size
    chunk = encoded_tokens[start_idx:end_idx]
    chunks.append(chunk)
    start_idx += chunk_size - overlap_size
  return chunks

def build_file_tree(paths):
    root = Node("root")
    nodes = {"": root}
    for path in sorted(paths):
        parts = path.strip("/").split("/")
        for i in range(len(parts)):
            subpath = "/".join(parts[:i+1])
            if subpath not in nodes:
                parent_path = "/".join(parts[:i])
                nodes[subpath] = Node(parts[i], parent=nodes[parent_path])
    return "\n".join(f"{'  ' * node.depth}- {node.name}" for node in root.descendants)


def preprocess_repository(repo_data, chunk_size=500, overlap_size=50, model_name="gpt-3.5-turbo"):
    processed_docs = []
    encoding = tiktoken.encoding_for_model(model_name)

    for file_info in repo_data["files"]:
        file_path = file_info["file_path"]
        content = file_info["content"]
        file_extension = os.path.splitext(file_path)[1]

        if file_extension in LANGUAGE_SPLITTER_MAP:
            splitter = RecursiveCharacterTextSplitter.from_language(
                language=LANGUAGE_SPLITTER_MAP[file_extension],
                chunk_size=chunk_size,
                chunk_overlap=overlap_size,
            )
            docs = splitter.create_documents([content], metadatas=[{"source": file_path, "type": file_extension}])
        else:
            encoded = encoding.encode(content)
            chunks = chunk_by_tokens(encoded, chunk_size, overlap_size)
            docs = [
                Document(
                    page_content=encoding.decode(chunk).strip(),
                    metadata={"source": file_path, "type": file_extension}
                )
                for chunk in chunks if chunk
            ]

        processed_docs.extend([d for d in docs if d.page_content.strip()])

    # Add tree diagram as document
    file_paths = [f["file_path"] for f in repo_data["files"]]
    tree_structure = build_file_tree(file_paths)
    structure_doc = Document(
    page_content=f"""Project File Structure Overview:
    The following is a hierarchical view of all files and folders in the repository:

    {tree_structure}

    Keywords: file structure, folder layout, project organization, directory structure
    """,
        metadata={"source": f"{repo_data['repo_name']}_structure"}
    )
    processed_docs.append(structure_doc)

    return processed_docs

### Interface code

In [5]:
import os
import re
import json
from urllib.parse import urlparse
from IPython.display import display, Markdown
from langchain_openai import ChatOpenAI, OpenAIEmbeddings, OpenAI
from langchain.vectorstores import FAISS
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from IPython.display import display, Markdown
from yaspin import yaspin
from yaspin.spinners import Spinners

def run_chat_interface(qa_chain):
    """Handles interactive Q&A session with the repository data"""
    chat_history = []
    print("\n🧠 Repo ChatBot ready. Type 'exit' to stop.\n")

    while True:
        try:
            question = input("❓ You: ")
            if question.lower() in ("exit", "quit"):
                print("👋 Chat ended.")
                break

            # Process the query
            with yaspin(Spinners.dots, text="🤖 Thinking...") as spinner:
                result = qa_chain.invoke({
                    "question": question,
                    "chat_history": chat_history
                })
                spinner.ok("✅")


            # Store conversation
            chat_history.append((question, result["answer"]))

            # Display formatted response
            display(Markdown(f"**🤖 Git Analyzer:** {result['answer']}"))

            if result['source_documents']:
                sources = "\n".join([f"- {doc.metadata['source']}" for doc in result['source_documents'][:4]])
                display(Markdown(f"**🔍 Sources:**\n{sources}"))

        except KeyboardInterrupt:
            print("\n🚨 Session interrupted")
            break
        except Exception as e:
            print(f"💥 Error: {str(e)}")
            break

def extract_repo_name(url):
    """Convert GitHub URL to 'owner/repo' format"""
    if url.startswith('git@github.com:'):
        return url[15:].replace('.git', '')

    parsed = urlparse(url)
    if parsed.netloc == 'github.com':
        path = parsed.path.strip('/')
        path = re.sub(r'\.git$', '', path)
        return '/'.join(path.split('/')[:2])
    raise ValueError("Invalid GitHub URL")

# Main execution flow
def git_analyzer():
    github_url = input("Enter GitHub repository URL: ")

    try:
        # Extract and process repo
        repo_name = extract_repo_name(github_url)

        fetcher = GitHubRepoFetcher()

        with yaspin(Spinners.dots, text="🔄 Preprocessing repository files...") as spinner:
            repo_data = fetcher.fetch_repo_structure(repo_name)
            spinner.ok("✅")

        if not repo_data:
            raise ValueError("Failed to process repository")

        # Save and reload data to ensure consistency
        fetcher.save_as_json(repo_data)
        with open("repo_contents.json") as f:
            repo_data = json.load(f)

        documents = preprocess_repository(repo_data)

        # 1. Set your OpenAI API key
        os.environ["OPENAI_API_KEY"] = userdata.get("API_KEY")

        # 2. Embed documents & save to FAISS index
        embedding = OpenAIEmbeddings()
        vectorstore = FAISS.from_documents(documents, embedding)
        vectorstore.save_local("faiss_index_repo")

        output_path = "/content/vectorstore_documents.txt"
        with open(output_path, "w", encoding="utf-8") as f:
          for doc in vectorstore.docstore._dict.values():
            source = doc.metadata.get("source", "N/A")
            f.write(f"Source: {source}\n")
            f.write(doc.page_content + "\n")
            f.write("-" * 40 + "\n")

        # 3. Load the retriever
        faiss_retriever = vectorstore.as_retriever()

        # Create keyword-based retriever (BM25)
        bm25 = BM25Retriever.from_documents(documents)
        bm25.k = 5

        retriever = EnsembleRetriever(
            retrievers=[faiss_retriever, bm25],
            weights=[0.7, 0.3]
        )

        custom_prompt_template = """
        You are Git Analyzer — an expert developer assistant designed to answer any question about a GitHub repository.

        You are provided with context retrieved from the repository, which may include code files, comments, documentation, file structures, and setup scripts.

        Your job is to analyze this context and answer the user’s question accurately, clearly, and concisely. Think like a senior engineer explaining things to a curious developer.

        If the answer requires referring to project structure, filenames, or functions, mention them explicitly and it should look readable."

        ---------------------
        {context}
        ---------------------

        Question: {question}

        Answer as a developer assistant:
        """

        # If you don't know the answer based on the given context, say "I couldn't find that information in the repository context.

        custom_prompt = PromptTemplate(
            input_variables=["context", "question"],
            template=custom_prompt_template,
        )

        llm = ChatOpenAI(
            temperature=0.3,
            model_name="gpt-4.1",
            max_tokens=1024
        )

        conversation_chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=retriever,
            combine_docs_chain_kwargs={"prompt": custom_prompt},
            return_source_documents=True
        )

        # Start chat interface
        run_chat_interface(conversation_chain)

    except Exception as e:
        print(f"💥 Critical error: {str(e)}")
        print("Possible fixes:")
        print("1. Check GitHub URL format")
        print("2. Verify repository exists and is accessible")
        print("3. Ensure valid PAT in Colab Secrets")


# GitAnalyzer

In [None]:
git_analyzer()

Enter GitHub repository URL: https://github.com/RahulSethi070801/Hybrid-Distributed-File-System
✅ 🔄 Preprocessing repository files...
✅ Cleaned structure saved to repo_contents.json

🧠 Repo ChatBot ready. Type 'exit' to stop.

❓ You: Can you show me full folder structure of the github repo?
✅ 🤖 Thinking...


**🤖 Git Analyzer:** Certainly! Here is the full folder (directory) structure of the GitHub repository, as provided:

```
- Cache.cpp
- Cache.h
- ConsistentHashRing.cpp
- ConsistentHashRing.h
- Daemon.cpp
- FileMetaData.cpp
- FileMetaData.h
- HyDFS.cpp
- HyDFS.h
- HyDFSMessage.cpp
- HyDFSMessage.h
- Makefile
- MembershipList.cpp
- Message.cpp
- Node.cpp
- Node.h
- README.md
- files
  - files
    - Rahuls-MacBook-Pro-2.local#4561#22:46:20
      - files
        - Rahuls-MacBook-Pro-2.local#4561#22:46:20
          - hydfs
            - files
              - Rahuls-MacBook-Pro-2.local#4561#22:46:20
                - hydfs
                  - aa$1
                  - aa$2
          - local
            - files
              - Rahuls-MacBook-Pro-2.local#4561#22:46:20
                - local
                  - b1.txt
                  - b2
    - introducer
      - files
        - introducer
          - hydfs
            - files
              - introducer
                - hydfs
                  - aa$1
                  - aa$2
          - local
            - files
              - introducer
                - local
                  - a.txt
    - remote
      - files
        - remote
          - local
            - files
              - remote
                - local
                  - appendFile
                  - appendFile40K
                  - business_1.txt
                  - business_10.txt
                  - business_11.txt
                  - business_12.txt
                  - business_13.txt
                  - business_14.txt
                  - business_15.txt
                  - business_16.txt
                  - business_17.txt
                  - business_18.txt
                  - business_19.txt
                  - business_2.txt
                  - business_20.txt
                  - business_3.txt
                  - business_4.txt
                  - business_5.txt
                  - business_6.txt
                  - business_7.txt
                  - business_8.txt
                  - business_9.txt
                  - foo1.txt
                  - foo2.txt
                  - local
- main.cpp
- scripts
  - scripts
    - cache_plot.py
    - cache_plot2.py
    - createTestLocalFiles.sh
    - generate_dataset.py
    - graph.py
    - init.sh
    - logvm.sh
    - lrun.sh
    - pb.sh
    - rrun.sh
    - run_introducer_locally.sh
    - run_nodes_locally.sh
    - script.py
    - stop_local_nodes.sh
    - vm.sh
    - vmi.sh
- utils.cpp
- utils.h
```

This structure includes all top-level source files, headers, the Makefile, a detailed breakdown of the nested `files` directory (with sample data and test files), and the `scripts/scripts` directory containing various Python and shell scripts for testing, plotting, and managing the distributed system.

**🔍 Sources:**
- RahulSethi070801/Hybrid-Distributed-File-System_structure
- scripts/scripts/pb.sh
- scripts/scripts/vm.sh
- README.md

❓ You: While file implements caching?
✅ 🤖 Thinking...


**🤖 Git Analyzer:** The caching functionality is implemented in the file named **cache.h**.

- The `Cache` class is defined in `cache.h`, and it manages the in-memory cache using a combination of `std::unordered_map` and `std::list` to store file contents and maintain cache order.
- Key methods such as `addFileToCache`, `getFileFromCache`, `invalidateFileInCache`, and `printCache` are declared in this file.
- The class uses a mutex (`mtx_cache`) to ensure thread safety for cache operations.

If you are looking for the implementation or logic related to caching, you should refer to **cache.h** (and likely its corresponding `.cpp` file, if present, for method definitions).

**🔍 Sources:**
- Cache.h
- Cache.cpp
- Cache.cpp
- HyDFS.cpp

❓ You: Can you show me implementation of above methods?
✅ 🤖 Thinking...


**🤖 Git Analyzer:** Certainly! Here’s what I found regarding the implementation of the methods you asked about from the Cache class:

### 1. `addFileToCache`
**Not present in the provided context.**  
The header declares:
```cpp
void addFileToCache(string &fileName, char *fileContent, size_t contentSize);
```
But the actual implementation is not included in the snippet you provided.

---

### 2. `getFileFromCache`
**Not present in the provided context.**  
The header declares:
```cpp
bool getFileFromCache(string &fileName, char *&fileData, size_t &contentSize);
```
But the implementation is also missing from the current context.

---

### 3. `invalidateFileInCache`
**Implementation found:**
```cpp
void Cache::invalidateFileInCache(string &fileName) {
    lock_guard<mutex> lock(mtx_cache);
    if (cacheMap.find(fileName) != cacheMap.end()) {
        delete[] cacheMap[fileName].first;
        cacheMap.erase(fileName);
        // cacheOrder.remove(fileName);
        cacheOrder.remove_if([&fileName](const string &item) { return item == fileName; });
        cout<<"File invalidated in cache: "<<fileName<<endl;
        printCache();
    }
}
```
**Explanation:**  
- Acquires a lock for thread safety.
- Checks if the file is in the cache.
- Deletes the cached file data, erases the entry from the map, and removes the filename from the order list.
- Prints a message and the current cache state.

---

### 4. `printCache`
**Implementation found:**
```cpp
void Cache::printCache() {
    cout<<"Cache order: ";
    for (auto it = cacheOrder.begin(); it != cacheOrder.end(); it++) {
        cout << *it << " ";
    }
    cout << endl;
    cout<<"Cache map: ";
    for (auto it = cacheMap.begin(); it != cacheMap.end(); it++) {
        cout << it->first << " ";
    }
    cout << endl;
}
```
**Explanation:**  
- Prints the current order of files in the cache.
- Prints the keys (file names) present in the cache map.

---

### Summary Table

| Method                   | Implementation Present? | Notes                                                                 |
|--------------------------|------------------------|-----------------------------------------------------------------------|
| addFileToCache           | ❌                     | Only declared in `Cache.h`, not implemented in provided context.      |
| getFileFromCache         | ❌                     | Only declared in `Cache.h`, not implemented in provided context.      |
| invalidateFileInCache    | ✅                     | Implementation shown above.                                           |
| printCache               | ✅                     | Implementation shown above.                                           |

If you need the implementations for `addFileToCache` or `getFileFromCache`, you may need to check other source files (e.g., `cache.cpp`) or provide more context.

Let me know if you want an explanation of the logic or need help finding the missing methods!

**🔍 Sources:**
- Cache.h
- Cache.cpp
- HyDFS.cpp
- Cache.cpp

❓ You: Which team won Fifa World Cup 2023?
✅ 🤖 Thinking...


**🤖 Git Analyzer:** Based on the provided repository context, there is no information related to the winner of the FIFA World Cup in 2023. The context primarily covers topics in distributed systems, telecommunications mergers, and related references, but does not mention any sports events or FIFA World Cup results.

If you need this information, you may want to consult an up-to-date sports or news source. As of my knowledge cutoff in June 2024, there was no FIFA Men's World Cup held in 2023; the most recent Men's World Cup was in 2022 (won by Argentina), and the Women's World Cup was held in 2023, won by Spain.

**🔍 Sources:**
- files/files/Rahuls-MacBook-Pro-2.local#4561#22:46:20/files/Rahuls-MacBook-Pro-2.local#4561#22:46:20/local/files/Rahuls-MacBook-Pro-2.local#4561#22:46:20/local/b1.txt
- HyDFS.cpp
- files/files/remote/files/remote/local/files/remote/local/business_16.txt
- files/files/remote/files/remote/local/files/remote/local/business_19.txt