In [2]:
import os

In [3]:
from git import Repo
from langchain.text_splitter import Language
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain

# Clone the GitHub repo

In [4]:
# Show the working directory: the relative paths used below ("test_repo/",
# './data') resolve against it.
%pwd

'/Users/subhasish/Documents/iNeuron/GenAI/Codebase-Analysis-using-GenAI/research'

In [5]:
# Create the clone target in pure Python: unlike the shell `mkdir`, this is
# portable and does not error when the directory already exists on a re-run.
os.makedirs("test_repo", exist_ok=True)

In [6]:
repo_path = "test_repo/"
repo_url = "https://github.com/Subhasish-Saha/Student-Performance-Indicator"

# Cloning into a directory that already holds a checkout raises an error, so
# reuse the existing clone when this cell is re-run instead of cloning again.
if os.path.isdir(os.path.join(repo_path, ".git")):
    repo = Repo(repo_path)
else:
    repo = Repo.clone_from(repo_url, to_path=repo_path)

# Last expression: display the Repo object as the cell output.
repo

<git.repo.base.Repo '/Users/subhasish/Documents/iNeuron/GenAI/Codebase-Analysis-using-GenAI/research/test_repo/.git'>

In [7]:
# Load only the Python sources that live under the cloned repo's src/ folder.
repo_path = "test_repo/"
source_dir = repo_path + '/src'

# NOTE(review): per the LanguageParser docs, parser_threshold is the minimum
# number of lines a file needs before it is split by code structure; shorter
# files are kept as a single document. Confirm against the installed version.
loader = GenericLoader.from_filesystem(
    source_dir,
    glob="**/*",
    suffixes=[".py"],
    parser=LanguageParser(language=Language.PYTHON, parser_threshold=500),
)

In [8]:
# Read every file matched by the loader into a list of Document objects.
documents = loader.load()

In [9]:
# Preview only the first few documents; dumping the whole list floods the
# notebook with the full text of every source file.
documents[:3]

[Document(page_content="import sys \nfrom src.logger import logging\n\ndef error_message_detail(error, error_detail:sys):\n\n    _,_, exc_tb = error_detail.exc_info()\n    filename = exc_tb.tb_frame.f_code.co_filename\n    line_no = exc_tb.tb_lineno\n    error_message = f'Error occured in the python script named : {filename} , at line no. : {line_no} , error details : {error} '\n\n    return error_message\n\nclass Custom_Exception(Exception):\n\n    def __init__(self, error_message, error_detail:sys):\n        super().__init__(error_message)\n        self.error_message = error_message_detail(\n          error= error_message,\n          error_detail=error_detail  \n        )\n\n    def __str__(self) -> str:\n        return self.error_message", metadata={'source': 'test_repo/src/exception.py', 'language': <Language.PYTHON: 'python'>}),
 Document(page_content='', metadata={'source': 'test_repo/src/__init__.py', 'language': <Language.PYTHON: 'python'>}),
 Document(page_content='import os\n

# Chunking

In [10]:
# Splitter configured for Python source, with a 2000 chunk size and a 200
# overlap between neighbouring chunks.
documents_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=2000,
    chunk_overlap=200,
)

In [11]:
# Break the loaded documents into chunks using the splitter configured above.
texts = documents_splitter.split_documents(documents)

In [12]:
# Number of chunks produced by the splitter.
len(texts)

20

# Embedding model

In [15]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment. The
# OpenAI clients created below are constructed without an explicit key, so
# they depend on OPENAI_API_KEY being present in the environment.
load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')

# Fail here with a clear message instead of at the first OpenAI call.
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to a .env file or export it before running this notebook."
    )

In [18]:
# disallowed_special=() turns off the tokenizer check that rejects text
# containing special-token strings (e.g. "<|endoftext|>"). Nothing is removed:
# such text in the source files is tokenized as ordinary text instead of
# raising an error.
embeddings=OpenAIEmbeddings(disallowed_special=())

# Knowledge base (Chroma DB)

In [19]:
# Embed the chunks and store them in a Chroma collection backed by ./data.
# NOTE(review): re-running this cell presumably adds the same chunks to the
# persisted collection again — confirm, and clear ./data before a rebuild if so.
vectordb = Chroma.from_documents(
    texts,
    embedding=embeddings,
    persist_directory='./data',
)

# Write the collection to disk.
vectordb.persist()

# LLM Wrapper

In [20]:
# Use ChatOpenAI's default model; pass model_name="gpt-4" to use GPT-4 instead.
llm = ChatOpenAI()

In [21]:
# Conversation memory backed by the same LLM, exposed to the chain under the
# "chat_history" key.
memory = ConversationSummaryMemory(
    llm=llm,
    memory_key="chat_history",
    return_messages=True,
)

In [22]:
# Retriever over the vector store: MMR search with k=3.
retriever = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 3})

# Conversational QA chain combining the LLM, the retriever and the memory.
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory,
)

# Q&A

In [23]:
# Sample question about the indexed codebase.
question = "what is DataIngestion class?"

In [24]:
# Run the chain on the question and print only the generated answer text.
result = qa(question)
answer = result['answer']
print(answer)

The purpose of the DataIngestion class is to read raw data from a specific data source, split it into a train set and test set, save them as CSV files, and return the file paths to the train and test data. The class also handles exceptions during data ingestion by raising a Custom_Exception if an error occurs.
