## Importing Necessary Libraries

In [32]:
from git import Repo
import os
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

In [18]:
# Provide the OpenAI key through the environment (e.g. export OPENAI_API_KEY
# before starting Jupyter) rather than pasting it into the notebook.
# setdefault() leaves an already-exported key untouched and only installs the
# empty placeholder when the variable is missing entirely.
os.environ.setdefault("OPENAI_API_KEY", "")

## Cloning the GitHub Repository

In [40]:
# Repository to index; it is checked out into a folder inside the current
# working directory.
url = 'https://github.com/ShowRounak/YT-Comments-Sentiment-Analysis-Using-BERT.git'
current_path = os.getcwd()
print('current path',current_path)
# Strip a trailing slash first so a URL such as ".../repo/" still yields a name.
last_name = url.rstrip('/').split('/')[-1]
print('last name',last_name)
# Remove only a trailing ".git". Splitting on the first "." would truncate
# repository names that themselves contain dots ("my.repo.git" -> "my").
clone_path = last_name[:-len('.git')] if last_name.endswith('.git') else last_name
print('clone path',clone_path)
repo_path = os.path.join(current_path,clone_path)
print('path',repo_path)

# Skip the clone when the target directory already exists.
if not os.path.exists(repo_path):
    repo = Repo.clone_from(url, to_path=repo_path)

current path e:\GitHub Repo Chatbot\git code chatbot
last name YT-Comments-Sentiment-Analysis-Using-BERT.git
clone path YT-Comments-Sentiment-Analysis-Using-BERT
path e:\GitHub Repo Chatbot\git code chatbot\YT-Comments-Sentiment-Analysis-Using-BERT


## Extracting all the files

In [41]:
# File extensions that are loaded from the repository by default.
allowed_extensions = ['.py', '.ipynb', '.md']

def extract_all_files(repo_path, extensions=None):
    """Load every matching file below ``repo_path`` with ``TextLoader``.

    Args:
        repo_path: Root directory that is walked recursively.
        extensions: Optional iterable of file extensions (with leading dot)
            to load. Defaults to the module-level ``allowed_extensions``.
            Matching is case-insensitive, so ``README.MD`` is picked up too.

    Returns:
        A list with the results of ``TextLoader.load_and_split()`` for every
        file that could be read. Files that fail to load are skipped and
        reported on stdout instead of being dropped silently.
    """
    chosen = allowed_extensions if extensions is None else extensions
    wanted = {ext.lower() for ext in chosen}

    docs = []
    for dirpath, _dirnames, filenames in os.walk(repo_path):
        for file in filenames:
            if os.path.splitext(file)[1].lower() not in wanted:
                continue
            file_path = os.path.join(dirpath, file)
            try:
                loader = TextLoader(file_path, encoding='utf-8')
                docs.extend(loader.load_and_split())
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole
                # walk, but the reader should know it was left out.
                print(f'Skipping {file_path}: {e}')
    return docs

# Load the matching files from the cloned repository located at repo_path.
docs = extract_all_files(repo_path)

In [42]:
# Number of entries returned by extract_all_files.
len(docs)

4

## Creating Embeddings

In [43]:
# Model identifier handed to HuggingFaceEmbeddings in create_embeddings.
model_name = "all-MiniLM-L6-v2"
# Extra keyword arguments for the embedding model; selects the CPU device.
model_kwargs={'device': 'cpu'}


def chunk_files(docs, chunk_size=1000, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        docs: Documents to split, as returned by ``extract_all_files``.
        chunk_size: ``chunk_size`` forwarded to
            ``RecursiveCharacterTextSplitter`` (default 1000).
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter
            (default 20).

    Returns:
        The result of ``split_documents(docs)``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)

def create_embeddings(texts):
    """Build the embedding model used to index the repository chunks.

    ``texts`` is accepted so existing call sites keep working, but it is not
    read: the model is configured only from the module-level ``model_name``
    and ``model_kwargs``.
    """
    # Alternative backend: return OpenAIEmbeddings() here instead.
    return HuggingFaceEmbeddings(model_kwargs=model_kwargs, model_name=model_name)

# Split the loaded documents into chunks, then build the embedding model.
texts = chunk_files(docs)
embeddings = create_embeddings(texts)


In [50]:
# Directory for the persisted Chroma index, named after the cloned repository.
chroma_path = clone_path + '-chroma'
print(chroma_path)

YT-Comments-Sentiment-Analysis-Using-BERT-chroma


In [51]:

def load_db(texts, embeddings):
    """Embed ``texts`` into a Chroma store persisted under ``chroma_path``.

    The store is built with ``Chroma.from_documents``, written out with
    ``persist()`` and returned.
    """
    # NOTE(review): re-running this against an existing ``chroma_path`` likely
    # adds the same chunks again - confirm against the Chroma docs.
    store = Chroma.from_documents(
        texts,
        embedding=embeddings,
        persist_directory=chroma_path,
    )
    store.persist()
    return store

# Build and persist the vector store from the chunks and the embedding model.
vectordb = load_db(texts, embeddings)

## Question-answering using GPT

In [52]:
# Chat model shared by the summary memory and the retrieval chain in retrieve_results.
llm = ChatOpenAI(model_name="gpt-3.5-turbo")

def retrieve_results(query, vectordb, memory=None):
    """Answer ``query`` using the documents stored in ``vectordb``.

    Args:
        query: Question passed to the retrieval chain.
        vectordb: Vector store; its ``as_retriever`` is called with MMR
            search and ``k=8``.
        memory: Optional conversation memory to reuse between calls. When
            omitted, a fresh ``ConversationSummaryMemory`` is created, so no
            history carries over from one call to the next. Pass the same
            object repeatedly to keep a running conversation.
            NOTE(review): a supplied memory presumably has to expose its
            history under the same "chat_history" key - confirm.

    Returns:
        The ``'answer'`` entry of the chain result.
    """
    if memory is None:
        memory = ConversationSummaryMemory(
            llm=llm, memory_key="chat_history", return_messages=True
        )
    retriever = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 8})
    qa = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)
    result = qa(query)
    return result['answer']

In [54]:
# Ask one question against the indexed repository and show the model's answer.
query = 'what are the requirements of this repository'
answer = retrieve_results(query,vectordb)

print(answer)

Number of requested results 20 is greater than number of elements in index 5, updating n_results = 5


The requirements for this repository are:
- Python 3.x
- Google Developer API Key
- TensorFlow or PyTorch (choose based on your preference)
