In [1]:
import pandas as pd
import os
import dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import TokenTextSplitter
from langchain.docstore.document import Document
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [2]:
dotenv.load_dotenv()

True

In [3]:
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [4]:
file_path = r"C:\Users\User\Downloads\sid_kosti\GenAI\iNeuron\GenAI-Doc2QA\data\Siddhartha_Kosti_LLM_Resume.pdf"
loader = PyPDFLoader(file_path)
document = loader.load()

In [5]:
question_gen = ""
for doc in document:
    question_gen += doc.page_content

In [6]:
splitter_ques_gen = TokenTextSplitter(
    model_name="gpt-4o-mini",
    chunk_size=10000,
    chunk_overlap=200
)

In [7]:
chunk_ques_gen = splitter_ques_gen.split_text(question_gen)

In [8]:
document_ques_gen = [Document(item) for item in chunk_ques_gen]

In [9]:
splitter_ans_gen = TokenTextSplitter(
    model_name = 'gpt-4o-mini',
    chunk_size = 1000,
    chunk_overlap = 100
)

In [10]:
document_answer_gen = splitter_ans_gen.split_documents(
    document_ques_gen
)

In [11]:
llm_ques_gen_pipeline = ChatOpenAI(
    model = 'gpt-4o-mini',
    temperature = 0.3
)

  warn_deprecated(


In [12]:
prompt_template = """
You are an expert at creating questions based on coding materials and documentation.
Your goal is to prepare a coder or programmer for their exam and coding tests.
You do this by asking questions about the text below:

------------
{text}
------------

Create questions that will prepare the coders or programmers for their tests.
Make sure not to lose any important information.
Only create 5 questions.

QUESTIONS:
"""

In [13]:
PROMPT_QUESTIONS = PromptTemplate(template=prompt_template, input_variables=['text'])

In [14]:
refine_template = ("""
You are an expert at creating practice questions based on coding material and documentation.
Your goal is to help a coder or programmer prepare for a coding test.
We have received some practice questions to a certain extent: {existing_answer}.
We have the option to refine the existing questions or add new ones.
(only if necessary) with some more context below.
------------
{text}
------------

Given the new context, refine the original questions in English.
If the context is not helpful, please provide the original questions.
QUESTIONS:
"""
)

In [15]:
REFINE_PROMPT_QUESTIONS = PromptTemplate(
    input_variables=["existing_answer", "text"],
    template=refine_template,
)

In [16]:
ques_gen_chain = load_summarize_chain(llm = llm_ques_gen_pipeline, 
                                          chain_type = "refine", 
                                          verbose = True, 
                                          question_prompt=PROMPT_QUESTIONS, 
                                          refine_prompt=REFINE_PROMPT_QUESTIONS)

In [17]:
ques = ques_gen_chain.run(document_ques_gen)

  warn_deprecated(




[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are an expert at creating questions based on coding materials and documentation.
Your goal is to prepare a coder or programmer for their exam and coding tests.
You do this by asking questions about the text below:

------------
[Type text]  
 
 
 
 
 
Projects  
 
 LLM RAG Chatbot for Hospital system  
 Technologies used: LangChain, Neo4j AuraDB , Agents, 
Docker, Prompt Engineering , Streamlit , Litellm . 
 Integrated LangChain and Neo4j AuraDB for data 
processing and storage . 
 Developed Agents and customized prompts for text -to-
SQL functionality . 
 Dockerized the application for scalability and 
deployment.  
 Utilized litellm and groq -API for cost elimination  
 Designed a user interfac e with Streamlit . 
 
 LLM Pipeline  evaluation  tests  
 Hallucination and Faithfulness test  
 Contextual relevancy test  
 Readability & Di

In [18]:
embeddings = OpenAIEmbeddings()

  warn_deprecated(


In [19]:
vector_store = FAISS.from_documents(document_answer_gen, embeddings)

In [20]:
llm_answer_gen = ChatOpenAI(temperature=0.1, model="gpt-4o-mini")

In [21]:
ques_list = ques.split("\n")

In [22]:
ques_list

['1. **Technologies and Integration**: Describe how LangChain and Neo4j AuraDB were integrated in the LLM RAG Chatbot project. What role did each technology play in data processing and storage?',
 '',
 '2. **Testing Methodologies**: Outline the different evaluation tests conducted for the LLM Pipeline. What are the objectives of the Hallucination and Faithfulness test, and how do they contribute to the overall assessment of the model?',
 '',
 '3. **Model Training Techniques**: Explain the concepts of transfer learning and fine-tuning as utilized in model training on a custom dataset. How does the Active Learning approach help achieve similar performance with a smaller dataset?',
 '',
 '4. **Clustering and Analysis**: In the context of clustering, what is the purpose of using K-Means for the train and test dataset? Additionally, how does the Segment Anything Model (SAM) assist in identifying labeling consistency issues?',
 '',
 '5. **Dashboard Development**: What are the key features of

In [23]:
from langchain.chains import RetrievalQA

In [24]:
answer_generation_chain = RetrievalQA.from_chain_type(llm=llm_answer_gen, 
                                               chain_type="stuff", 
                                               retriever=vector_store.as_retriever())

In [25]:
for question in ques_list:
    print("Question: ", question)
    answer = answer_generation_chain.run(question)
    print("Answer: ", answer)
    print("--------------------------------------------------\\n\\n")
    # Save answer to file
    with open("answers.txt", "a") as f:
        f.write("Question: " + question + "\\n")
        f.write("Answer: " + answer + "\\n")
        f.write("--------------------------------------------------\\n\\n")

Question:  1. **Technologies and Integration**: Describe how LangChain and Neo4j AuraDB were integrated in the LLM RAG Chatbot project. What role did each technology play in data processing and storage?
Answer:  In the LLM RAG Chatbot project, LangChain and Neo4j AuraDB were integrated to facilitate data processing and storage. 

- **LangChain**: This technology was utilized for building the chatbot's functionality, particularly in handling language models and managing the interactions between the user and the chatbot. It likely played a crucial role in processing user queries and generating responses based on the underlying data.

- **Neo4j AuraDB**: This is a graph database service that was used for data storage. It allowed for efficient storage and retrieval of data, particularly in scenarios where relationships between data points are important. In the context of the chatbot, Neo4j AuraDB would have been used to store structured data that the chatbot could query to provide relevant