In [1]:
!pip install langchain chromadb langchain-chroma langchain-huggingface langchain-groq





In [2]:
from langchain_groq import ChatGroq
from langchain.vectorstores import Chroma
from langchain.prompts.chat import HumanMessagePromptTemplate
from langchain.chains import RetrievalQA
from langchain.schema import SystemMessage
from langchain_text_splitters import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
import os


In [3]:
os.environ["GROQ_API_KEY"] = ""

llm = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0
)

In [4]:
pdf_file_path = r"C:\Users\sarfarazuddin.s\Downloads\The Data Science Handbook.pdf"
if not os.path.exists(pdf_file_path):
    raise FileNotFoundError(f"The file {pdf_file_path} does not exist.")

# Load and split the PDF document into chunks
document_loader = PyPDFLoader(pdf_file_path)
# documents = document_loader.load_and_split()
documents = document_loader.load()

In [5]:
documents

[Document(metadata={'source': 'C:\\Users\\sarfarazuddin.s\\Downloads\\The Data Science Handbook.pdf', 'page': 0}, page_content='THE\nDATA SCIENCE  \nHANDBOOK\nADVICE A ND I NSIGHTS FROM   \n25 AMAZING DA TA SCIEN TISTS\nDJ Patil, Hilary Mason, Pete Skomoroch, Riley Newman, Jonathan Goldman, Michael Hochster, \n George Roumeliotis, Kevin Novak, Jace Kohlmeier, Chris Moody, Erich Owens, Luis Sanchez, \n Eithon Cadag, Sean Gourley, Clare Corthell, Diane Wu, Joe Blitzstein, Josh Wills, Bradley Voytek, \n Michelangelo D’Agostino, Mike Dewar, Kunal Punera, William Chen, John Foreman, Drew Conway\nBY M AX  S ONGHE N RY  W ANG WI LL IA M  C HENCA R L  SHAN\nFOREWORD BY JAKE K LAMKA\nSold to\nmenogetusername@gmail.com\n'),
 Document(metadata={'source': 'C:\\Users\\sarfarazuddin.s\\Downloads\\The Data Science Handbook.pdf', 'page': 1}, page_content='To our family, friends and mentors. \nYour support and encouragement is the fuel for our fire.\n'),
 Document(metadata={'source': 'C:\\Users\\sarfar

In [6]:
text_splitter = CharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=500
)

text_chunks = text_splitter.split_documents(documents)

In [7]:
text_chunks

[Document(metadata={'source': 'C:\\Users\\sarfarazuddin.s\\Downloads\\The Data Science Handbook.pdf', 'page': 0}, page_content='THE\nDATA SCIENCE  \nHANDBOOK\nADVICE A ND I NSIGHTS FROM   \n25 AMAZING DA TA SCIEN TISTS\nDJ Patil, Hilary Mason, Pete Skomoroch, Riley Newman, Jonathan Goldman, Michael Hochster, \n George Roumeliotis, Kevin Novak, Jace Kohlmeier, Chris Moody, Erich Owens, Luis Sanchez, \n Eithon Cadag, Sean Gourley, Clare Corthell, Diane Wu, Joe Blitzstein, Josh Wills, Bradley Voytek, \n Michelangelo D’Agostino, Mike Dewar, Kunal Punera, William Chen, John Foreman, Drew Conway\nBY M AX  S ONGHE N RY  W ANG WI LL IA M  C HENCA R L  SHAN\nFOREWORD BY JAKE K LAMKA\nSold to\nmenogetusername@gmail.com'),
 Document(metadata={'source': 'C:\\Users\\sarfarazuddin.s\\Downloads\\The Data Science Handbook.pdf', 'page': 1}, page_content='To our family, friends and mentors. \nYour support and encouragement is the fuel for our fire.'),
 Document(metadata={'source': 'C:\\Users\\sarfarazud

In [8]:
len(text_chunks)

285

In [9]:
embeddings = HuggingFaceEmbeddings(model_name="sangmini/msmarco-cotmae-MiniLM-L12_en-ko-ja")

In [10]:
# Initialize Chroma vector database
persist_directory = r"D:\vectorstore"  # Replace with your actual persistence directory path
db = Chroma.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    persist_directory=persist_directory
)

In [11]:
# Set up Chroma retriever
retriever = db.as_retriever()

# Initialize the RetrievalQA chain with your LLM and the Chroma retriever
db_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # You can use 'stuff', 'map_reduce', or 'refine' depending on your needs
    retriever=retriever
)

In [12]:
def retrieve_from_db(query: str) -> str:
    try:
        db_context = db_chain(query)
        db_context = db_context['result'].strip()
        # print(f"Debug: SQL Query Result - {db_context}")
        return db_context
    except Exception as e:
        print(f"Error: {e}")
        return "An error occurred while retrieving data from the database."

def generate(query: str) -> str:
    db_context = retrieve_from_db(query)
    
    system_message = system_message = """You are a professional representative of The Data Science Handbook.
        Answer questions related to data science topics, methodologies, and insights from the book. Provide concise and accurate information based on the content of the book.
        If question is asked which is not related to Data Science dont give answer and reply that you are not allowed to give the answers which are not related
        to Data Science.
        """

    
    human_qry_template = HumanMessagePromptTemplate.from_template(
        """Input:
        {human_input}
        
        Context:
        {db_context}
        
        Output:
        """
    )
    messages = [
        SystemMessage(content=system_message),
        human_qry_template.format(human_input=query, db_context=db_context)
    ]
    try:
        # Get the response from the LLM using the invoke method
        response = llm.invoke(messages).content
        return response
    except Exception as e: 
        print(f"Error: {e}")
        return "An error occurred while generating the response."

# Example of usage
query_input = input("Enter your query: ")

response_data = generate(query_input)
print(f"Response: {response_data}")


#     try:
#         response = llm.invoke(messages).content
#         print(f"LLM Response - {response}")
#         return response
#     except Exception as e: 
#         print(f"Error: {e}")
#         return "An error occurred while generating the response."

# # Example of usage
# query_input = input("Enter your query: ")

# if query_input.lower() in ['hi', 'hello', 'hey']:
#     print("Hello! How can I assist you with information about data science?")
# else:
#     response_data = generate(query_input)
#     print(f"Response: {response_data}")

Enter your query:  What is Data Science


  db_context = db_chain(query)


Response: According to The Data Science Handbook, Data Science is a unique discipline that combines elements of applied mathematics, computer science, business consulting, and new product development. It is considered a permanent feature of the business landscape, requiring a strong foundation in advanced statistics, machine learning, SQL, Hadoop, and a mainstream programming language like Java. Data Science is not just about analyzing data, but also about being able to communicate effectively between business and technical domains, and serving as a trusted advisor to business leaders.


In [13]:
# Example of usage
query_input = input("Enter your query: ")

response_data = generate(query_input)
print(f"Response: {response_data}")


Enter your query:  What is Physics


Response: I am not allowed to give answers which are not related to Data Science.
