In [1]:
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
import sys

In [None]:
def ingest(
    pdf_path: str = r"input_docs/Chip Huyen - Designing Machine Learning Systems_ An Iterative Process for Production-Ready Applications (2022, O'Reilly Media) - libgen.li.pdf",
    persist_directory: str = "./sql_chroma_db3",
    chunk_size: int = 1024,
    chunk_overlap: int = 100,
):
    """Load a PDF, split it into overlapping chunks, embed them and store them in Chroma.

    Args:
        pdf_path: Path of the PDF to index. Defaults to the book this notebook
            was written for.
        persist_directory: Directory passed to Chroma as ``persist_directory``.
            Must match the directory ``rag_chain()`` opens.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        None. Prints a one-line summary of how many chunks were produced.
    """
    # Use load() rather than load_and_split(): the latter first runs the
    # loader's default splitter, so the documents would be split twice and the
    # start_index recorded below would be relative to an intermediate piece
    # instead of the loaded document.
    pages = PyPDFLoader(pdf_path).load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,  # keep each chunk's offset in its metadata
    )
    chunks = text_splitter.split_documents(pages)
    print(f"Split {len(pages)} documents into {len(chunks)} chunks.")

    embedding = FastEmbedEmbeddings()

    # NOTE(review): calling ingest() again with the same persist_directory
    # presumably appends the same chunks a second time -- confirm, and clear
    # the directory before re-indexing if so.
    Chroma.from_documents(
        documents=chunks,
        embedding=embedding,
        persist_directory=persist_directory,
    )

# ingest() relies on: Chroma (vector store), FastEmbedEmbeddings (embeddings), RecursiveCharacterTextSplitter (chunking)

In [25]:
# Build the vector index; rag_chain() later opens it from ./sql_chroma_db3.
ingest()

Split 385 documents into 1075 chunks.


In [None]:
from huggingface_hub import login
import os
from getpass import getpass

# Never store tokens in the notebook itself: read them from the environment
# and fall back to an interactive prompt, so nothing secret is saved with the
# file or its outputs.
access_token_read = os.environ.get("HF_TOKEN") or getpass("Hugging Face read token: ")
# Kept because other cells may reference it. HF_TOKEN_WRITE is a
# notebook-specific variable; without it the read token is reused.
access_token_write = os.environ.get("HF_TOKEN_WRITE") or access_token_read
login(token=access_token_read)

In [10]:

def rag_chain(
    model_name: str = "llama3",
    persist_directory: str = "./sql_chroma_db3",
    k: int = 3,
    score_threshold: float = 0.3,
):
    """Assemble the retrieval-augmented generation chain used by ``ask()``.

    Args:
        model_name: Name of the Ollama model handed to ``ChatOllama``.
        persist_directory: Directory of the Chroma collection to query. Must be
            the directory ``ingest()`` wrote to.
        k: Maximum number of chunks the retriever returns per query.
        score_threshold: Minimum relevance score a chunk needs to be returned.

    Returns:
        The chain built by ``create_retrieval_chain``. ``ask()`` invokes it
        with ``{"input": ...}`` and reads ``"answer"`` and ``"context"`` from
        the result.
    """
    model = ChatOllama(model=model_name)

    # The template exposes two variables: {input} (the question) and {context}
    # (the retrieved chunks, filled in by the stuff-documents chain).
    prompt = PromptTemplate.from_template(
        """
        <s> [Instructions] You are a friendly assistant. Answer the question based only on the following context and try to search the context as much as you can. 
        If you don't know the answer, then reply, No Context available for this question {input}. [/Instructions] </s> 
        [Instructions] Question: {input} 
        Context: {context} 
        Answer: [/Instructions]
        """
    )

    # Queries must be embedded with the same embedding class that ingest() used.
    embedding = FastEmbedEmbeddings()
    vector_store = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding,
    )

    retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={
            "k": k,
            "score_threshold": score_threshold,
        },
    )

    document_chain = create_stuff_documents_chain(model, prompt)
    chain = create_retrieval_chain(retriever, document_chain)

    return chain

In [11]:
def ask(query: str, chain=None):
    """Run one question through the RAG chain and print the answer and its sources.

    Args:
        query: The question, passed to the chain as ``{"input": query}``.
        chain: Optional pre-built chain (for example the value returned by
            ``rag_chain()``). When omitted, a new chain is built for this call.
            Pass one in when asking several questions, so the embedding model
            and vector store are not re-created for every query.

    Returns:
        None. Prints the answer followed by one ``Source:`` line per retrieved
        document.
    """
    if chain is None:
        chain = rag_chain()

    result = chain.invoke({"input": query})

    print(result["answer"])
    for doc in result["context"]:
        # .get() so a document without a "source" entry does not abort the
        # listing with a KeyError.
        print("Source: ", doc.metadata.get("source", "unknown"))

In [16]:
# Example: a narrow, single-chapter request; the answer and its source files are printed below.
ask("Prepare some questions for Chapter 5 Feature Engineering")

Based on the provided context, I'll prepare some questions for Chapter 5 Feature Engineering. Here are a few:

1. What are some key aspects of feature engineering that will be covered in this chapter?
2. How do multiple models sharing a feature affect its computation, and what are some considerations to take into account?
3. What is data leakage, and how can it be detected and avoided in the context of feature engineering?
4. How do feature stores fit into the broader landscape of machine learning applications, and when will they be discussed further?
5. What are the two aspects that will be focused on in this chapter regarding extracting features from raw data for input into an ML model?

Let me know if you'd like me to help with anything else!
Source:  input_docs/Chip Huyen - Designing Machine Learning Systems_ An Iterative Process for Production-Ready Applications (2022, O'Reilly Media) - libgen.li.pdf
Source:  input_docs/Chip Huyen - Designing Machine Learning Systems_ An Iterative

In [17]:
# Example: a whole-book request. rag_chain() retrieves at most 3 chunks (k=3),
# so the answer can only draw on a small slice of the book.
ask("Give me Key concepts for each Chapters")

I'd be happy to help! Since the context provides information about the book's chapters and contents, I can give you a summary of the key concepts for each chapter based on what's available.

Unfortunately, there is no Epilogue chapter mentioned in the provided context. Therefore, I won't include it in my answer.

Here are some key concepts for each chapter:

* Chapters related to Statistical concepts:
	+ Variance
	+ Probability
	+ Normal/Long-tail distribution

Note that these chapters likely cover the basics of statistical concepts as they relate to machine learning.

* Machine Learning (ML) tasks and concepts:
	+ Language modeling
	+ Anomaly detection
	+ Object classification
	+ Machine translation

These chapters will likely delve into the various ML algorithms and techniques used for these tasks, along with their applications and limitations.

Keep in mind that this is just a rough summary based on the provided context. For more detailed information about each chapter's key concept