In [1]:
import os
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.documents import Document

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# --- Configuration: everything a reader may need to tune lives here ---------
# Directory of the persisted Chroma vector database. Set the CHROMA_DIR
# environment variable to point at your local store; when it is unset the
# placeholder below is used and must be edited by hand before running.
CHROMA_DIR = os.environ.get("CHROMA_DIR", "*Modify to Local VectorDB Path*")
# Name of the collection to open inside the Chroma store.
COLLECTION_NAME = "RAG_DB"
# Model name handed to HuggingFaceEmbeddings.
# NOTE(review): presumably must match the model the collection was built with — confirm.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
# Model name handed to the Ollama LLM wrapper.
OLLAMA_MODEL_NAME = "llama3"

In [3]:
# Embedding function handed to the vector store.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Open the named collection from the Chroma store persisted under CHROMA_DIR.
vectorstore = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
    persist_directory=CHROMA_DIR,
)

# The number of documents to return must be passed through `search_kwargs`.
# A bare `k=3` keyword is not a retriever field and is silently dropped, which
# leaves the vector store's default result count in effect instead of 3.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# LLM accessed through the Ollama wrapper.
# NOTE(review): the recorded cell output shows warnings for the
# HuggingFaceEmbeddings / Chroma / Ollama lines — likely deprecation notices;
# confirm and consider migrating to the packages the warnings recommend.
llm = Ollama(model=OLLAMA_MODEL_NAME)

  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
  vectorstore = Chroma(
  llm = Ollama(model=OLLAMA_MODEL_NAME)


In [4]:
def format_docs(docs: list[Document]) -> str:
    """Merge the text of the given documents into one context string.

    Args:
        docs: Documents whose ``page_content`` values are concatenated in the
            order given.

    Returns:
        The page contents joined by a ``---`` line with a blank line on each
        side; an empty string when ``docs`` is empty.
    """
    separator = "\n\n---\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)

# Prompt for the RAG chain. The template casts the model as a data-science
# tutor, tells it to answer only from the supplied context, and gives a fixed
# refusal sentence to use when the context does not fully support an answer.
# It exposes two placeholders that the chain must fill: {context} and
# {question}.
prompt = ChatPromptTemplate.from_template("""
Role:
You are an expert data-science tutor specializing in:
Statistics
Algorithms
Python for data science
Data visualization
You have access only to the provided resources, which may include:
Textbooks on statistics, algorithms, Python, and data visualization
Articles on becoming a data scientist and textbooks on how to become a data scientist.
Instructions:
Answer only using the information found in the provided context.
If the user asks a question that is not fully supported by the context, respond with:
“I’m sorry, but I cannot answer that based on the provided context.”
When the user asks about technical topics (statistics, algorithms, Python, data visualization, workflows, etc.), search the textbooks provided in the context for the relevant material.
Do not use outside knowledge, assumptions, or extrapolation.
If the context contains partial information, answer using only what is given and clearly state any limitations.
Provide clear, accurate, step-by-step explanations strictly grounded in the supplied resources.
Your goal:
Teach the user the necessary skills for data science — including statistics, algorithms, Python, and data visualization — using only the information contained in the provided context.
Context: {context}
Question: {question}
""")

# Inputs for the prompt, both derived from the single value the chain is
# invoked with: "context" is the retriever's documents rendered by
# format_docs, and "question" is the invoked value passed through unchanged.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Full pipeline: build prompt inputs -> fill the prompt -> call the LLM ->
# parse the model output into a plain string.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

In [None]:
# Question to send through the RAG chain.
# NOTE(review): "Type Query" looks like a placeholder — replace it with a real
# question before running the cells below.
query = "Type Query"

In [9]:
# Run the chain on the query; the chain ends in StrOutputParser, so the result
# is the parsed model output.
final_answer = rag_chain.invoke(query)

In [10]:
# print() rather than a bare expression so the answer's newlines render as
# line breaks instead of appearing as escaped "\n" in a string repr.
print(final_answer)

I'd be happy to help!

Based on the provided context, here are some potential interview question topics related to probability and statistics:

Probability:

* Conditional probability questions (e.g., "What is the probability that event A occurs given that event B has occurred?")
* Basic applications involving PDFs of various probability distributions (e.g., "What is the probability that a random variable X follows a normal distribution with mean μ and standard deviation σ?")
* Expected value calculations
* Betting decisions

Statistics:

* Central Limit Theorem-related questions (e.g., "Under what conditions does the CLT hold?")
* Law of Large Numbers-related questions (e.g., "What is the probability that the average of i.i.d. random variables converges to the population mean as the sample size increases?")
* Hypothesis-testing-related questions (e.g., "What is the difference between a Type I and Type II error in hypothesis testing?")
* Confidence interval calculations
* MLE/MAP-relat