In [1]:
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from dotenv import load_dotenv
import os
import logging

CHROMA_PATH = "./data/text_db/chroma"
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
us_quest = "What is an example of a dataset with a non-Gaussian distribution?"

logging.basicConfig(level=logging.DEBUG)  # Nivel global
logger = logging.getLogger("chromadb")    # Logger específico para Chroma
logger.setLevel(logging.DEBUG)
load_dotenv(override=True)
groq_api_key = os.getenv("GROQ_API_KEY")
model = ChatGroq(model="llama3-8b-8192")



# def question_explanation(embeddings, current_question):
#     print("Loading documents...")
#     vector_store = Chroma(collection_name="doc_library", embedding_function=embeddings, persist_directory=CHROMA_PATH)
#     print("Searching for similar questions...")
#     results = vector_store.similarity_search(current_question, k=3)
#     print("Creating prompt and invoking model...")
#     prompt_template = ChatPromptTemplate.from_template("""
#     Answer the question based only on the following context:

#     {context}

#     ---

#     Answer the question based on the above context: {question}
#     """)
#     prompt = prompt_template.format(context=results, question=current_question)
#     response = model.invoke(prompt)
#     return response, results

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
expl, res = question_explanation(embeddings,us_quest)
print(expl)

Loading documents...
Searching for similar questions...
Creating prompt and invoking model...
content='According to the provided context, the example of a dataset with a non-Gaussian distribution is the "distribution of the household income in the USA".' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 30, 'prompt_tokens': 407, 'total_tokens': 437, 'completion_time': 0.025, 'prompt_time': 0.053937766, 'queue_time': 0.017868411, 'total_time': 0.078937766}, 'model_name': 'llama3-8b-8192', 'system_fingerprint': 'fp_a97cfe35ae', 'finish_reason': 'stop', 'logprobs': None} id='run-e733f533-caa7-4eeb-9dce-ab9970257dce-0' usage_metadata={'input_tokens': 407, 'output_tokens': 30, 'total_tokens': 437}


In [2]:
import question as qt
expl2, res = qt.question_explanation(embeddings,us_quest)
print(expl2)

Loading documents...
Searching for similar questions...
Creating prompt and invoking model...
According to the provided context, an example of a dataset with a non-Gaussian distribution is the distribution of household income in the USA.


: 

# 23/01

In [2]:
from langchain_cohere import CohereEmbeddings

embeddings = CohereEmbeddings(model="embed-english-light-v3.0")
test_text = "This is a test sentence."
test_embedding = embeddings.embed_query(test_text)
print(test_embedding)

[-0.06976318, 0.04244995, 0.019241333, -0.012069702, 0.033081055, -0.021881104, 0.03274536, -0.07910156, 0.012863159, -0.029876709, 0.068847656, 0.014411926, -0.021392822, -0.0026931763, -0.024261475, 0.017150879, 0.036590576, -0.026947021, -0.04498291, -0.038848877, -0.00021123886, -0.003944397, -0.052459717, 0.05831909, -0.03994751, 0.04598999, -0.02305603, 0.029174805, -0.0048179626, 0.026397705, 0.033355713, -0.00019216537, 0.013908386, 0.011077881, 0.016662598, 0.029037476, -0.09295654, -0.07788086, 0.008094788, -0.03274536, 0.0019626617, 0.01374054, 0.057739258, -0.0049324036, -0.015457153, 0.09741211, 0.05807495, -0.012771606, 0.049224854, -0.048980713, -0.07775879, -0.14831543, -0.021453857, -0.068725586, 0.0042495728, 0.060150146, -0.08972168, 0.031677246, -0.024765015, -0.06414795, -0.10491943, 0.05230713, -0.017532349, 0.1138916, -0.020858765, -0.021224976, -0.012741089, -0.04815674, -0.046142578, 0.00034475327, -0.015777588, -0.08886719, 0.058898926, 0.113464355, 0.03698730

### Test with Pinecone DB

In [1]:
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from langchain_cohere import CohereEmbeddings
import time
import os
load_dotenv(override=True)
pinecone_api_key = os.getenv("PINECONE_API_KEY")
cohere_api_key = os.getenv("COHERE_API_KEY")
embeddings = CohereEmbeddings(model="embed-english-light-v3.0") 

pc = Pinecone(api_key=pinecone_api_key)
index_name = "quickstart"

pc.create_index(
    name=index_name,
    dimension=384, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

  from tqdm.autonotebook import tqdm


In [4]:
from langchain_pinecone import PineconeVectorStore
index = pc.Index(index_name)
vector_store = PineconeVectorStore(index=index,embedding=embeddings)


In [5]:
import model as md
DATA_PATH = "data/text_db/raw"
chunks = md.load_split_documents()
vector_store.add_documents(documents=chunks)

['87617321-e9de-46b9-9392-cf6647e33e20',
 '9362485f-50a8-4089-83d8-fdd859b864f4',
 '6d2e2257-161d-44bb-8cdf-e5b8d99a1bd4',
 '0a88e9ba-72ff-4106-a7ed-2632b7af27a1',
 '80b48b22-0e0e-47e8-9f76-34783b51a40b',
 '2c88ecc9-da6d-4649-9755-dba5b6929cf4',
 '71b8c92d-93db-4dc4-8d3a-b05c06e7258f',
 '0fd29a44-9425-4c68-bf23-86c5055b184a',
 '42847fea-1ee3-4fe3-a442-a59899adaf98',
 'e4cac99a-bea2-4db9-af3f-6a604ba5f9bd',
 '26931a4a-6eb3-42f2-9ba3-34e5bfa8af07',
 'cbfa05ed-d925-4568-a98b-f06e56b8c185',
 '641ff9bc-b6bf-4367-8980-05e5e226b7d8',
 'fc5d050a-4193-478b-8c6d-9779b3d6df64',
 'bac744d1-a032-41b8-860e-561907be8fa7',
 'ee0629b1-6fd5-420b-bbff-5bf498504374',
 'e54c7c17-da5e-4931-ba8c-91d27f8c2397',
 '2d4e76ec-c5ae-4147-8dd8-b9656bf223a5',
 '24d40324-d20a-4642-9ca3-ab140db4a90c',
 'f166fd3a-c81e-4cd4-ba1c-7bf1ce1165a4',
 '3f7c1317-a99f-47e9-9498-fba9b64082aa',
 '08fdc598-3013-4895-a63a-42d9ad5ee8c8',
 'acec016a-796c-4524-9a30-8a75d40b6023',
 '9715e4c4-7564-4e3d-8dd2-aff144b02fc2',
 '8c6bc4a7-61e3-

In [6]:
results = vector_store.similarity_search(query="What is Data Science? List the di􀃠erences between supervised and unsupervised learning.",k=3)

In [7]:
results

[Document(id='29b9c561-ab12-458b-82ca-edd70dba9bdf', metadata={'source': 'data\\text_db\\raw\\100_ds_interview_questions.pdf'}, page_content='12/35\n\n20/4/2020\n\n100+ Data Science Interview Questions and Answers For 2020 | Edureka\n\nQ41. What is Supervised Learning?\n\nSupervised learning is the machine learning task of inferring a function from labeled training data. The training data consist of a\n\nset of training examples.'),
 Document(id='71b8c92d-93db-4dc4-8d3a-b05c06e7258f', metadata={'source': 'data\\text_db\\raw\\100_ds_interview_questions.pdf'}, page_content='Data Science is a blend of various tools, algorithms, and machine learning principles with the goal to discover hidden patterns\n\nfrom the raw data. How is this di\x00erent from what statisticians have been doing for years?\n\nThe answer lies in the di\x00erence between explaining and predicting.'),
 Document(id='2a8ef674-1a95-4813-9a1e-694def96a310', metadata={'source': 'data\\text_db\\raw\\ds_interview_questions.pd

### Test with change already made on md

In [1]:
import model as md
from dotenv import load_dotenv
import os
from langchain_cohere import CohereEmbeddings


load_dotenv(override=True)
cohere_api_key = os.getenv("COHERE_API_KEY")
embeddings = CohereEmbeddings(model="embed-english-light-v3.0")
us_quest = "What is an example of a dataset with a non-Gaussian distribution?"
expl, res = md.question_explanation(embeddings,us_quest)
print(expl)

  from tqdm.autonotebook import tqdm


Loading documents...
Searching for similar questions...
Creating prompt and invoking model...
[32;1m[1;3m[llm/start][0m [1m[llm:ChatGroq] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: Human: \n    Answer the question based only on the following context:\n\n    [Document(id='b1c218ed-bf31-4092-a7b2-33f94833be08', metadata={'source': 'data\\\\text_db\\\\raw\\\\ds_interview_questions.pdf'}, page_content='tion are located within 2 standard deviations from\\n\\nthe mean, and 99.7% of the data points are located\\n\\nwithin 3 standard deviations from the mean.\\n\\n9. What is an example of a non-Gaussian distribution? First, it may make sense to research what is a\\n\\ndataset with a'), Document(id='21aec787-de65-4a91-94bd-c746bbddb3f7', metadata={'source': 'data\\\\text_db\\\\raw\\\\ds_interview_questions.pdf'}, page_content='dataset with a\\n\\nNow, you may be also expected to give an exam-\\n\\nGaussian distribution. In fact, it is also known as ‘Nor-\\n\\nple.\\n\\nma