In [None]:
#pip install "unstructured[pdf]"

In [2]:
import argparse
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
import getpass
import os
import model as md
from dotenv import load_dotenv, find_dotenv

In [3]:
# Load variables from a .env file into the process environment; override=True lets
# values from .env replace variables that are already set.
load_dotenv(override=True)
# Folder that load_split_documents() scans for PDF files.
DATA_PATH = "data/text_db/raw"
# Directory passed to Chroma as persist_directory.
CHROMA_PATH = "data/text_db/chroma"
# Placeholder; rebound to the vector store once chroma_read() has been called.
chroma_db = None
# NOTE(review): groq_api_key is not referenced again in this notebook — presumably
# ChatGroq reads GROQ_API_KEY from the environment itself; confirm.
groq_api_key = os.getenv("GROQ_API_KEY")
# Chat model used to answer questions in the generation cell.
model = ChatGroq(model="llama3-8b-8192")

### TODO: This line emits warnings (some related to TensorFlow usage). Check whether they can be avoided.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

### TODO: Some characters are not extracted correctly (e.g. the "ff"/"fi" ligatures come out as "\x00"). Try another PDF reader, such as llamaparse, pypdf2 or pdfplumber.
def load_split_documents(data_path=None, chunk_size=1000, chunk_overlap=500):
    """Load the PDF files of a folder and split them into overlapping chunks.

    Loading and splitting are done in the same function on purpose.

    Args:
        data_path: Folder searched with the glob ``*.pdf``. ``None`` (default)
            means the notebook-level ``DATA_PATH``, resolved at call time.
        chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` handed to the splitter.

    Returns:
        The list of chunk documents produced by the splitter. The splitter is
        created with ``add_start_index=True``, so each chunk's metadata
        carries a ``start_index`` entry.
    """
    if data_path is None:
        data_path = DATA_PATH
    # NOTE(review): ``autodetect_encoding`` is a TextLoader option; confirm that the
    # loader DirectoryLoader uses for PDFs actually honours it.
    text_loader_kwargs = {'autodetect_encoding': True}
    loader = DirectoryLoader(data_path, glob="*.pdf", loader_kwargs=text_loader_kwargs)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,
    )
    return loader.load_and_split(splitter)

  from .autonotebook import tqdm as notebook_tqdm





In [4]:
# Load the PDFs under DATA_PATH and split them into chunks.
chunks = load_split_documents()
# Spot-check the text of one chunk. Index 100 is arbitrary and raises IndexError
# when fewer than 101 chunks were produced.
chunks[100].page_content

'Iteration – if we have 10,000 images as data and a batch size of 200. then an epoch should run 50 iterations (10,000 divided\n\nby 50).\n\nQ80. What Are the Di\x00erent Layers on CNN?\n\nThere are four layers in CNN:\n\n1. Convolutional Layer – the layer that performs a convolutional operation, creating several smaller picture windows to go\n\nover the data.\n\n2. ReLU Layer – it brings non-linearity to the network and converts all the negative pixels to zero. The output is a recti\x00ed\n\nfeature map.\n\n3. Pooling Layer – pooling is a down-sampling operation that reduces the dimensionality of the feature map.\n\n4. Fully Connected Layer – this layer recognizes and classi\x00es the objects in the image.\n\nQ81. What Is Pooling on CNN, and How Does It Work?\n\nPooling is used to reduce the spatial dimensions of a CNN. It performs down-sampling operations to reduce the dimensionality\n\nand creates a pooled feature map by sliding a \x00lter matrix over the input matrix.\n\nQ82. What a

In [5]:
def chroma_create(chunks, embeddings):
    """Build a Chroma vector store from ``chunks``.

    Args:
        chunks: Documents to index.
        embeddings: Embedding object handed to ``Chroma.from_documents``.

    Returns:
        The Chroma store, created with ``CHROMA_PATH`` as its persist directory.
    """
    print("Creating Chroma database...")
    vector_store = Chroma.from_documents(
        chunks,
        embeddings,
        persist_directory=CHROMA_PATH,
    )
    saved_message = f"Saved {len(chunks)} to {CHROMA_PATH}"
    print(saved_message)
    return vector_store

In [12]:
# Scratch check: isdir() is False because this path names the SQLite file, not a
# directory (the exists() check on the same location in the next cell is True).
# The backslash-separated literal only resolves on Windows.
os.path.isdir("data\\text_db\\chroma\\chroma.sqlite3")

False

In [13]:
# Check that the chroma.sqlite3 file is present under CHROMA_PATH; chroma_read()
# relies on the same test to decide between loading and creating the store.
os.path.exists(CHROMA_PATH+'/chroma.sqlite3')

True

In [14]:
def chroma_read(chunks, embeddings):
    """Return the Chroma store kept under ``CHROMA_PATH``, creating it if needed.

    The store counts as present when ``chroma.sqlite3`` exists inside
    ``CHROMA_PATH``. Otherwise a new store is built from ``chunks``.

    Args:
        chunks: Documents to index; only used when no store is found on disk.
        embeddings: Embedding object used both to open and to build the store.

    Returns:
        The opened or newly created Chroma store.
    """
    sqlite_file = CHROMA_PATH+'/chroma.sqlite3'

    # Nothing on disk yet: build the store from the supplied chunks.
    if not os.path.exists(sqlite_file):
        print("Local Chroma DB not found. Creating Chroma database...")
        vector_store = Chroma.from_documents(chunks, embeddings, persist_directory=CHROMA_PATH)
        print(f"Saved {len(chunks)} to {CHROMA_PATH}")
        return vector_store

    print("Local Chroma DB found. Reading Chroma database...")
    vector_store = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings)
    print("Loaded Chroma DB from disk.")
    return vector_store

In [15]:
# Open the vector store (or build it from `chunks` when none is found on disk)
# and keep it for the retrieval cell below.
chroma_db = chroma_read(chunks, embeddings)

Local Chroma DB found. Reading Chroma database...
Loaded Chroma DB from disk.


In [None]:
# Question to answer. The literal used to contain a garbled glyph where the "ff"
# of "differences" should be (a PDF copy/paste artefact); it is spelled out here.
query_text = "What is Data Science? List the differences between supervised and unsupervised learning." ### TODO: Change when turning into a function.
# Each result is a (Document, relevance score) pair; up to 3 are requested.
results = chroma_db.similarity_search_with_relevance_scores(query_text, k=3)
# results[0] is taken to be the best match; 0.5 is the minimum score accepted.
if not results or results[0][1] < 0.5: ### Change for more specific inputs
    print(f"Unable to find matching results for query: {query_text}")
    # Do not pass rejected hits on as context; the generation cell then gets an
    # empty context instead of unrelated text.
    context_text = ""
else:
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
print(query_text)
print("="*50)
print(context_text)

In [8]:
# Show the raw (Document, relevance score) pairs returned by the search.
results

[(Document(metadata={'source': 'data\\text_db\\raw\\100_ds_interview_questions.pdf', 'start_index': 1060}, page_content='Deep Learning Interview Questions\n\nBefore moving ahead, you may go through the recording of Data Science Interview Questions where our instructor has shared\n\nhis experience and expertise that will help you to crack any Data Science.\n\nData Science Interview Questions | Edureka\n\nData Science Interview Questions | Data Science Tutorial | Data Science I Data Science Interview Questions | Data Science Tutorial | Data Science I\n\nBASIC DATA SCIENCE INTERVIEW QUESTIONS\n\nQ1. What is Data Science? List the di\x00erences between supervised and unsupervised learning.\n\nData Science is a blend of various tools, algorithms, and machine learning principles with the goal to discover hidden patterns\n\nfrom the raw data. How is this di\x00erent from what statisticians have been doing for years?\n\nThe answer lies in the di\x00erence between explaining and predicting.\n\n

In [19]:
# Prompt that restricts the model to the retrieved context.
prompt_template = ChatPromptTemplate.from_template("""
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
""")
# format_prompt() keeps the prompt as chat messages. format() would flatten it to
# a single string prefixed with "Human: ", and that prefix would then be sent to
# the model as part of the message text.
prompt = prompt_template.format_prompt(context=context_text, question=query_text)
# Uses the `model` client created in the configuration cell; no second ChatGroq
# instance is needed here.
response = model.invoke(prompt)
print("Question: ",query_text)
print("-"*10)
print(response.content)

Question:  What is Data Science? List the di􀃠erences between supervised and unsupervised learning.
----------
Data Science is a blend of various tools, algorithms, and machine learning principles with the goal to discover hidden patterns from the raw data. The differences between supervised and unsupervised learning are as follows:

* Supervised Learning:
	+ Input data is labelled.
	+ Uses a training data set.
	+ Used for prediction.
	+ Enables classification and regression.
* Unsupervised Learning:
	+ Input data is unlabelled.
	+ Uses the input data set.
	+ Used for analysis.
	+ Enables classification, density estimation, and dimension reduction.


# Pruebas desde model.py

In [1]:
import model as md

In [2]:
# Same step through model.py: its chroma_read() is called without arguments here,
# unlike the notebook-level chroma_read(chunks, embeddings).
# NOTE(review): presumably model.py sets up its own embeddings (the call prints
# "Setting up embeddings...") — confirm in model.py.
chroma_db = md.chroma_read()

Setting up embeddings...


  from .autonotebook import tqdm as notebook_tqdm



Local Chroma DB found. Reading Chroma database...
Loaded Chroma DB from disk.
