In [None]:
%cd ..

In [None]:
from haystack.telemetry import tutorial_running

# NOTE(review): presumably registers a telemetry event for Haystack tutorial
# number 27 — confirm against the haystack.telemetry documentation.
tutorial_running(27)

In [None]:
from haystack.document_stores.in_memory import InMemoryDocumentStore

# Shared store: the embedded documents are written to it further down, and the
# embedding retriever is constructed on top of it.
document_store = InMemoryDocumentStore()

In [None]:
from datasets import load_dataset
from haystack import Document

# Load the "train" split of the seven-wonders dataset. In this notebook it is
# only used as a reference for comparing text lengths with the scraped data.
dataset = load_dataset("bilgeyucel/seven-wonders", split="train")

## Compare to whale data

In [None]:
# Peek at the first record to see which fields a dataset row carries.
item = next(iter(dataset))
item

In [None]:
# Character count of the `content` field of every row in the reference dataset.
lengths = [len(row['content']) for row in dataset]

In [None]:
import pandas as pd

# Summary statistics of the reference dataset's content lengths.
lengths_frame = pd.DataFrame({"lengths": lengths})
df = lengths_frame.describe()

In [None]:
from itertools import chain

from src.scrapeWhales import get_data

# Scrape the Whale article.
# NOTE(review): `get_data` appears to return a dict mapping a heading to a
# list of text chunks — confirm in src/scrapeWhales.
data = get_data("https://en.wikipedia.org/wiki/Whale")

# Character count of every scraped chunk, across all headings.
# chain.from_iterable flattens in linear time, whereas sum(lists, []) copies
# the accumulated list on every addition.
myLengths = [len(cont) for cont in chain.from_iterable(data.values())]

In [None]:
# Same summary for the scraped chunks, displayed next to the reference
# dataset's summary so the two length distributions can be compared.
mydf = pd.DataFrame({"myLengths": myLengths}).describe()
pd.concat([df, mydf], axis="columns")

In [None]:
# Spot-check one scraped chunk.
# NOTE(review): assumes the scrape produced a 'Whale' entry holding at least
# two chunks — confirm against the scraper's output.
print(data['Whale'][1])

## Resume

In [None]:
from src.scrapeWhales import get_data
import numpy as np

url = "https://en.wikipedia.org/wiki/Whale"

# NOTE(review): `get_data` appears to return a dict mapping a heading to a
# list of text chunks — confirm in src/scrapeWhales.
sections = get_data(url)

# One Document per scraped chunk, tagged with the source URL and the heading
# it was found under. The scrape result gets its own name instead of rebinding
# `data`, so cells that index `data` as a dict keep working when re-run.
# The nested comprehension keeps the heading order and the chunk order.
docs = [
    Document(content=txt, meta={'url': url, 'title': heading})
    for heading, contents in sections.items()
    for txt in contents
]

In [None]:
# Sanity check: how many documents were built and what the first one looks like.
len(docs), docs[0]

In [None]:
import os

from dotenv import load_dotenv
from haystack.components.embedders import OpenAIDocumentEmbedder
from haystack.utils import Secret

# Load environment variables (including OPENAI_API_KEY) from the local
# `.envargs` file into the process environment.
load_dotenv(".envargs")

# Reference the key by environment-variable name rather than reading it with
# os.getenv and wrapping the raw value: a missing variable then produces an
# explicit "environment variable not set" error instead of failing on None,
# and the raw key is never held in a token secret.
doc_embedder = OpenAIDocumentEmbedder(api_key=Secret.from_env_var("OPENAI_API_KEY"))

In [None]:
from haystack.document_stores.types import DuplicatePolicy

# Embed every document; the embedder returns the embedded copies under the
# "documents" key.
docs_with_embeddings = doc_embedder.run(documents=docs)

# OVERWRITE keeps this cell re-runnable: with the default policy the store
# raises on an id it already holds, which happens on every re-run and whenever
# two scraped chunks produce the same document id.
document_store.write_documents(
    docs_with_embeddings["documents"], policy=DuplicatePolicy.OVERWRITE
)

In [None]:
from haystack.components.embedders import OpenAITextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

# Query-side embedder. The key is referenced by environment-variable name, so
# a missing OPENAI_API_KEY fails with an explicit error instead of a None token.
text_embedder = OpenAITextEmbedder(api_key=Secret.from_env_var("OPENAI_API_KEY"))

# Retriever that searches the store holding the embedded documents.
retriever = InMemoryEmbeddingRetriever(document_store=document_store)

In [None]:
from haystack.components.builders import PromptBuilder

# Prompt template: the content of every entry in `documents` is rendered into
# the context section, followed by the `question` to answer.
template = """
Given the following information, answer the question.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{question}}
Answer:
"""

# Builder that fills the template from the `documents` and `question` inputs.
prompt_builder = PromptBuilder(template=template)

In [None]:
from haystack.components.generators import OpenAIGenerator
# No api_key is passed here, unlike for the embedders.
# NOTE(review): presumably the generator falls back to the OPENAI_API_KEY
# environment variable by default — confirm in the OpenAIGenerator docs.
generator = OpenAIGenerator(model="gpt-4o-mini")

In [None]:
from haystack import Pipeline

# Assemble the RAG pipeline from the components created in the cells above.
basic_rag_pipeline = Pipeline()
# Add components to your pipeline; the names given here are the ones used in
# the connect() calls below and in the inputs passed to run().
basic_rag_pipeline.add_component("text_embedder", text_embedder)
basic_rag_pipeline.add_component("retriever", retriever)
basic_rag_pipeline.add_component("prompt_builder", prompt_builder)
basic_rag_pipeline.add_component("llm", generator)

# Now, connect the components to each other:
# query embedding -> retriever -> prompt builder -> LLM.
basic_rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
basic_rag_pipeline.connect("retriever", "prompt_builder.documents")
basic_rag_pipeline.connect("prompt_builder", "llm")

In [None]:
# The question to answer (typo "larrge" corrected so the embedded query
# matches the intended wording).
question = "Are whale arteries large enough for a baby to crawl through?"

# The same question feeds two pipeline inputs: the text embedder turns it into
# the retrieval query, and the prompt builder inserts it into the prompt.
response = basic_rag_pipeline.run(
    {"text_embedder": {"text": question}, "prompt_builder": {"question": question}}
)

# Show the first reply produced by the "llm" component.
print(response["llm"]["replies"][0])

In [None]:
# Run the text embedder on its own to inspect what it returns for the question.
text_embedder.run(question)

In [None]:
# NOTE(review): looks like a leftover marker cell — confirm whether it is
# still needed.
print("end=")

In [None]:
from haystack.components.embedders import SentenceTransformersTextEmbedder

text_to_embed = "I love pizza!"

# Stand-alone demo of a SentenceTransformers text embedder. It is bound to its
# own name so that `text_embedder` (the OpenAI embedder used by the pipeline
# cells above) is not silently replaced by a different kind of embedder.
print("Creating Embedded")
st_text_embedder = SentenceTransformersTextEmbedder()
print("Warming up embedded")
# warm_up() is called before the first run().
st_text_embedder.warm_up()

print("running embedded")
print(st_text_embedder.run(text_to_embed))

# Example output:
# {'embedding': [-0.07804739475250244, 0.1498992145061493, ...]}

In [None]:
from haystack.components.embedders import OpenAITextEmbedder

text_to_embed = "I love pizza!"

# Stand-alone demo of the OpenAI text embedder. The key is referenced by
# environment-variable name, so a missing OPENAI_API_KEY fails with an
# explicit error instead of a None token.
text_embedder = OpenAITextEmbedder(api_key=Secret.from_env_var("OPENAI_API_KEY"))

print(text_embedder.run(text_to_embed))

# Example output:
# {'embedding': [0.017020374536514282, -0.023255806416273117, ...],
# 'meta': {'model': 'text-embedding-ada-002-v2',
#              'usage': {'prompt_tokens': 4, 'total_tokens': 4}}}