In [1]:
# Magic commands
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

In [1]:
import os
import nest_asyncio
import gradio as gr
import pandas as pd
import frontmatter
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings, SimpleDirectoryReader, Document, VectorStoreIndex, get_response_synthesizer, PromptTemplate
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter, MarkdownNodeParser
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from IPython.display import display, Markdown

  from .autonotebook import tqdm as notebook_tqdm


Load environment variables

In [2]:
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
GEMINI_API_KEY = os.environ["GEMINI_API_KEY"]


Initialize Pinecone and OpenAI clients

In [3]:
pinecone_client = Pinecone(api_key=PINECONE_API_KEY)
client = OpenAI(
    # Defaults to os.environ.get("OPENAI_API_KEY")
model="gpt-4o-mini", temperature=0)

Set up global settings for LLM and embedding models

In [4]:
embedding = OpenAIEmbedding(model="text-embedding-ada-002")
Settings.llm = client
Settings.embed_model = embedding
Settings.chunk_size_limit = 1536 

Enable nested asyncio for event loop handling

In [5]:
nest_asyncio.apply()

In [6]:
documents = SimpleDirectoryReader('../data/raw', recursive=True).load_data()

In [7]:
documents[0]

Document(id_='960a5f18-0f53-4bb8-a702-e3a0d17e5536', embedding=None, metadata={'file_path': 'c:\\Users\\timytapilla\\Documents\\BYU-Idaho\\Fall_semester_2024\\Applied Programming CSE 310\\Projects\\FamilySearchAI\\notebooks\\..\\data\\raw\\attaching-a-source-to.md', 'file_name': 'attaching-a-source-to.md', 'file_size': 1423, 'creation_date': '2024-09-24', 'last_modified_date': '2024-09-24'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='---\r\ntitle: "\r\n        Using Source Linker to attach a  record to multiple ancestors\r\n    "\r\ndate: "\r\n                June 06, 2024\r\n            "\r\nurl: "https://www.familysearch.org/en/help/helpcenter/article/attaching-a-source-to-multiple-people-in-family-tree"\r\n---\r\n\r', mimetype='text/plain', start

Function to load documents and extract metadata

In [8]:
df = pd.DataFrame([d.metadata for d in documents])
df.head(10)

Unnamed: 0,file_path,file_name,file_size,creation_date,last_modified_date
0,c:\Users\timytapilla\Documents\BYU-Idaho\Fall_...,attaching-a-source-to.md,1423,2024-09-24,2024-09-24
1,c:\Users\timytapilla\Documents\BYU-Idaho\Fall_...,attaching-a-source-to.md,1423,2024-09-24,2024-09-24
2,c:\Users\timytapilla\Documents\BYU-Idaho\Fall_...,creating-new-people-in.md,1616,2024-09-24,2024-09-24
3,c:\Users\timytapilla\Documents\BYU-Idaho\Fall_...,creating-new-people-in.md,1616,2024-09-24,2024-09-24
4,c:\Users\timytapilla\Documents\BYU-Idaho\Fall_...,detaching-sources-that-should.md,1556,2024-09-24,2024-09-24
5,c:\Users\timytapilla\Documents\BYU-Idaho\Fall_...,detaching-sources-that-should.md,1556,2024-09-24,2024-09-24
6,c:\Users\timytapilla\Documents\BYU-Idaho\Fall_...,detaching-sources-that-should.md,1556,2024-09-24,2024-09-24
7,c:\Users\timytapilla\Documents\BYU-Idaho\Fall_...,detaching-sources-that-should.md,1556,2024-09-24,2024-09-24
8,c:\Users\timytapilla\Documents\BYU-Idaho\Fall_...,determining-if-a-source.md,1891,2024-09-24,2024-09-24
9,c:\Users\timytapilla\Documents\BYU-Idaho\Fall_...,determining-if-a-source.md,1891,2024-09-24,2024-09-24


In [6]:
def load_documents_with_metadata(directory):
    """Load documents from the specified directory and extract metadata."""
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith('.md'):
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                post = frontmatter.load(file)
                metadata = {
                    'title': post.get('title', '').strip(),
                    'date': post.get('date', '').strip(),
                    'url': post.get('url', '').strip(),
                    'filename': filename  # Save the filename as part of metadata
                }
                content = post.content
                # Create a Document instance instead of a dict
                documents.append(Document(text=content, metadata=metadata))
    return documents

Load documents with metadata

In [7]:
docs = load_documents_with_metadata('../data/raw')
df = pd.DataFrame([d.metadata for d in docs])
df.head(5)

Unnamed: 0,title,date,url,filename
0,Using Source Linker to attach a record to mul...,"June 06, 2024",https://www.familysearch.org/en/help/helpcente...,attaching-a-source-to.md
1,Adding people to the Family Tree through Sourc...,"June 06, 2024",https://www.familysearch.org/en/help/helpcente...,creating-new-people-in.md
2,Using Source Linker to detach sources,"June 06, 2024",https://www.familysearch.org/en/help/helpcente...,detaching-sources-that-should.md
3,Determining in Source Linker if a source match...,"June 06, 2024",https://www.familysearch.org/en/help/helpcente...,determining-if-a-source.md
4,Editing your family tree from Source Linker,"June 06, 2024",https://www.familysearch.org/en/help/helpcente...,editing-information-in-your.md


In [8]:
# Print metadata for verification
for doc in docs:
    print("Metadata:", doc.metadata)

Metadata: {'title': 'Using Source Linker to attach a  record to multiple ancestors', 'date': 'June 06, 2024', 'url': 'https://www.familysearch.org/en/help/helpcenter/article/attaching-a-source-to-multiple-people-in-family-tree', 'filename': 'attaching-a-source-to.md'}
Metadata: {'title': 'Adding people to the Family Tree through Source Linker', 'date': 'June 06, 2024', 'url': 'https://www.familysearch.org/en/help/helpcenter/article/creating-new-people-in-family-tree', 'filename': 'creating-new-people-in.md'}
Metadata: {'title': 'Using Source Linker to detach sources', 'date': 'June 06, 2024', 'url': 'https://www.familysearch.org/en/help/helpcenter/article/detaching-sources-that-should-not-be-attached', 'filename': 'detaching-sources-that-should.md'}
Metadata: {'title': 'Determining in Source Linker if a source matches your ancestor', 'date': 'June 06, 2024', 'url': 'https://www.familysearch.org/en/help/helpcenter/article/determining-if-a-source-matches-your-ancestor', 'filename': 'dete

Create the index if it doesn't exist

In [27]:
# List existing indices
existing_indices = pinecone_client.list_indexes()
print("Existing indices:", existing_indices)

index_name = "chatbot-index"
if index_name not in existing_indices:
    pinecone_client.create_index(index_name, dimension=1536, spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) ) 

Existing indices: [{
    "name": "hints",
    "dimension": 768,
    "metric": "cosine",
    "host": "hints-xnv36h9.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "deletion_protection": "disabled"
}]


Initialize Pinecone index

In [9]:
pinecone_index = pinecone_client.Index("chatbot-index")
vector_store = PineconeVectorStore(pinecone_index )

Create ingestion pipeline

In [29]:
pipeline = IngestionPipeline(transformations=[SentenceSplitter(chunk_size=1536, chunk_overlap=20), embedding], vector_store=vector_store)


Run pipeline to populate the vector store

In [38]:
pipeline.run(documents=docs)

Upserted vectors: 100%|██████████| 34/34 [00:01<00:00, 21.92it/s]


[TextNode(id_='6f1137de-61e3-4733-a217-56a83297f5f6', embedding=[0.004570330493152142, 0.009227207861840725, -0.026683008298277855, -0.03003835678100586, -0.008208620361983776, 0.02701587975025177, -0.015072437934577465, 0.0030074971728026867, 0.00021241385547909886, -0.0205049067735672, 0.008128730580210686, 0.018947066739201546, -0.01224968396127224, 0.017136242240667343, -0.010245795361697674, 0.002468244871124625, 0.016497129574418068, -0.024179810658097267, 0.009959526360034943, -0.02338091842830181, -0.0055756033398211, -0.006803900934755802, -0.00843497272580862, -0.008421657606959343, -0.014499898068606853, 0.032221995294094086, 0.0274286400526762, -0.029612280428409576, -0.0024099922738969326, 0.0020804491359740496, -0.0002952157228719443, 0.003080728929489851, 0.012702389620244503, -0.045643389225006104, -0.03579038009047508, -0.0025631133466959, -0.005332607310265303, -0.005196129437536001, 0.015671607106924057, 0.004380593542009592, 0.01109794806689024, -0.01216313801705837

In [34]:
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
retriever = VectorIndexRetriever(index=index, similarity_top_k=5)

Update the prompt template to include metadata at the end

In [83]:
prompt_template = (
    "You are a friendly chatbot specialized in helping beginners use FamilySearch and its tools. 😊 "
    "This includes record hints, source attachments, and other related topics. Provide clear and concise answers, "
    "and try to make the conversation enjoyable! 😄\n\n"
    "Context:\n"
    "#####################################\n"
    "{context_str}\n"
    "Answer the user's question: {query_str}\n\n"
    "If the question is related to FamilySearch or its tools (such as record hints, source attachments, or genealogical research), "
    "provide a detailed answer along with a summary. Also, include the following source metadata as 'Source':\n"
    "- **Title**: {title}\n"
    "- **Publish Date**: {date}\n"
    "- **URL**: {url}\n\n"
    "However, if the question is unrelated to FamilySearch, provide a direct and concise answer without any summary or metadata."
)
# Create the prompt template and response synthesizer
qa_template = PromptTemplate(template=prompt_template)
chain_type_kwargs = {"prompt": qa_template}
response_synthesizer = get_response_synthesizer(
    llm=client, text_qa_template=qa_template, response_mode="compact"
)
query_engine = RetrieverQueryEngine(retriever=retriever, response_synthesizer=response_synthesizer)

Testing the responses

In [80]:
response = query_engine.query("What are record hints in Family Tree?")
display(Markdown(str(response)))

Record hints in Family Tree are historical records that FamilySearch automatically finds for you about your ancestors. The hinting system continuously searches for new records as FamilySearch adds them. These hints can provide valuable information about your ancestors, such as their occupations, physical descriptions, and details about missing family members.

To make the most of record hints, you should:

1. Verify that the record hint pertains to your ancestor.
2. Transfer any desired information from the record hint to Family Tree (note that you cannot replace existing information directly on the website; you must do this manually).
3. Attach the record hint to your ancestor in Family Tree, which saves it as a source.

By utilizing record hints, you can enhance your family history research and build a more comprehensive family tree.

**Source**:
- **Title**: What are record hints in Family Tree?
- **Publish Date**: April 24, 2024
- **URL**: [FamilySearch Help Center](https://www.familysearch.org/en/help/helpcenter/article/what-are-record-hints-in-family-tree)

In [79]:
response = query_engine.query("what is your role?")
display(Markdown(str(response)))

My role is to assist beginners in using FamilySearch and its tools, such as record hints, source attachments, and other related topics. I provide clear and concise answers to help users navigate and utilize FamilySearch effectively.

**Source**:
- **Title**: What is my source box?
- **Publish Date**: May 16, 2023
- **URL**: [What is my source box?](https://www.familysearch.org/en/help/helpcenter/article/what-is-a-source-box)

Testing in Gradio

In [84]:
# Define a function to handle chatbot responses
def respond(message, history):
    try:
        # Query the engine and extract the response
        response = query_engine.query(message)
        information = (
            response.response
        )  # Ensure to extract the textual content from the response object
    except Exception as e:
        # Handle any errors that occur during the response generation
        information = f"Error processing the request: {str(e)}"

    # Append the user's message and the chatbot's response to the chat history
    history.append((message, information))
    return "", history


# Initial welcome message to introduce the chatbot's function
intro_message = "👋 Hello! I'm here to help you with any questions about using FamilySearch! 🌟 Feel free to ask me anything about navigating the tools like record hints, source attachments, and genealogical research."

# Configure the Gradio interface
with gr.Blocks() as demo:
    # Initialize the chatbot with an introductory message
    chatbot = gr.Chatbot(value=[["", intro_message]])
    msg = gr.Textbox(label="Pregunta")
    btn = gr.Button("Enviar")
    clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")

    # Link the 'send' button to the respond function
    btn.click(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])

    # Allow users to press Enter to submit their message
    msg.submit(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7868
Running on public URL: https://0f86d35e6432d72e6f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
