In [13]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import TokenTextSplitter
import ollama
from bs4 import BeautifulSoup
import requests
from langchain.docstore.document import Document
from urllib.parse import urlparse, unquote
from langchain.prompts import PromptTemplate

In [14]:
# Model name handed to OllamaEmbeddings when the FAISS vector store is built
Embedding_Model = "nomic-embed-text"
# Ollama LLM wrapper configured for "llama3.1"; chat() calls Model.invoke() on it
Model = Ollama(model="llama3.1")

### Getting Data from Wikipedia

In [3]:
# Wikipedia article whose text is fetched with getData()
page_url = "https://en.wikipedia.org/wiki/Swallowtail_butterfly"

In [4]:
def getData(url, timeout=10):
    """Fetch the plain-text extract of an English Wikipedia article.

    Parameters
    ----------
    url : str
        Article URL such as
        ``https://en.wikipedia.org/wiki/Swallowtail_butterfly``. The last
        non-empty path segment (percent-decoded) is used as the page title.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up. Defaults to 10
        so a stalled connection cannot hang the notebook cell forever.

    Returns
    -------
    str or None
        The article text, or ``None`` when the response contains no extract
        (for example a missing page or an API error payload).

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    requests.RequestException
        For connection problems, including a timeout.
    """
    # Getting the title of the wiki page from the url.
    # rstrip('/') keeps a trailing slash from producing an empty title.
    path = urlparse(url).path
    title = unquote(path.rstrip('/').split('/')[-1])

    # Calling the wikipedia api
    endpoint = "https://en.wikipedia.org/w/api.php"
    params = {
        "action" : "query",
        "format" : "json",
        "titles" : title,
        "prop" : "extracts",
        "explaintext" : True
    }

    # Making the API call. On machines with SSL-certificate problems (e.g. a
    # corporate proxy) this request fails; passing verify=False would work
    # around that but disables certificate checking, so it is deliberately
    # not used here.
    response = requests.get(endpoint, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    # The result is keyed by page id, so take the first (only) entry. Every
    # lookup falls back to an empty value so that an error payload or an
    # empty result yields None instead of KeyError / StopIteration.
    pages = data.get("query", {}).get("pages", {})
    page = next(iter(pages.values()), {})
    return page.get("extract")

In [5]:
# Fetch the article's plain-text extract (getData returns None when the API response has no extract)
content = getData(page_url)

### RAG Time :)

In [10]:
# Getting data from the wikipedia document.
# Uses the page_url defined earlier so the article is configured in one place
# rather than being repeated here as a second literal.

data = getData(page_url)
if data is None:
    # getData returns None when the API response carries no extract; fail
    # here with a clear message instead of inside the text splitter.
    raise ValueError(f"No extract returned for {page_url}")

# Split the article into overlapping token windows so that each chunk can be
# embedded and retrieved on its own.
text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=100)
documents = text_splitter.split_text(data)

# Wrap every chunk in a LangChain Document, the input type FAISS.from_documents takes.
docs = [Document(page_content=document, metadata={"source": "local"}) for document in documents]

In [11]:
# Creating the vector store: embed every chunk in `docs` with the Ollama
# embedding model named by Embedding_Model and index the vectors in FAISS.

db = FAISS.from_documents(docs, OllamaEmbeddings(model = Embedding_Model))
# Retriever view over the store with default settings; chat() calls retriever.invoke(question)
retriever = db.as_retriever() 

In [12]:
# Utility for formatting fetched context 

def combineDocs(docs):
    """Merge retrieved documents into a single context string.

    Each item's ``page_content`` is placed under a "Document Content :" header
    line, and the resulting sections are separated by one blank line. An empty
    input produces an empty string.
    """
    # NOTE(review): the trailing " ]" has no matching opening bracket and looks
    # like a leftover; it is kept because it is part of the text sent to the
    # model - confirm before removing.
    section_layout = 'Document Content : \n{} ]'
    sections = []
    for item in docs:
        sections.append(section_layout.format(item.page_content))
    return "\n\n".join(sections)

In [19]:
# Running transcript of the conversation; chat() appends every question/answer pair to it
chatHistory = "\n"

# The most recent fully formatted prompt sent to the model (set by chat())
latestPrompt = ""

# Prompt layout: retrieved text first, then the conversation so far, then the new question.
# {context}, {conversation} and {question} are filled in by prompt.format() inside chat().
template = """
You are a helpful assistant. You are given some text, conversation history and a question. 
Answer the question based on the information given in the text or based on the conversation if needed

## Text ##
{context}

## Conversation ##
{conversation}
\n
Question : {question}
Answer: 

"""

# Build the reusable prompt object from the template string
prompt = PromptTemplate.from_template(template)

def chat(question):
    """Answer `question` using retrieved context plus the running conversation.

    Steps: look up matching chunks through `retriever`, format them with
    combineDocs(), fill the prompt template, and send the result to `Model`.

    Side effects: stores the formatted prompt in the global `latestPrompt` and
    appends this question/answer pair to the global `chatHistory`.

    Returns the model's response.
    """
    global chatHistory, latestPrompt

    # Context for this turn comes from the vector-store retriever
    retrieved = retriever.invoke(question)
    contextString = combineDocs(retrieved)

    # Keep the exact prompt around so it can be inspected after the call
    latestPrompt = prompt.format(
        conversation=chatHistory,
        context=contextString,
        question=question,
    )
    response = Model.invoke(latestPrompt)

    # Record the exchange so later questions can refer back to it
    chatHistory += "\nQuestion : " + question + "\nAnswer : " + response
    return response


In [20]:
def rag(question):
    """Send `question` through chat() and print the reply; returns None."""
    print(chat(question))

In [21]:
# First question sent through the pipeline; chat() records the exchange in chatHistory
rag("Can you tell me about Parnassiinae")

The Parnassiinae is a subfamily of essentially Holarctic butterflies, with the vast majority of species found in mountain habitats. Most Parnassius species can be found in high-altitude regions and have two small reddish spots on their hindwings.

There are three tribes recognized within the Parnassiinae: Parnassiini, Zerynthiini, and Luehdorfiini. The tribe Parnassiini contains two genera, Hypermnestra and Parnassius (the Apollos), with most species found in central Asia and capable of living at high altitudes.

The tribe Luehdorfiini includes the genera Archon of Asia minor and the genus Luehdorfia of China and Japan. These two tribes have evolved to change their food plants, while the third tribe, Zerynthiini, has retained the archetypical papilionid food plant, Aristolochia.

The Parnassiinae are known for their unique adaptations to high-altitude environments and their diversity in terms of species and habitats.


In [22]:
# Second question; the prompt for this call also contains the earlier exchange via chatHistory
rag("what do swallowtail butterfly species feed on")

Here is the rewritten content based on your questions:

**Taxonomy**

The genera of extant swallowtails are usually classified into three subfamilies, Baroniinae, Parnassiinae, and Papilioninae. The Parnassiinae is a subfamily of essentially Holarctic butterflies, with the vast majority of species found in mountain habitats.

* Subfamily: Baroniene.
	+ Family Aristolochiaceae (some sources mention 4th family: Rutaceae)
**Food**

The caterpillars of various swallowtail butterfly species feed on a wide range of different plants. Most depend on only one of the following five families:
	1. Aristolochiaceae
	2. Annonaceae
	3. Lauraceae
	4. Umbelliferae (Apiaceae)
	5. Rutaceae

By eating some of these toxic plants, the caterpillars sequester aristolochic acid which renders both the caterpillars and the butterflies of some of these species as toxic, thus protecting them from predators.

**Life Cycle**

The detailed descriptions of morphological characteristics of the Papilionidae are as follo