In [1]:
import os
from urllib.parse import urlparse, unquote

import requests
from langchain.docstore.document import Document
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import TokenTextSplitter
from openai import OpenAI

In [2]:
# Name of the Ollama embedding model used to build the vector store.
Embedding_Model = "nomic-embed-text"

# OpenAI-compatible client pointed at the Krutrim cloud endpoint.
# The secret key is read from the environment so it never lands in the saved
# notebook (see the "Create a secret key" section for how to obtain one).
# A missing variable raises KeyError naming KRUTRIM_API_KEY.
# NOTE(review): a key was previously committed in this cell - revoke and rotate it.
openai = OpenAI(
    api_key=os.environ["KRUTRIM_API_KEY"],
    base_url="https://cloud.olakrutrim.com/v1",
)

### Getting Data from Wikipedia

In [3]:
# Wikipedia article whose text is fetched and indexed in the cells below.
page_url = "https://en.wikipedia.org/wiki/Swallowtail_butterfly"

In [4]:
def getData(url, timeout=10):
    """Fetch the plain-text extract of an English Wikipedia article.

    The article title is derived from the URL path and looked up through the
    MediaWiki ``query`` API with ``prop=extracts`` and ``explaintext``.

    Args:
        url: Article URL, e.g.
            ``https://en.wikipedia.org/wiki/Swallowtail_butterfly``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The article text, or ``None`` when the response contains no extract
        (missing page, API error payload, empty result).

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status (via ``raise_for_status``).
    """
    # Take everything after "/wiki/" so titles that contain slashes
    # (e.g. "AC/DC") are kept whole; otherwise fall back to the last segment.
    path = urlparse(url).path
    wiki_prefix = "/wiki/"
    if path.startswith(wiki_prefix):
        raw_title = path[len(wiki_prefix):]
    else:
        raw_title = path.rstrip("/").split("/")[-1]
    title = unquote(raw_title)

    endpoint = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "explaintext": True,
    }

    # Do not work around certificate errors with verify=False: that disables
    # TLS verification. Fix the local CA bundle instead.
    # The timeout keeps a stalled connection from hanging the kernel forever.
    response = requests.get(endpoint, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    # An error payload has no "query"/"pages"; treat it like a missing extract.
    pages = data.get("query", {}).get("pages", {})
    if not pages:
        return None

    # A single title was requested, so the first page entry is the article.
    page = next(iter(pages.values()))
    return page.get("extract")

In [5]:
# Download the article text; getData returns None when the page has no extract.
content = getData(page_url)

### RAG Time :)

In [6]:
# Split the Wikipedia article into overlapping token chunks for retrieval.

# Reuse the text already downloaded into `content` instead of calling the
# Wikipedia API a second time with a duplicated URL literal.
data = content
if not data:
    # getData returns None when no extract is available; fail here with a
    # clear message rather than inside the splitter.
    raise ValueError(f"No article text could be retrieved from {page_url}")

text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=100)
documents = text_splitter.split_text(data)
docs = [Document(page_content=chunk, metadata={"source": "local"}) for chunk in documents]

In [7]:
# Embed every chunk with the Ollama model and index the vectors in FAISS.
db = FAISS.from_documents(
    documents=docs,
    embedding=OllamaEmbeddings(model=Embedding_Model),
)

# Retriever over the index, created with no extra search arguments.
retriever = db.as_retriever()

In [8]:
# Utility for formatting fetched context 

def combineDocs(docs):
    """Join the text of retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        Each document's text under a "Document Content :" header line, with a
        blank line between documents; an empty string when ``docs`` is empty.
    """
    # The previous template ended every entry with an unmatched " ]", which
    # was passed verbatim into the prompt; the stray bracket is dropped here.
    return "\n\n".join(f"Document Content : \n{doc.page_content}" for doc in docs)

In [23]:
# Flat conversation log: question, answer, question, answer, ...
chatHistory = []

# Number of most recent question/answer exchanges replayed to the model.
_HISTORY_TURNS = 3


def chat(question):
    """Answer ``question`` using retrieved context and recent conversation.

    Retrieves documents for the question, formats them with ``combineDocs``,
    and sends them to the chat model together with up to the last three
    question/answer exchanges stored in ``chatHistory``. The new exchange is
    appended to ``chatHistory``.

    Args:
        question: The user's question.

    Returns:
        The model's reply text (an empty string if the model returned no
        content).
    """
    context_string = combineDocs(retriever.invoke(question))

    messages = [
        {"role": "system", "content": "You are a helpful assistant. You are given conversation history, context and a question from the user. Answer the question based on the information given in the text or based on the conversation if needed"},
    ]

    # Replay only real exchanges. Before the history fills up, the old code
    # sent six placeholder turns with empty content, which pollutes the prompt.
    # Pairs with an empty side (including legacy "" padding) are skipped.
    recent = chatHistory[-2 * _HISTORY_TURNS:]
    for past_question, past_answer in zip(recent[0::2], recent[1::2]):
        if past_question and past_answer:
            messages.append({"role": "user", "content": past_question})
            messages.append({"role": "assistant", "content": past_answer})

    messages.append({"role": "user", "content": "Here is some context : \n" + context_string})
    messages.append({"role": "user", "content": question})

    chat_completion = openai.chat.completions.create(
        model="Meta-Llama-3-8B-Instruct",
        messages=messages,
        # NOTE(review): token ids 2435 and 640 get a -100 bias; which tokens
        # these are depends on the model's tokenizer - confirm and document.
        logit_bias={2435: -100, 640: -100},
        max_tokens=2000,
        temperature=0,  # Optional, defaults to 1. Range: 0 to 2
        top_p=1,  # Optional, defaults to 1. Usually alter this or temperature, not both.
    )

    # Content can be None; store a string so later replays never send None.
    response = chat_completion.choices[0].message.content or ""

    chatHistory.append(question)
    chatHistory.append(response)

    return response


In [24]:
def rag(question):
    """Run ``question`` through ``chat`` and print the reply."""
    print(chat(question))

In [25]:
# Example query; the printed answer is generated from the retrieved article chunks.
rag("Can you tell me about Parnassiinae")

Based on the provided context, Parnassiinae is a subfamily of butterflies that belongs to the family Papilionidae. Here are some key points about Parnassiinae:

* Parnassiinae is a subfamily of essentially Holarctic butterflies, meaning they are found in the Northern Hemisphere, primarily in mountainous regions.
* The majority of species in this subfamily belong to the genus Parnassius, commonly known as the Apollos. These butterflies are alpine and can be found at high altitudes.
* Parnassiinae can also be found in other habitats, such as arid deserts, humid forests, and lowland meadows.
* The tribes recognized within Parnassiinae are Parnassiini, Zerynthiini, and Luehdorfiini.
* The Parnassiini tribe contains two genera: Hypermnestra, which is largely confined to central Asia, and Parnassius, which includes many species of Apollos.
* The Luehdorfiini tribe contains the genera Archon and Luehdorfia, which are found in Asia minor and China/Japan, respectively.
* The Zerynthiini tribe i

In [16]:
# NOTE(review): this cell's execution count (16) is older than the chat()/rag()
# definitions above (23-24), so its saved output may come from earlier code.
# Re-run with Restart Kernel -> Run All before relying on it.
rag("what do swallowtail butterfly species feed on")

According to the provided context, the caterpillars of various swallowtail butterfly species feed on a wide range of different plants, most depending on only one of five families:

1. Aristolochiaceae (birthwort family)
2. Annonaceae (custard apple family)
3. Lauraceae (laurel family)
4. Umbelliferae (Apiaceae) (carrot family)
5. Rutaceae (citrus family)

By eating some of these toxic plants, the caterpillars sequester aristolochic acid, which renders both the caterpillars and the butterflies of some of these species toxic, thus protecting them from predators.

Some specific examples of host plants for swallowtail butterfly species include:

* Queen Anne's lace (Ammi majus) for the eastern black swallowtail (Papilio polyxenes)
* Carrots, parsley, dill, and fennel (all in the carrot family, Apiaceae) for the eastern black swallowtail (Papilio polyxenes)
* Sedum lanceolatum for the Parnassius smintheus
* Acacia species (family Leguminosae) for Baronia brevicornis

It's worth noting that 