In [1]:
# !pip install -r ../requirements.txt

In [2]:
# !conda install poppler -y

In [3]:
#!pip install unstructured

In [4]:
# for loading and parsing web pages
from langchain_community.document_loaders import SeleniumURLLoader

In [5]:
# for reading and parsing multimodal PDFs
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.utils.constants import PartitionStrategy

In [6]:
# for working with open-source LLMs through Ollama
from langchain_ollama import OllamaEmbeddings
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate

In [7]:
# for splitting text into chunks and storing their embeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore

In [8]:
import streamlit as st

In [9]:
# !ollama pull gemma3:12b

In [10]:
# !ollama pull llama3.2:3b

In [11]:
# Ollama-backed language model; answer_question pipes the filled-in prompt into it.
llm = OllamaLLM(model="gemma3:12b")

In [12]:
# Embedding model handed to the vector store below.
embeddings = OllamaEmbeddings(model="llama3.2:3b")
# Module-level store shared by store_web_docs (writes) and retrieve_docs (reads).
vector_store = InMemoryVectorStore(embeddings)

In [13]:
# load page
def load_web_page(url):
    """Load a single web page through ``SeleniumURLLoader``.

    Args:
        url: Address of the page to load.

    Returns:
        The result of ``SeleniumURLLoader.load()`` for that one URL.
    """
    return SeleniumURLLoader(urls=[url]).load()

In [14]:
# split the page text into overlapping chunks
def split_web_text(docs, chunk_size=1000, chunk_overlap=200):
    """Split loaded documents into overlapping chunks.

    Args:
        docs: Documents to split, e.g. the value returned by ``load_web_page``.
        chunk_size: Upper bound on the size of each chunk, passed straight to
            ``RecursiveCharacterTextSplitter`` (default 1000, the value this
            notebook has always used).
        chunk_overlap: Overlap between consecutive chunks, passed straight to
            ``RecursiveCharacterTextSplitter`` (default 200).

    Returns:
        The chunks produced by ``split_documents``. Because
        ``add_start_index=True``, each chunk's metadata carries a
        ``start_index`` entry.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,
    )
    return text_splitter.split_documents(docs)


In [15]:
def store_web_docs(docs):
    """Add document chunks to the module-level ``vector_store``.

    Args:
        docs: Chunks to add, e.g. the value returned by ``split_web_text``.

    Returns:
        None. The value returned by ``add_documents`` is discarded.
    """
    # NOTE(review): re-running this with the same chunks presumably stores
    # them a second time -- confirm before re-executing the cell.
    vector_store.add_documents(docs)

In [16]:
def retrieve_docs(query):
    """Run a similarity search for ``query`` against the module-level ``vector_store``.

    Args:
        query: Text to search for.

    Returns:
        Whatever ``vector_store.similarity_search`` returns for the query,
        using that method's default settings.
    """
    matches = vector_store.similarity_search(query)
    return matches

In [17]:
def answer_question(question, documents):
    """Answer ``question`` using the text of ``documents`` as context.

    Reads the module-level ``template`` and ``llm`` at call time, so both
    must be defined before this function is invoked.

    Args:
        question: The user's question; fills the ``question`` prompt slot.
        documents: Iterable of objects exposing ``page_content``; their texts
            are joined with blank lines and fill the ``context`` prompt slot.

    Returns:
        The result of invoking the ``prompt | llm`` chain.
    """
    page_texts = (doc.page_content for doc in documents)
    context = "\n\n".join(page_texts)
    chain = ChatPromptTemplate.from_template(template) | llm
    return chain.invoke({"question": question, "context": context})

In [18]:
# Prompt template for the question-answering dialogue.
# answer_question reads this module-level name when it is called, so this cell
# must run before the first call to answer_question.
template = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:
"""

In [19]:
# page to load and index for the demo
url = 'https://escinsight.com/2023/04/06/deeper-look-eurovisions-running-order/'

In [20]:
# read the web page at `url`; `doc` holds the loader's result for that single URL
doc = load_web_page(url)

In [21]:
# split the loaded page into chunks
# (despite its name, `text` holds the chunk documents returned by split_documents, not a plain string)
text = split_web_text(doc)

In [22]:
# add the chunks to the module-level vector store (built on the Ollama embeddings)
store_web_docs(text)

In [23]:
# the user's question about the indexed page
question = 'what running order is the best?'

In [24]:
# run a similarity search in the vector store with the user's question to get relevant context
retrieved = retrieve_docs(question)

In [25]:
# display the retrieved chunks
retrieved

[Document(id='424b6770-0688-4f10-8256-8eb0720be051', metadata={'source': 'https://escinsight.com/2023/04/06/deeper-look-eurovisions-running-order/', 'title': 'ESC Insight | A Deeper Look At Eurovision’s Running Order', 'description': 'No description found.', 'language': 'No language found.', 'start_index': 10118}, page_content='Finally, we have the bottom left in light orange, which is where we find acts that did better in the final among both jurors and televoters. The position that televoters reward the most is 25th, which is usually the penultimate slot (although in 2022 it was the final slot, on account of the contest having previously been won by Italy, one of the Big Five). The slot just before this – 24th – is the position that jurors reward the most. Almost all of these slots can be found towards the end of their respective halves: this isn’t true for 5th and 15th, but these only just squeeze into the category.\n\nSo Where Do I Want To Sing?'),
 Document(id='fa9b6313-9985-43c6-

In [26]:
# fill the prompt with the question and the retrieved chunks, then ask the LLM
answer = answer_question(question, retrieved)

In [27]:
# display the model's answer
answer

'There\'s no definitively "best" running order position. However, positions 13th and 26th (last) tend to see acts drop in rankings among their fellow semi-final qualifiers. The 24th position is favored by jurors, while the 25th position is favored by televoters.'