## Requirements

In [1]:
!pip install requests beautifulsoup4 transformers chromadb langchain llama-cpp-python langchain_community groq llama_index

Collecting chromadb
  Downloading chromadb-0.5.0-py3-none-any.whl (526 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.8/526.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.2.1-py3-none-any.whl (973 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.5/973.5 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting llama-cpp-python
  Downloading llama_cpp_python-0.2.76.tar.gz (49.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting langchain_community
  Downloading langchain_community-0.2.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [23]:
import requests
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModel
import chromadb
from langchain.llms import LlamaCpp
from langchain.chains import RetrievalQA
import torch
import requests
import json
import groq
import os
import re

## Data Extraction and Cleaning

In [37]:
def extract_text_from_website(url, timeout=30):
    """
    Download a web page and return its visible text as one line of text.

    Args:
        url (str): The URL of the website to extract text from.
        timeout (float, optional): Seconds to wait for the server before giving up. Defaults to 30.

    Returns:
        str: The cleaned text content extracted from the website, or None if the
        request fails or the server answers with an HTTP error status (4xx/5xx).
    """

    # Get response from the server; without a timeout a stalled server would hang the notebook
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        print(f"Request failed: {error}")
        return None

    # Any 4xx/5xx answer is an error page, not content worth indexing
    if not response.ok:
        print(f"Server error: HTTP {response.status_code}")
        return None

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Removing js and css code so only human-readable text remains
    for tag in soup(["script", "style"]):
        tag.extract()

    # Extract relevant text content
    text = soup.get_text()

    # Re-join words hyphenated across a line break *before* newlines are
    # replaced; once the newlines are gone this pattern can no longer match.
    cleaned_text = re.sub(r"(\w)-\n(\w)", r"\1\2", text.strip())

    # Flatten the remaining line breaks into spaces
    cleaned_text = cleaned_text.replace('\n', ' ')

    return cleaned_text


## Text Processing (Chunking)

In [38]:
def chunk_text(text, chunk_size=2048, overlap=128):
    """
    Split a text into fixed-size, overlapping character chunks.

    Args:
        text (str): The text to be chunked.
        chunk_size (int, optional): The desired size of each chunk in characters. Defaults to 2048.
        overlap (int, optional): The number of characters to overlap between consecutive chunks. Defaults to 128.

    Returns:
        list: A list of strings, where each element represents a chunk of the original text.
        An empty text yields an empty list.

    Raises:
        ValueError: If chunk_size is not positive, or overlap is negative or not
        smaller than chunk_size (the window would never advance).
    """

    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must be non-negative and smaller than chunk_size")

    # Advancing by 'chunk_size - overlap' makes consecutive chunks share 'overlap' characters
    step = chunk_size - overlap
    text_length = len(text)

    chunks = []
    for start in range(0, text_length, step):
        end = min(start + chunk_size, text_length)
        chunks.append(text[start:end])
        # Once a chunk reaches the end of the text, any further window would
        # only repeat characters already contained in this chunk.
        if end == text_length:
            break
    return chunks


## Embedding and Indexing

In [39]:
def embed_and_index_text(chunks):
    """
    Embed text chunks and store them in the "website_data" ChromaDB collection.

    Each chunk is stored under the id of its position in `chunks`, so calling
    this again with the same chunks overwrites the entries instead of failing.

    Args:
        chunks (list): A list of text chunks extracted from a website.
    """

    # Load pre-trained tokenizer and model for text embedding
    tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")
    model = AutoModel.from_pretrained("BAAI/bge-small-en-v1.5")

    # Connect to ChromaDB; get_or_create avoids the error create_collection
    # raises when the cell is re-run and the collection already exists.
    db = chromadb.Client()
    collection = db.get_or_create_collection("website_data")

    # Process each text chunk
    for position, chunk in enumerate(chunks):
        # truncation=True caps the input at the model's maximum length, so an
        # unusually token-dense chunk cannot overflow the position embeddings.
        input_data = tokenizer(chunk, return_tensors="pt", truncation=True)

        # Generate embedding for the chunk using the pre-trained model.
        # NOTE(review): pooler_output is kept because retrieval embeds queries
        # the same way; both sides must change together if this is revisited.
        with torch.no_grad():
            embedding = model(**input_data).pooler_output[0].tolist()

        # upsert (rather than add) keeps re-indexing idempotent for existing ids
        collection.upsert(
            ids=[str(position)],
            documents=[chunk],
            embeddings=[embedding]
        )

## Retrieval

In [40]:
# Loaded (tokenizer, model) pairs keyed by model name, so the interactive
# question loop does not reload the weights for every query.
_EMBEDDER_CACHE = {}


def _load_embedder(model_name="BAAI/bge-small-en-v1.5"):
    """Return the (tokenizer, model) pair for `model_name`, loading it on first use only."""
    if model_name not in _EMBEDDER_CACHE:
        _EMBEDDER_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name),
        )
    return _EMBEDDER_CACHE[model_name]


def retrieve_relevant_chunks(query, top_k=3):
    """
    Retrieve the text chunks most similar to a user query from the "website_data" collection.

    Args:
        query (str): The user's question or search term.
        top_k (int, optional): The maximum number of most relevant chunks to return. Defaults to 3.

    Returns:
        list: A list containing the top 'top_k' most relevant text chunks (strings) from the indexed data.
    """

    # Reuse the tokenizer and model across calls instead of reloading them per query
    tokenizer, model = _load_embedder()

    # Connect to ChromaDB and retrieve previously created collection
    db = chromadb.Client()
    collection = db.get_collection("website_data")

    # Prepare the query for model input; truncation guards against a pasted
    # query that is longer than the model's maximum input length.
    query_input_data = tokenizer(query, return_tensors="pt", truncation=True)

    # Generate the query embedding the same way the chunks were embedded
    with torch.no_grad():
        query_embedding = model(**query_input_data).pooler_output[0].tolist()

    # Search for similar documents in the ChromaDB collection
    results = collection.query(query_embeddings=[query_embedding], n_results=top_k)

    # One query embedding was sent, so its matches are the first (only) result list
    return results["documents"][0]


## Response Generation (using Groq and Llama3)

In [41]:
def generate_answer(chunks, query):
    """
    Generate an answer to the user's question from the retrieved text chunks using a Groq-hosted LLM.

    The Groq API key is read from the GROQ_API_KEY environment variable; it is
    never written by this function, so a key configured outside the notebook
    is not overwritten.

    Args:
        chunks (list): A list of text chunks retrieved from the indexed data.
        query (str): The user's question.

    Returns:
        str: The generated answer to the user's question.

    Raises:
        RuntimeError: If GROQ_API_KEY is not set in the environment.
    """

    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise RuntimeError(
            "GROQ_API_KEY is not set; define it in the environment before calling generate_answer()."
        )
    client = groq.Client(api_key=api_key)

    # Prompt template; it is sent as a single user message below
    prompt_template = """You are a helpful AI assistant who specializes in extracting data from Wordpress sites. Please use the following context to answer the question at the end and please answer it completely.
    If you don't know the answer, just say you don't know, don't try to make up an answer. Use Chain of Thought Strategy for answering. Do not hallucinate.

    Context:
    {context}

    Question: {question}
    Answer:"""

    # Combine retrieved chunks into one context string
    context = " ".join(chunks)

    # Fill the template with the retrieved context and the user's question
    prompt = prompt_template.format(context=context, question=query)

    # Use GROQ client to call the chat completion endpoint
    chat_completion = client.chat.completions.create(
        model="llama3-70b-8192",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
        top_p=1
    )

    return chat_completion.choices[0].message.content


## Main Function

In [42]:
if __name__ == "__main__":

    website_url = "https://myexpressionofthoughtsblog.wordpress.com/about/"

    text = extract_text_from_website(website_url)

    if not text:
        # extract_text_from_website returns None on failure; passing that to
        # chunk_text would raise a TypeError, so stop with a clear message.
        print("Could not extract any text from", website_url)
    else:
        chunks = chunk_text(text)

        # Index the chunks unless the "website_data" collection already exists,
        # so a fresh kernel works and a re-run does not index twice.
        # NOTE(review): chromadb.Client() is in-memory by default per the Chroma
        # docs, so the index presumably does not survive a kernel restart.
        # list_collections() returns objects or plain names depending on the
        # chromadb version; getattr handles both.
        existing_collections = {
            getattr(item, "name", item) for item in chromadb.Client().list_collections()
        }
        if "website_data" not in existing_collections:
            embed_and_index_text(chunks)

        while True:
            query = input("Ask a question about the website ('END' to exit): ").strip()
            if query.upper() == "END":  # Check for "END/end"
                break
            if not query:
                # Nothing to search for; ask again instead of querying with an empty string
                continue
            relevant_chunks = retrieve_relevant_chunks(query)
            answer = generate_answer(relevant_chunks, query)
            print("Answer:", answer)


Ask a question about the website ('END' to exit): What is the author's name?
Answer: Let's break down the text to find the answer.

The text mentions "About The Author" multiple times, which suggests that the author's information is present in the text.

Upon closer inspection, I found the following sentence: "Hi Friends, My name is Tanvir Kaur."

This sentence explicitly states the author's name, which is Tanvir Kaur.

Therefore, the answer is: Tanvir Kaur.
Ask a question about the website ('END' to exit): What is author's motto in life?
Answer: Let's break down the text to find the answer.

The context is the "About The Author" section of a Wordpress site. We need to find the author's motto in life.

After reading the text, I found the relevant sentence: "My motto of life is: Live your life fully by adding a pinch of excitement and happiness."

So, the answer is: "Live your life fully by adding a pinch of excitement and happiness."
Ask a question about the website ('END' to exit): Is