In [5]:
import os
from bs4 import BeautifulSoup
import re
import streamlit as st
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
import google.generativeai as genai

In [6]:
# set gemini api key as environment variable
os.environ['GEMINI_API_KEY'] = '********'

In [7]:
def bs4_extractor(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    content = re.sub(r"\n\n+", "\n\n", soup.text).strip()
    content = re.sub(r'<.*?>', '', content)
    content = re.sub(r'\n\s*\n', '\n', content)
    return content

In [8]:
def load_data(url):
    # Define the base URL and configure RecursiveUrlLoader
    base_url = url
    
    custom_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
        }

    loader = RecursiveUrlLoader(base_url, headers = custom_headers, extractor = bs4_extractor)

    docs = loader.load()
    return docs

In [9]:
documents = load_data('https://hashagile.com/')

  soup = BeautifulSoup(html, "lxml")
  k = self.parse_starttag(i)


In [10]:
documents[:5]

[Document(page_content='Home - Hash Agile\nContact Us\nHome\nExpertise\nCareers\nBlog\nContact Us\nAgile solutions for Digital Transformation Innovative Big Data services, custom AI & ML solutions, predictive Visual Analytics, blazing fast Web & Mobile applications \nStay ahead of competition with innovative Big Data and AI solutions implemented in quick iterations   \nData Engineering\nRealize your data’s competitive edge with critical infrastructure and data pipelines from discovery to analysis\nAI / ML\nTrain and deploy deep learning models and create custom AI powered products and solutions\nNLP\nState of the art natural language processing to identify key features and distill valuable insights from data sources of any size\nPredictive Visual Analytics\nBuild predictive models with advanced analytics and machine learning techniques. Intuitive visualizations with interactive graphs and dashboards.\nOver 150k+ Client Target Audience for your business.  \nAI technology is perfect for 

In [11]:
def embed_and_load(documents):
    # split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
    docs = text_splitter.split_documents(documents)

    # embed load to chromaDB
    embeddings = SentenceTransformerEmbeddings(model_name = 'sentence-transformers/all-mpnet-base-v2')
    db = Chroma.from_documents(docs, embeddings)

    return db

In [12]:
db = embed_and_load(documents)

  warn_deprecated(


In [13]:
def make_rag_prompt(db, query):
    # get relevant data
    relevant_data = db.similarity_search(query = query)

    # create a prompt for LLM
    prompt = ("""You are a helpful and informative bot that answers questions using text from the reference passage included below. \
          Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
          strike a friendly and converstional tone. \
          If the passage is irrelevant to the answer, you may ignore it.
          QUESTION: '{query}'
          PASSAGE: '{relevant_data}'

          ANSWER:
          """).format(query = query, relevant_data = relevant_data)
    
    return prompt

In [14]:
def generate_answer(prompt):
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
    genai.configure(api_key = gemini_api_key)
    model = genai.GenerativeModel('gemini-pro')
    answer = model.generate_content(prompt)
    return answer.text



In [15]:
def final_answer(db, query):
    prompt = make_rag_prompt(db, query)
    answer = generate_answer(prompt)

    return answer

In [16]:
answer = final_answer(db, 'What does hash agile do?')

In [17]:
print(answer)

Hash Agile is a technology company that offers services such as data engineering, AI / ML, NLP, and predictive visual analytics.
