# Installing and Importing Necessary Libraries

In [None]:
!pip install -q langchain
!pip install -q chromadb

In [None]:
!pip install -q langchain-google-genai
!pip install -q google-generativeai

In [None]:
!pip install -q unstructured

In [None]:
!pip install -q sentence_transformers

In [None]:
!pip install -q langchain
!pip install -q gnews

In [None]:
!pip install -q text-generation transformers google-search-results numexpr langchainhub sentencepiece jinja2

In [None]:
import os
import requests
from bs4 import BeautifulSoup
from gnews import GNews
import textwrap

In [None]:
# Reuse a key that is already exported in the environment; otherwise prompt for
# it without echoing, so the secret is never typed into (and saved with) the
# notebook source.
if not os.environ.get("GOOGLE_API_KEY"):
    import getpass

    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Gemini API key: ")

# Retrieving Latest News

In [None]:
def get_title_and_content(url, timeout=10):
    """Download a web page and extract its title and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that one
            unresponsive site cannot stall the caller indefinitely.

    Returns:
        A ``(title, content)`` tuple. ``content`` is the text of every ``<p>``
        element joined with newlines. ``(None, None)`` is returned when the
        request fails or the response status is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures (DNS, refused connection, timeout, ...) are
        # reported the same way as a bad status so the caller can skip the URL.
        print("Failed to fetch URL:", url, f"({exc})")
        return None, None

    if response.status_code != 200:
        print("Failed to fetch URL:", url)
        return None, None

    soup = BeautifulSoup(response.content, 'html.parser')
    # `.string` is None for an empty <title> or one with nested markup; fall
    # back to the same placeholder used when the tag is missing altogether.
    title = (soup.title.string if soup.title else None) or "No title found"
    content = '\n'.join(p.get_text() for p in soup.find_all('p'))
    return title, content

In [None]:
def pretty_print(text, width=80):
    """Return *text* re-flowed so that no line is longer than *width* characters.

    Despite the name, nothing is printed: the wrapped text is returned as a
    single newline-separated string.
    """
    return textwrap.fill(text, width=width)

In [None]:
# Search term passed to GNews in the next cell; it is also embedded in the
# names of the article files written to the data directory.
topic = '' # Enter your topic of interest such as Artificial Intelligence, Natural Language Processing, etc.

In [None]:
# Query Google News for the chosen topic with a default-configured GNews client.
# `news` is later indexed by position, and each entry is read through its "url" key.
google_news = GNews()
news = google_news.get_news(topic)

In [None]:
# Download up to `max_files` articles from `news` and store each one as a text
# file under `data_directory`, remembering which URLs were actually saved.
data_directory = "data"
os.makedirs(data_directory, exist_ok=True)

max_files = 10
stored_urls = []
files_saved = 0
# Iterating over `news` (instead of indexing until 10 files exist) stops
# cleanly when fewer than `max_files` articles can be fetched, rather than
# running past the end of the list.
for count, article in enumerate(news, start=1):
    if files_saved >= max_files:
        break

    article_url = article["url"]
    title, content = get_title_and_content(article_url)
    if not (title and content):
        # Fetch failed or the page had no usable text; try the next article.
        continue

    file_path = os.path.join(data_directory, f"{topic} - {files_saved + 1}.txt")
    # "w" so that re-running the cell replaces a file instead of appending a
    # second copy; explicit UTF-8 because article text is rarely pure ASCII.
    with open(file_path, "w", encoding="utf-8") as f:
        f.write("Title: " + title + "\n")
        f.write("Content: " + content + "\n")
    stored_urls.append(article_url)
    files_saved += 1

if files_saved < max_files:
    print(f"Warning: only {files_saved} of {max_files} articles could be saved.")
print("Files saved successfully.")

print("Stored URLs:")
for url in stored_urls:
    print(url)

# RAG-based Query

In [None]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores.chroma import Chroma
from langchain.llms import OpenAI
import getpass
import shutil

# Embedding model shared by the indexing step and the later query step.
embeddings = GoogleGenerativeAIEmbeddings(model = "models/embedding-001")

# Directory where the Chroma index is persisted.
CHROMA_PATH = "chroma" # Change this every time you run this cell again
# Directory scanned for the *.txt article files to index.
DATA_PATH = r"data"


def main():
    """Entry point for this cell: build the vector store via generate_data_store()."""
    generate_data_store()


def generate_data_store():
    """Run the indexing pipeline: load the documents, split them, store the chunks."""
    save_to_chroma(split_text(load_documents()))


def load_documents():
    """Load the files in ``DATA_PATH`` that match the ``*.txt`` glob.

    Returns:
        Whatever ``DirectoryLoader.load()`` produces for those files.
    """
    return DirectoryLoader(DATA_PATH, glob="*.txt").load()


def split_text(documents: list[Document]):
    """Split documents into overlapping chunks and print a short summary.

    Args:
        documents: Documents to split.

    Returns:
        The chunks produced by the splitter (1000-character target size with
        a 100-character overlap; each chunk records its start index).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Show one sample chunk for a quick sanity check. The second chunk is
    # preferred (as before), but fall back to the first when only one exists
    # and skip the preview entirely when there are none, instead of raising
    # IndexError on small or empty inputs.
    if chunks:
        sample = chunks[1] if len(chunks) > 1 else chunks[0]
        print(sample.page_content)
        print(sample.metadata)

    return chunks

def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma index at ``CHROMA_PATH`` from *chunks*.

    Any existing directory at that path is deleted first, so the resulting
    index contains only the chunks passed in.
    """
    stale_index_present = os.path.exists(CHROMA_PATH)
    if stale_index_present:
        shutil.rmtree(CHROMA_PATH)

    store = Chroma.from_documents(
        chunks,
        embeddings,
        persist_directory=CHROMA_PATH,
    )
    store.persist()

    chunk_count = len(chunks)
    print(f"Saved {chunk_count} chunks to {CHROMA_PATH}.")


# Build the vector store when this code runs as the main module.
if __name__ == "__main__":
    main()

In [None]:
# Question used both for the similarity search and as the {question} part of the prompt.
query_text = "" # Enter your query

In [None]:
from dataclasses import dataclass
from langchain.vectorstores.chroma import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_google_genai import GoogleGenerativeAI

# Prompt skeleton sent to the model. {context} is filled with the retrieved
# chunks and {question} with the user's query when the template is formatted.
PROMPT_TEMPLATE = """
Please consider the following information as the background for your response. Ensure that your answer is relevant to the details provided in this context.

{context}

---

Here is the specific inquiry you are tasked with addressing. Make sure your response directly addresses this question and utilizes the information provided in the context.

{question}
"""

# Re-open the persisted index with the same embedding model used to build it.
db = Chroma(persist_directory = CHROMA_PATH, embedding_function = embeddings)

# Each result is a (document, relevance score) pair; the first entry's score
# is the one compared against the threshold below.
results = db.similarity_search_with_relevance_scores(query_text, k = 4)
if not results or results[0][1] < 0.7:
    # Stop here. Previously the model was still called after this message,
    # producing an answer from missing or weakly related context.
    print("Unable to find matching results.")
else:
    # Concatenate the retrieved chunks, separated by a visible divider.
    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    model = GoogleGenerativeAI(model = "gemini-pro", max_output_tokens = 1024,
                               google_api_key = os.environ["GOOGLE_API_KEY"])

    response_text = model.invoke(prompt)

    # Report where each chunk came from; the "source" metadata entry may be absent.
    sources = [doc.metadata.get("source", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)