<a href="https://colab.research.google.com/github/Pavithra2625/News_Research_Tool/blob/main/News_Research_Tool_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install gradio langchain langchain-community langchain-groq faiss-cpu sentence-transformers unstructured requests


Collecting langchain-community
  Downloading langchain_community-0.3.29-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-groq
  Downloading langchain_groq-0.3.7-py3-none-any.whl.metadata (2.6 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting unstructured
  Downloading unstructured-0.18.14-py3-none-any.whl.metadata (24 kB)
Collecting langchain-core<1.0.0,>=0.3.72 (from langchain)
  Downloading langchain_core-0.3.75-py3-none-any.whl.metadata (5.7 kB)
Collecting requests
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting groq<1,>=0.30.0 (from langchain-groq)
  Downloading groq-0.31.0-py3-none-any.whl.metadata (16 kB)
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Colle

In [3]:
import os
import pickle
import time
import gradio as gr

from langchain_groq import ChatGroq
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from google.colab import userdata
# 1. Set Groq API Key
# Export the *value* of the Colab secret named 'groq_API'. The secret's name
# itself is not a credential, so the looked-up value is what must be stored;
# previously the lookup result was discarded and the name was exported instead.
os.environ["GROQ_API_KEY"] = userdata.get('groq_API')

# 2. Initialize LLM (Groq)
# NOTE(review): confirm that "llama3-70b-8192" is still offered by Groq before
# relying on it - hosted model ids get retired over time.
llm = ChatGroq(
    api_key=os.environ["GROQ_API_KEY"],
    model_name="llama3-70b-8192",  # Groq supported model
    temperature=0.7
)

# Location of the pickled FAISS store: written by process_urls, read back by
# answer_question.
file_path = "faiss_store_groq.pkl"
# Module-level placeholder; answer_question binds its own local of this name.
vectorstore = None

# 3. Function to process URLs and build FAISS index
def process_urls(url1, url2, url3, *extra_urls):
    """Fetch the given URLs, index their text with FAISS and save the index.

    Args:
        url1, url2, url3: URL strings from the UI. Empty, whitespace-only or
            ``None`` entries are ignored.
        *extra_urls: Optional additional URLs, handled the same way, so a
            caller can wire in more input boxes without changing this function.

    Returns:
        A human-readable status string. Failures are reported through this
        string rather than raised, because the value is shown in the UI.
    """
    # Strip before use: a pasted URL with stray spaces would otherwise be
    # handed to the loader verbatim.
    candidates = (url1, url2, url3, *extra_urls)
    urls = [u.strip() for u in candidates if u and u.strip()]
    if not urls:
        return "Please provide at least one valid URL."

    loader = UnstructuredURLLoader(urls=urls)
    try:
        data = loader.load()
    except Exception as e:
        return f"Error loading URLs: {e}"

    if not data:
        return "No content fetched from the provided URLs."

    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000
    )
    docs = text_splitter.split_documents(data)
    if not docs:
        # Pages were fetched but produced no text chunks; nothing to index.
        return "No content fetched from the provided URLs."

    # Write to a temporary file and rename it into place, so a failure while
    # embedding or pickling never leaves a truncated index at file_path.
    tmp_path = f"{file_path}.tmp"
    try:
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        vectorstore_groq = FAISS.from_documents(docs, embeddings)
        with open(tmp_path, "wb") as f:
            pickle.dump(vectorstore_groq, f)
        os.replace(tmp_path, file_path)
    except Exception as e:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        return f"Error building index: {e}"

    return "Processing completed! You can now ask questions."

# 4. Function to answer a question from the saved FAISS index
def answer_question(question):
    """Answer a question using the index previously saved at ``file_path``.

    Args:
        question: The user's question text.

    Returns:
        A ``(answer, sources)`` tuple of strings. When something goes wrong the
        first element carries the message and the second is an empty string.
    """
    if not question or not question.strip():
        return "Please enter a question.", ""

    if not os.path.exists(file_path):
        return "Please process URLs first!", ""

    try:
        # SECURITY NOTE: unpickling runs arbitrary code from the file. This is
        # acceptable only because the file is produced locally by
        # process_urls; never point file_path at an untrusted file.
        with open(file_path, "rb") as f:
            store = pickle.load(f)
    except Exception as e:
        return f"Error loading index: {e}", ""

    try:
        chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=store.as_retriever())
        # invoke() replaces the deprecated direct call on the chain object.
        result = chain.invoke({"question": question.strip()}, return_only_outputs=True)
    except Exception as e:
        return f"Error generating answer: {e}", ""

    # Use `or` so an empty string from the chain also falls back to the default.
    answer = result.get("answer") or "No answer found."
    sources = result.get("sources") or "No sources available."

    return answer, sources

# 5. Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# RockyBot: News Research Tool (Groq + Gradio)")

    # Show only the inputs that are actually passed to process_urls below.
    # Four further boxes (URL 4-7) used to be rendered here but were never
    # wired to the click handler, so whatever was typed in them was ignored.
    with gr.Row():
        url1 = gr.Textbox(label="URL 1")
        url2 = gr.Textbox(label=" URL 2")
        url3 = gr.Textbox(label="URL 3")
    process_btn = gr.Button("Process URLs")
    status_output = gr.Textbox(label="Status")

    # The handler's return value (a status string) is shown in status_output.
    process_btn.click(process_urls, [url1, url2, url3], status_output)

    question_input = gr.Textbox(label="Ask a Question")
    answer_output = gr.Textbox(label="Answer")
    sources_output = gr.Textbox(label="Sources")

    # Submitting the question box fills both result boxes from the
    # (answer, sources) pair returned by answer_question.
    question_input.submit(answer_question, question_input, [answer_output, sources_output])

# Launch the app
demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://fbdc9fb952c0b29232.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [4]:
pip install langchain-groq




In [None]:
!pip install langchain langchain-community langchain-groq gradio faiss-cpu sentence-transformers unstructured requests


Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting unstructured
  Downloading unstructured-0.18.11-py3-none-any.whl.metadata (24 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting emoji (from unstruc

In [5]:
import os
import pickle
import time
import gradio as gr

from langchain_groq import ChatGroq
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from google.colab import userdata
# 1. Set Groq API Key
# Export the *value* of the Colab secret named 'News_API'. The secret's name
# itself is not a credential, so the looked-up value is what must be stored;
# previously the lookup result was discarded and the name was exported instead.
os.environ["GROQ_API_KEY"] = userdata.get('News_API')

# 2. Initialize LLM (Groq)
# NOTE(review): confirm that "llama3-70b-8192" is still offered by Groq before
# relying on it - hosted model ids get retired over time.
llm = ChatGroq(
    api_key=os.environ["GROQ_API_KEY"],
    model_name="llama3-70b-8192",  # Groq supported model
    temperature=0.7
)

# Location of the pickled FAISS store: written by process_urls, read back by
# answer_question.
file_path = "faiss_store_groq.pkl"
# Module-level placeholder; answer_question binds its own local of this name.
vectorstore = None

# 3. Function to process URLs and build FAISS index
def process_urls(url1, url2, url3, *extra_urls):
    """Fetch the given URLs, index their text with FAISS and save the index.

    Args:
        url1, url2, url3: URL strings from the UI. Empty, whitespace-only or
            ``None`` entries are ignored.
        *extra_urls: Optional additional URLs, handled the same way, so a
            caller can wire in more input boxes without changing this function.

    Returns:
        A human-readable status string. Failures are reported through this
        string rather than raised, because the value is shown in the UI.
    """
    # Strip before use: a pasted URL with stray spaces would otherwise be
    # handed to the loader verbatim.
    candidates = (url1, url2, url3, *extra_urls)
    urls = [u.strip() for u in candidates if u and u.strip()]
    if not urls:
        return "Please provide at least one valid URL."

    loader = UnstructuredURLLoader(urls=urls)
    try:
        data = loader.load()
    except Exception as e:
        return f"Error loading URLs: {e}"

    if not data:
        return "No content fetched from the provided URLs."

    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000
    )
    docs = text_splitter.split_documents(data)
    if not docs:
        # Pages were fetched but produced no text chunks; nothing to index.
        return "No content fetched from the provided URLs."

    # Write to a temporary file and rename it into place, so a failure while
    # embedding or pickling never leaves a truncated index at file_path.
    tmp_path = f"{file_path}.tmp"
    try:
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        vectorstore_groq = FAISS.from_documents(docs, embeddings)
        with open(tmp_path, "wb") as f:
            pickle.dump(vectorstore_groq, f)
        os.replace(tmp_path, file_path)
    except Exception as e:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        return f"Error building index: {e}"

    return "Processing completed! You can now ask questions."

# 4. Function to answer a question from the saved FAISS index
def answer_question(question):
    """Answer a question using the index previously saved at ``file_path``.

    Args:
        question: The user's question text.

    Returns:
        A ``(answer, sources)`` tuple of strings. When something goes wrong the
        first element carries the message and the second is an empty string.
    """
    if not question or not question.strip():
        return "Please enter a question.", ""

    if not os.path.exists(file_path):
        return "Please process URLs first!", ""

    try:
        # SECURITY NOTE: unpickling runs arbitrary code from the file. This is
        # acceptable only because the file is produced locally by
        # process_urls; never point file_path at an untrusted file.
        with open(file_path, "rb") as f:
            store = pickle.load(f)
    except Exception as e:
        return f"Error loading index: {e}", ""

    try:
        chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=store.as_retriever())
        # invoke() replaces the deprecated direct call on the chain object.
        result = chain.invoke({"question": question.strip()}, return_only_outputs=True)
    except Exception as e:
        return f"Error generating answer: {e}", ""

    # Use `or` so an empty string from the chain also falls back to the default.
    answer = result.get("answer") or "No answer found."
    sources = result.get("sources") or "No sources available."

    return answer, sources

# 5. Gradio Interface
# Layout, top to bottom: title, one row holding the three URL inputs, the
# "Process URLs" button with its status box, then the question box followed by
# the answer and sources boxes.
with gr.Blocks() as demo:
    gr.Markdown("# RockyBot: News Research Tool (Groq + Gradio)")

    with gr.Row():
        # Created inside the row context so the three boxes sit side by side.
        url_inputs = [gr.Textbox(label=text) for text in ("URL 1", " URL 2", "URL 3")]
    url1, url2, url3 = url_inputs

    process_btn = gr.Button("Process URLs")
    status_output = gr.Textbox(label="Status")

    # Clicking the button sends the three URL values to process_urls and shows
    # the string it returns in the status box.
    process_btn.click(fn=process_urls, inputs=url_inputs, outputs=status_output)

    question_input = gr.Textbox(label="Ask a Question")
    answer_output = gr.Textbox(label="Answer")
    sources_output = gr.Textbox(label="Sources")

    # Submitting the question box routes the two values returned by
    # answer_question to the answer and sources boxes.
    question_input.submit(
        fn=answer_question,
        inputs=question_input,
        outputs=[answer_output, sources_output],
    )

# Launch the app
demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://125ea3c6a0c22343a7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


