In [None]:
!pip install -q pyngrok

In [None]:
!pip install -q google-generativeai

In [None]:
!pip install -q unstructured

In [None]:
!pip install -q tiktoken

In [None]:
!pip install -q streamlit langchain openai faiss-cpu

In [None]:
!pip install -q langchain_community

In [None]:
# Mount Google Drive at /content/drive so files stored there are readable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from dotenv import load_dotenv

# Path to the env file on the mounted Drive
path_env = '/content/drive/MyDrive/Colab Notebooks/ArtiQuery project/env'

# Load environment variables. load_dotenv() returns False when the file is
# missing or defines no variables; fail here rather than letting later cells
# run with os.getenv(...) returning None for the API keys.
if not load_dotenv(path_env):
    raise RuntimeError(
        f"No environment variables were loaded from {path_env!r}. "
        "Check that Google Drive is mounted and that the file exists and is not empty."
    )

True

In [None]:
!pip install -q  langchain-google-genai

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
%%writefile app.py

import os
import streamlit as st
import pickle
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS


# Streamlit UI
st.title("ArtiQuery 🧐")
st.sidebar.title("Article URLs")

# Collect URL inputs (ignoring empty values)
urls = [url for url in [
    st.sidebar.text_input(f"URL {i+1}", key=f"url_{i}") for i in range(3)
] if url]

process_url_clicked = st.sidebar.button("Process URLs")

# Define file path for saving the FAISS index
file_path = "faiss_store.pkl"

# Initialize the Gemini model using ChatGoogleGenerativeAI
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro-latest", google_api_key=os.getenv("GEMINI_API_KEY"))

# Use session state to manage FAISS readiness
if "faiss_ready" not in st.session_state:
    st.session_state.faiss_ready = False

if process_url_clicked and urls:
    st.info("Loading and processing articles... ⌛")
    loaders = UnstructuredURLLoader(urls=urls)
    data = loaders.load()

    # Split documents into manageable chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = text_splitter.split_documents(data)

    # Create embeddings and build FAISS index
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    faiss_index = FAISS.from_documents(docs, embeddings)

    # Save FAISS index to file
    with open(file_path, "wb") as f:
        pickle.dump(faiss_index, f)

    st.session_state.faiss_ready = True
    st.success("Documents processed! You can now ask your question 👇")

if st.session_state.faiss_ready:
    query = st.text_input("Ask a question about the articles:")
    if query:
        # Load FAISS index from file
        with open(file_path, "rb") as f:
            vectorstore = pickle.load(f)

        # Retrieve relevant documents
        retriever = vectorstore.as_retriever()
        retrieved_docs = retriever.get_relevant_documents(query)

        # Combine document content for context
        context = "\n\n".join([doc.page_content for doc in retrieved_docs])

        # Prompt with detailed instruction and clear separation
        prompt = (
            "You are a knowledgeable and reliable expert assistant skilled in synthesizing complex information. "
            "Below is the aggregated context extracted from various documents followed by a question. "
            "Using the provided context, please generate a clear, comprehensive, and concise answer. "
            "If relevant, include actionable insights and cite key points from the context. \n\n"
            "Context:\n"
            f"{context}\n\n"
            "Question:\n"
            f"{query}\n\n"
            "Answer:"
        )

        # Get the answer from the Gemini model
        response = llm.invoke(prompt)

        st.header("Answer")
        st.write(response.content)

        # Deduplicate and display the source URLs
        sources = [doc.metadata.get("source", "") for doc in retrieved_docs if doc.metadata.get("source", "")]
        unique_sources = list(set(sources))
        st.markdown("### Sources")
        for src in unique_sources:
            st.markdown(f"- [{src}]({src})")


Overwriting app.py


In [None]:
# Step 3: Run the app with ngrok
import os
import socket
import time

from dotenv import load_dotenv
from pyngrok import ngrok

STREAMLIT_PORT = 8501
STARTUP_TIMEOUT_S = 60  # upper bound on how long to wait for Streamlit to come up


def _port_is_open(port, host="127.0.0.1"):
    """Return True when something accepts TCP connections on host:port."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.settimeout(1.0)
        return sock.connect_ex((host, port)) == 0


def _wait_for_port(port, timeout):
    """Poll until the port accepts connections; return False if `timeout` seconds pass first."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if _port_is_open(port):
            return True
        time.sleep(0.5)
    return False


# Set ngrok auth token (fail early with a clear message if it was never loaded)
auth_token = os.getenv("NGROK_AUTH_TOKEN")
if not auth_token:
    raise RuntimeError(
        "NGROK_AUTH_TOKEN is not set; run the cell that loads the env file first."
    )
ngrok.set_auth_token(auth_token)

# Kill existing tunnels
ngrok.kill()

# Start Streamlit server, unless one started by an earlier run of this cell is
# still listening (a second instance could not bind the same port).
if not _port_is_open(STREAMLIT_PORT):
    get_ipython().system_raw(f'streamlit run app.py --server.port {STREAMLIT_PORT} &')

# Wait until the server actually accepts connections instead of sleeping a fixed time
if not _wait_for_port(STREAMLIT_PORT, STARTUP_TIMEOUT_S):
    raise RuntimeError(
        f"Streamlit did not start listening on port {STREAMLIT_PORT} "
        f"within {STARTUP_TIMEOUT_S} seconds."
    )

# Create ngrok tunnel; `public_url` stays the NgrokTunnel object, its
# .public_url attribute is the HTTPS address to open in the browser.
public_url = ngrok.connect(addr=str(STREAMLIT_PORT), bind_tls=True)
print(f"Streamlit app running at: {public_url.public_url}")

Streamlit app running at: NgrokTunnel: "https://91c8-35-237-30-54.ngrok-free.app" -> "http://localhost:8501"
