In [1]:
import sys
# Show which Python interpreter this kernel is running on, so the
# environment in use can be confirmed before installing/importing packages.
interpreter_path = sys.executable
print(interpreter_path)

/usr/local/bin/python3


In [3]:
import torch


In [4]:
import requests
from bs4 import BeautifulSoup

def extract_text_from_url(url):
    """Download a web page and return the text of its paragraphs and headings.

    Args:
        url: Address of the page to fetch.

    Returns:
        A single string made of the text of every ``<p>`` and ``<h1>``-``<h6>``
        element, joined with single spaces, or ``None`` when fetching or
        parsing raised an exception (a message is printed in that case).
    """
    content_tags = ["p", "h1", "h2", "h3", "h4", "h5", "h6"]
    try:
        page = requests.get(url, timeout=10)
        # Turn 4xx/5xx responses into exceptions so they hit the handler below
        page.raise_for_status()
        parsed = BeautifulSoup(page.text, "html.parser")

        # Collect the text of paragraph and heading elements only
        fragments = (element.get_text() for element in parsed.find_all(content_tags))
        return " ".join(fragments)
    except Exception as exc:
        # Best-effort scraping: report the failure and let the caller skip this URL
        print(f"Failed to fetch {url}: {exc}")
        return None

# Pages that make up the knowledge base
urls = [
    "https://en.wikipedia.org/wiki/Hampi",
    "https://whc.unesco.org/en/list/241/#",
]

# Fetch every page, keeping only those that yielded non-empty text
# (failed fetches return None and are skipped).
documents = {}
for page_url in urls:
    page_text = extract_text_from_url(page_url)
    if page_text:
        documents[page_url] = page_text

print(documents)


{'https://en.wikipedia.org/wiki/Hampi': 'Contents Hampi \n Hampi or Hampe (Kannada: [hɐmpe]), also referred to as the Group of Monuments at Hampi, is a UNESCO World Heritage Site located in the town of Hampi in Vijayanagara district, east-central Karnataka, India.[2] Hampi predates the Vijayanagara Empire; it is mentioned in the Ramayana and the Puranas of Hinduism as Pampa Devi Tirtha Kshetra.[3][4] Hampi continues as a religious centre, with the Virupaksha Temple, an active Adi Shankara-linked monastery and various monuments belonging to the old city.[5][6]\n Hampi was the capital of the Vijayanagara Empire from 1336 to 1565 (as Vijayanagara), when it was abandoned.[3] It was a fortified city. Chronicles left by Persian and European travellers, particularly the Portuguese, say that Hampi was a prosperous, wealthy and grand city near the Tungabhadra River, with numerous temples, farms and trading markets. Hampi-Vijayanagara is estimated to be the world\'s second-largest city by 1500, 

In [6]:
from sentence_transformers import SentenceTransformer
import numpy as np
import torch

# Load a pre-trained embedding model; it runs on the GPU when CUDA is
# available and falls back to the CPU otherwise.
model = SentenceTransformer(
    "all-MiniLM-L6-v2",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# Encode every document in one batched call instead of one call per text.
# The model already lives on the chosen device, so no per-call `device`
# argument is needed. Row i of `embeddings` corresponds to the i-th value of
# `documents` (dicts preserve insertion order).
# NOTE(review): each whole page is embedded as a single vector; the model
# presumably truncates long inputs, so consider chunking the text first - confirm
# against the model card.
embeddings = model.encode(list(documents.values()), convert_to_numpy=True)

print("Embeddings shape:", embeddings.shape)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embeddings shape: (2, 384)


In [7]:
import chromadb

# Initialize a ChromaDB client that persists to ./chroma_db
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Create the collection, or reuse it if it already exists
collection = chroma_client.get_or_create_collection(name="text_embeddings")

# `embeddings` was built from `documents.values()`, so row i belongs to the
# i-th (url, text) pair; make sure the two have not drifted apart.
assert len(embeddings) == len(documents), "embeddings and documents are out of sync"

# Store the scraped page TEXT as the document. Iterating the dict directly
# yields its keys, which would store the URL in place of the text, so the
# values are used here and the URL is kept as metadata instead.
# `upsert` (rather than `add`) lets a re-run replace records already present
# in the persistent store under the same ids.
if documents:
    collection.upsert(
        ids=[str(i) for i in range(len(documents))],  # Unique ID per page
        documents=list(documents.values()),  # Original text
        metadatas=[{"source": url} for url in documents],  # Where the text came from
        embeddings=[vector.tolist() for vector in embeddings],  # Precomputed vectors
    )

print("Stored in ChromaDB!")


Stored in ChromaDB!


In [13]:
import os
from getpass import getpass

from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA

# Use Hugging Face embeddings instead of OpenAI. This is the same model that
# produced the stored vectors, so query and document embeddings are comparable.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Load stored vectors. The collection name is passed explicitly because the
# vectors were written to "text_embeddings"; without it the LangChain wrapper
# opens its own default collection and the retriever finds nothing.
vectorstore = Chroma(
    collection_name="text_embeddings",
    persist_directory="./chroma_db",
    embedding_function=embedding_model,
)

# Never hard-code the Groq API key in the notebook: read it from the
# GROQ_API_KEY environment variable, or prompt for it without echoing.
# Any key that was previously committed in this cell should be revoked.
groq_api_key = os.environ.get("GROQ_API_KEY") or getpass("Groq API key: ")

# Define Retrieval-Augmented Generation (RAG) pipeline using Groq
qa = RetrievalQA.from_chain_type(
    llm=ChatGroq(model="llama3-8b-8192", groq_api_key=groq_api_key),  # Change model as needed
    retriever=vectorstore.as_retriever()
)

def is_relevant_to_hampi(prompt):
    """Return True when the prompt mentions any Hampi-related keyword.

    Matching is a case-insensitive substring check, so "temples" or
    "Monumental" also count as hits.

    Args:
        prompt: The user's question as a string.

    Returns:
        True if at least one keyword occurs in the prompt, otherwise False.
    """
    keywords = ["Hampi", "monument", "Vijayanagara", "temple"]
    lowered_prompt = prompt.lower()
    for keyword in keywords:
        if keyword.lower() in lowered_prompt:
            return True
    return False

# Ask the user for a question and answer it only when it is on-topic
prompt = input("Enter the prompt: ")

if not is_relevant_to_hampi(prompt):
    # Off-topic questions are rejected without calling the LLM
    print("Your query is not relevant to Hampi cultural sites.")
else:
    # Retrieve matching documents and let the LLM answer from them
    result = qa.run(prompt)
    print(result)

Your query is not relevant to Hampi cultural sites.
