In [None]:
!pip install langchain-community
!pip install langchain-experimental
!pip install langchain-groq # install the missing module
!pip install faiss-cpu
!pip install wikipedia

In [None]:
import os
import uuid
import json
from langchain_community.retrievers import WikipediaRetriever
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from sentence_transformers import SentenceTransformer, util
import faiss
from bs4 import BeautifulSoup

# Set API keys.
# SECURITY: never paste real keys into source files or comments. Any key that
# has ever been committed must be treated as compromised and rotated.
# Export the real keys in the environment before running; the placeholders
# below are only fallbacks, so an already-configured environment is not
# overwritten.
os.environ.setdefault("LANGSMITH_API_KEY", "YOUR_LANGSMITH_KEY")
os.environ.setdefault("LANGSMITH_TRACING", "true")
os.environ.setdefault("GROQ_API_KEY", "YOUR_GROQ_API_KEY")

# Initialize components
retriever = WikipediaRetriever()
llm = ChatGroq(model="llama3-70b-8192")
embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Pre-trained embedding model

# Strip markup from an HTML string, keeping only its text
def parse_html(html):
    """Return the text content of *html* as extracted by BeautifulSoup.

    The document is parsed with the ``html.parser`` backend and the result
    of ``get_text()`` is returned unchanged.
    """
    return BeautifulSoup(html, features="html.parser").get_text()

def format_docs(docs):
    """Join the tag-stripped ``page_content`` of every document in *docs*.

    Each document's content is passed through ``parse_html`` and the
    resulting texts are separated by a blank line.
    """
    plain_texts = [parse_html(document.page_content) for document in docs]
    return "\n\n".join(plain_texts)

# Semantic chunking
def _make_chunk(sentences):
    """Build one chunk record (fresh UUID + joined text) from *sentences*."""
    return {
        "chunk_id": str(uuid.uuid4()),
        "chunk_text": ". ".join(sentences).strip(),
    }


def semantic_chunking(text, similarity_threshold=0.75, model_name='all-MiniLM-L6-v2'):
    """Group consecutive sentences of *text* into semantically similar chunks.

    The text is split on ``". "``. Each sentence is embedded and compared
    (cosine similarity) with the mean embedding of the chunk currently being
    built: if the score reaches *similarity_threshold* the sentence joins the
    chunk, otherwise the chunk is closed and the sentence starts a new one.

    Args:
        text: The text to chunk. Empty or whitespace-only input yields ``[]``.
        similarity_threshold: Minimum cosine similarity with the running
            chunk mean for a sentence to be appended to that chunk.
        model_name: Name of the SentenceTransformer model used for embedding.

    Returns:
        A list of dicts with keys ``"chunk_id"`` (a UUID4 string) and
        ``"chunk_text"`` (the chunk's sentences joined with ``". "``).
    """
    if not text:
        return []

    # Split text into sentences, dropping blank fragments so they are neither
    # embedded nor emitted as empty chunks.
    sentences = [part for part in text.split(". ") if part.strip()]
    if not sentences:
        # Nothing to embed: return before paying for the model load.
        return []

    model = SentenceTransformer(model_name)
    sentence_embeddings = model.encode(sentences)

    chunks = []
    current_chunk = []
    # Running sum of the current chunk's embeddings; dividing by the chunk
    # length gives the same mean as re-summing every time, without the
    # repeated work.
    embedding_sum = None

    for sentence, embedding in zip(sentences, sentence_embeddings):
        if current_chunk:
            chunk_embedding_avg = embedding_sum / len(current_chunk)
            similarity_score = util.cos_sim(embedding, chunk_embedding_avg).item()

            if similarity_score >= similarity_threshold:
                current_chunk.append(sentence)
                # Not "+=": that would mutate the row of sentence_embeddings
                # that embedding_sum currently aliases.
                embedding_sum = embedding_sum + embedding
                continue

            # Too dissimilar: finalize the current chunk.
            chunks.append(_make_chunk(current_chunk))

        current_chunk = [sentence]
        embedding_sum = embedding

    if current_chunk:
        chunks.append(_make_chunk(current_chunk))

    return chunks

# Store chunks in FAISS vector database
def store_in_faiss(chunks):
    """Embed every chunk's text and add the vectors to a new FAISS L2 index.

    Args:
        chunks: A non-empty list of dicts, each with a ``"chunk_text"`` key.

    Returns:
        A ``(index, chunks)`` tuple: the populated ``faiss.IndexFlatL2`` and
        the same *chunks* list that was passed in.

    Raises:
        ValueError: If *chunks* is empty. Without this check the failure
            would surface later as an opaque indexing error when the
            embedding dimension is read.
    """
    if not chunks:
        raise ValueError("Cannot build a FAISS index from an empty chunk list.")

    texts = [chunk["chunk_text"] for chunk in chunks]
    embeddings = embedder.encode(texts)

    # The index dimensionality is taken from the embedding matrix itself.
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    return index, chunks

# Re-rank outputs to return the top results (5 by default)
def rerank_outputs(query, chunks, top_k=5):
    """Rank *chunks* by cosine similarity to *query* and return the best ones.

    Args:
        query: The query string to compare against.
        chunks: A list of dicts with ``"chunk_id"`` and ``"chunk_text"`` keys.
        top_k: Maximum number of results to return (non-negative; defaults
            to 5). ``None`` returns every chunk, ranked.

    Returns:
        A list of ``(score, chunk_id, chunk_text)`` tuples sorted by
        descending score, at most *top_k* long. Empty when *chunks* is empty.
    """
    if not chunks:
        return []

    query_embedding = embedder.encode(query)
    chunk_embeddings = embedder.encode([chunk["chunk_text"] for chunk in chunks])

    # cos_sim returns a (1, n_chunks) matrix. Take row 0 instead of calling
    # squeeze(): with a single chunk, squeeze() collapses the result to a
    # scalar and tolist() then yields a float that cannot be zipped.
    scores = util.cos_sim(query_embedding, chunk_embeddings)[0].tolist()

    # Combine scores with chunk IDs
    scored_chunks = [
        (score, chunk["chunk_id"], chunk["chunk_text"])
        for score, chunk in zip(scores, chunks)
    ]

    # Sort by score (highest first) and keep the top results
    ranked = sorted(scored_chunks, key=lambda x: x[0], reverse=True)
    return ranked[:top_k]
# Prompt asking the model for Wikipedia link(s) about the topic of a question;
# the only template variable is {question}.
_URL_RETRIEVAL_TEMPLATE = """
    Given the following question, find and return the Wikipedia URL(s) for the topic(s):
    Question: {question}
    Please provide the Wikipedia page link(s) related to this topic.
    """
url_retrieval_prompt = ChatPromptTemplate.from_template(_URL_RETRIEVAL_TEMPLATE)

# Rewrite content using LLM
def rewrite_chunk_with_llm(chunk_text, query):
    """Ask the LLM for a clearer rewrite of *chunk_text*.

    Args:
        chunk_text: The passage to rewrite.
        query: The user's query. Kept for interface compatibility only: the
            prompt template has no ``{query}`` placeholder, so this value
            does not influence the prompt.

    Returns:
        The model's reply content with surrounding whitespace stripped.
    """
    rewrite_prompt = ChatPromptTemplate.from_template(
        """
        Rewrite the following content:

        Content: {chunk_text}

        Please provide a rewritten version of the content in a clear and more understandable.
        donot give this type of things in the response - "Here is a rewritten version of the content:"
        donot give Detailed Timeline,Related Topics,References
        Respond with the rewritten content only.
        """
    )
    # Hand the chat messages to the model directly. Rendering the prompt with
    # to_string() flattens it into a transcript with a "Human: " role prefix,
    # which would then be sent as part of the user message itself.
    messages = rewrite_prompt.format_messages(chunk_text=chunk_text)

    # Invoke the LLM with the prompt
    response = llm.invoke(messages)

    # Extract the content directly from the response object
    return response.content.strip()

# Main execution: retrieve Wikipedia documents for a query, chunk and rank
# them, print an LLM rewrite of each top chunk, then ask the LLM for URLs.
try:
    query = input("Enter your query: ")
    retrieved_docs = retriever.invoke(query)
    if not retrieved_docs:
        raise ValueError("No documents retrieved!")

    formatted_context = format_docs(retrieved_docs)
    retrieved_chunks = semantic_chunking(formatted_context, similarity_threshold=0.75)
    faiss_index, faiss_chunks = store_in_faiss(retrieved_chunks)
    ranked_results = rerank_outputs(query, faiss_chunks)

    # Results are already in rank order; only the id and text are printed.
    for _score, chunk_id, chunk_text in ranked_results:
        rewritten_text = rewrite_chunk_with_llm(chunk_text, query)
        print(f"\n{rewritten_text}\n<{chunk_id}>\n")

    # Send chat messages rather than the stringified prompt: the string form
    # carries a "Human: " role prefix that would leak into the user message.
    # NOTE(review): these URLs are generated by the model, not taken from
    # retrieved_docs, so they may not match the retrieved pages.
    url_prompt = url_retrieval_prompt.format_messages(question=query)
    url_response = llm.invoke(url_prompt).content.strip()

    print("\nRelevant Wikipedia URLs:")
    print(url_response)

except Exception as e:
    # Top-level boundary of the script: report the failure instead of a traceback.
    print("Error during execution:", str(e))

