# Data Ingestion 

- ### Using the Wikipedia article on "Volcano"

In [None]:
from langchain_community.document_loaders import WebBaseLoader
from bs4 import BeautifulSoup,SoupStrainer
# Parse only elements carrying the main-content classes, i.e. scrape only the
# main content of the page.
main_content_only = SoupStrainer(class_=["mw-content-ltr mw-parser-output"])

loader = WebBaseLoader(
    web_path="https://en.wikipedia.org/wiki/Volcano",
    bs_kwargs={"parse_only": main_content_only},
)

# NOTE(review): this disables SSL certificate verification for the request,
# which removes protection against tampered responses. Confirm it is still
# needed (e.g. for a local proxy) before keeping it.
loader.requests_kwargs = {'verify': False}

# Fetch the page and turn it into LangChain documents.
raw_docs = loader.load()

## Cleaning noisy data

In [None]:
import re 

# Work on the text of the first loaded document.
raw_text=raw_docs[0].page_content
def clean_wikipedia_text(text):
    """Strip Wikipedia rendering noise from scraped article text.

    Removes numeric citation markers such as ``[1]`` or ``[24]``, removes
    the ``[edit]`` link text attached to section headers, and collapses runs
    of line breaks into a single newline.

    Args:
        text: Raw article text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # 1. Remove citation brackets like [1], [24].
    text = re.sub(r'\[\d+\]', '', text)

    # 2. Remove the [edit] text from headers.
    text = re.sub(r'\[edit\]', '', text)

    # 3. Collapse consecutive line breaks into one. Whitespace other than a
    #    newline (spaces, tabs, non-breaking spaces) sitting between two
    #    breaks is swallowed as well; otherwise lines that merely look blank
    #    would keep the breaks apart and survive the cleanup.
    text = re.sub(r'\n(?:[^\S\n]*\n)+', '\n', text)

    # 4. Remove leading/trailing whitespace.
    return text.strip()

# Apply the cleanup to the scraped article text.
cleaned_text = clean_wikipedia_text(raw_text)

# Chunking

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunk size and overlap handed to the splitter. The length function and the
# separator list are not passed, so the splitter's own defaults apply.
splitter_options = {
    "chunk_size": 500,
    "chunk_overlap": 80,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the cleaned article text into a list of chunk strings.
chunks = text_splitter.split_text(cleaned_text)

# Metadata Enrichment

In [None]:
from langchain_core.documents import Document
# Metadata common to every chunk; the per-chunk "chunk_id" (the chunk's
# position in `chunks`) is appended when each Document is built.
shared_metadata = {"source": "wikipedia", "topic": "volcano"}

enriched_docs = [
    Document(
        page_content=piece,
        metadata={**shared_metadata, "chunk_id": position},
    )
    for position, piece in enumerate(chunks)
]

## Viewing Cleaned Chunk Data 

In [None]:
# Report how many documents were built.
doc_count = len(enriched_docs)
print(f"We get {doc_count} documents with metadata.")

# Show the metadata of the first document as a sample.
sample_metadata = enriched_docs[0].metadata
print(f"Example Metadata: {sample_metadata}")

# Embedding

In [None]:

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


# Embedding model wrapper; it is handed to the vector store to embed the chunks.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Vector DB


In [None]:

# Build the FAISS vector store from the enriched documents and the embedding model.
vector_db = FAISS.from_documents(enriched_docs, embeddings)

## Reranking

In [None]:
from langchain_classic.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_community.document_compressors import FlashrankRerank
from langchain_openai import ChatOpenAI

# First stage: retriever over the vector store, asked for 10 results per query.
retrieval_options = {"k": 10}
base_retriever = vector_db.as_retriever(search_kwargs=retrieval_options)

# Second stage: FlashRank reranker, constructed with its default settings.
compressor = FlashrankRerank()

# Combine both stages: documents from the base retriever are passed through
# the reranker before being returned.
rerank_retriever = ContextualCompressionRetriever(
    base_retriever=base_retriever,
    base_compressor=compressor,
)

## Using the OpenAI API key for generation

In [None]:
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

# Load variables from a .env file into the environment.
load_dotenv()

# Fail early with a clear message when the key is missing or empty. The key
# is only checked here; it is not passed to ChatOpenAI explicitly.
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("OPENAI_API_KEY not found. Check your .env file!")

# Chat model used for answer generation; temperature 0 is requested.
llm = ChatOpenAI(temperature=0, model="gpt-4o")



In [None]:
from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Chat model for generation (same settings as the model created in the
# API-key cell).
llm = ChatOpenAI(temperature=0, model="gpt-4o")

# System message: role and instruction, followed by the "{context}"
# placeholder.
system_prompt = "".join([
    "You are an expert assistant for geologic research. ",
    "Use the provided context to answer the question concisely.",
    "\n\n",
    "{context}",
])

# The human message carries the user's question via the "{input}" placeholder.
chat_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(chat_messages)

# Answering chain built from the model and prompt, then wired to the
# reranking retriever to form the full RAG chain.
combine_docs_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(rerank_retriever, combine_docs_chain)



# Testing Query

In [None]:
# Run one sample question through the RAG chain.
question = "What are the long-term environmental impacts of volcanic eruptions?"
response = rag_chain.invoke({"input": question})

# Print the generated answer from the chain's response.
answer = response['answer']
print(f"Answer: {answer}")