# INSTALLING REQUIRED LIBRARIES


In [None]:
# Install required libraries
!pip install langchain langchain_community sentence-transformers chromadb

!pip uninstall fitz -y
!pip install pymupdf
!pip install re

# DIVIDING DATA INTO CHUNKS

In [None]:
import os
import shutil
import pymupdf
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Paths (both are left empty here and must be filled in before running)
CHROMA_PATH = "" # Directory where the chunked, embedded data is saved; save_to_chroma() deletes it first if it exists
PDF_PATH = ""  # Path to the PDF file that load_pdf() reads

# Section header patterns: an alternation of the literal section titles to cut the PDF text at.
# The whole alternation is a single capturing group, so re.split() returns the
# matched titles interleaved with the text between them.
SECTION_PATTERN = r"(Champions Trophy Overview|ICC Champions Trophy 2025 – Schedule|Champions Trophy Winners List|All-Time Batting Stats|All-Time Bowling Stats|Most Runs in Champions Trophy|Most Wickets in Champions Trophy|Highest Team Totals|Champions Trophy 2025 Squads)"

def main():
    """Entry point of the indexing cell: build the vector store."""
    generate_data_store()

def generate_data_store():
    """Run the indexing pipeline: load PDF_PATH, chunk it, and save the chunks to Chroma."""
    section_docs = load_pdf(PDF_PATH)
    save_to_chroma(split_text(section_docs))

def load_pdf(file_path: str):
    """Read a PDF and return one Document per recognised section.

    The text of every page is concatenated and then cut at each match of
    SECTION_PATTERN. Each header is paired with the text that follows it;
    text that precedes the first header is kept under "Unknown Section".

    Args:
        file_path: Path to the PDF file to read.

    Returns:
        A list of Document objects. Each document's content is the section
        header followed by the section text, and its metadata carries the
        PDF's base name ("filename") and the header ("section"). Sections
        whose text is empty after stripping are omitted.
    """
    with pymupdf.open(file_path) as pdf_doc:
        text_content = "".join(page.get_text("text") + "\n" for page in pdf_doc)

    # SECTION_PATTERN is one capturing group, so re.split() interleaves the
    # matched headers with the text between them:
    #   [preamble, header_1, body_1, header_2, body_2, ...]
    # Pair each header with the body that follows it instead of treating every
    # list element as a section body.
    parts = re.split(SECTION_PATTERN, text_content)
    titled_bodies = [(None, parts[0])]
    titled_bodies.extend(zip(parts[1::2], parts[2::2]))

    filename = os.path.basename(file_path)
    documents = []
    for header, body in titled_bodies:
        cleaned_section = body.strip()
        if not cleaned_section:
            # Nothing follows this header (or there is no preamble).
            continue
        if header is None:
            section_title = "Unknown Section"
            page_content = cleaned_section
        else:
            section_title = header
            # Keep the header in the text so it is not lost from the document.
            page_content = f"{header}\n{cleaned_section}"
        documents.append(
            Document(
                page_content=page_content,
                metadata={"filename": filename, "section": section_title},
            )
        )

    return documents

def split_text(documents: list[Document]):
    """Split section documents into overlapping chunks.

    Args:
        documents: Section-level documents to split.

    Returns:
        The chunks produced by RecursiveCharacterTextSplitter (1500-character
        chunks with a 100-character overlap, start index recorded).
    """
    splitter_options = {
        "chunk_size": 1500,  # Increased chunk size for better retrieval
        "chunk_overlap": 100,
        "length_function": len,
        "add_start_index": True,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = splitter.split_documents(documents)
    print(f"Split {len(documents)} sections into {len(chunks)} chunks.")
    return chunks

def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma store at CHROMA_PATH from the given chunks.

    Whatever already exists at CHROMA_PATH is deleted first, then the chunks
    are embedded with the BAAI/bge-small-en model and persisted.

    Args:
        chunks: Documents to embed and store.
    """
    store_exists = os.path.exists(CHROMA_PATH)
    if store_exists:
        # Start from a clean directory so stale chunks are not kept.
        shutil.rmtree(CHROMA_PATH)

    vector_store = Chroma.from_documents(
        chunks,
        HuggingFaceEmbeddings(model_name="BAAI/bge-small-en"),
        persist_directory=CHROMA_PATH,
    )
    vector_store.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

# Build the vector store when this cell/script is executed directly.
if __name__ == "__main__":
    main()


# RUNNING INFERENCE ON DATA USING MISTRAL MODEL

In [None]:
import os
import json
from huggingface_hub import InferenceClient
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate

# Directory of the persisted Chroma store to query (left empty here; must be filled in)
CHROMA_PATH = ""

# Hugging Face API Key (Replace with your actual key)
HF_API_KEY = ""

# Inference client bound to the Mistral instruct model and authenticated with HF_API_KEY
client = InferenceClient(model="mistralai/Mistral-Small-24B-Instruct-2501", token=HF_API_KEY)

# Prompt template; query_database() fills in {context} and {question}.
# NOTE(review): the instruction says to answer "based only on" the context but
# also allows falling back to the model's own knowledge — confirm which is intended.
PROMPT_TEMPLATE = """
Answer the question based only on the following context. If the answer is not present in the context, answer based on your own knowledge.

{context}

---

Answer the question: {question}
"""

def query_database(query_text):
    """Answer a question from the Chroma store with the Mistral model and print it.

    The ten most similar chunks are retrieved. If the best relevance score is
    below 0.7 (or nothing is returned) the model is told that no relevant
    context was found; otherwise all retrieved chunks are used as context.

    Args:
        query_text: The user's question.

    Returns:
        None. The formatted response and the list of sources are printed.
    """
    # Use BAAI/bge-small-en embeddings (Local, No API Required)
    embedding_function = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")

    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Search for relevant chunks
    results = db.similarity_search_with_relevance_scores(query_text, k=10)

    # Assumes results are ordered best-first, so the first score is the highest.
    if not results or results[0][1] < 0.7:
        context_docs = []
        context_text = "No relevant context found."
    else:
        context_docs = [doc for doc, _score in results]
        context_text = "\n\n---\n\n".join(doc.page_content for doc in context_docs)

    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    # Use the text-generation task method instead of the raw `post` call, and
    # the task's `max_new_tokens` parameter instead of `max_tokens`.
    try:
        response_text = client.text_generation(prompt, max_new_tokens=500)
    except Exception as e:
        # API boundary: report the failure instead of crashing the cell.
        response_text = f"❌ Error generating response: {e}"

    # Only cite chunks that were actually given to the model. The indexing
    # step stores the file name under "filename", so fall back to that key.
    sources = [
        doc.metadata.get("source") or doc.metadata.get("filename", "Unknown Source")
        for doc in context_docs
    ]
    formatted_response = f"Response: {response_text}\n\n\nSources: {sources}"
    print(formatted_response)

# Prompt for a question and answer it when this cell/script is executed directly.
if __name__ == "__main__":
    query = input("🔎 Enter your query: ")
    query_database(query)

# RUNNING INFERENCE ON DATA USING GEMMA MODEL



In [None]:
import os
import json
from huggingface_hub import InferenceClient
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# Directory of the persisted Chroma store to query (left empty here; must be filled in)
CHROMA_PATH = ""

# Hugging Face API Key (Replace with your actual key)
HF_API_KEY = ""

# Chat model name passed to the chat-completions call in query_database()
HF_MODEL = "google/gemma-2-9b-it"

# Inference client configured with the "together" provider and authenticated with HF_API_KEY
client = InferenceClient(provider="together", api_key=HF_API_KEY)

# Prompt template for the user message; query_database() fills in {context}
# and {question} with str.format().
PROMPT_TEMPLATE = """
You are an expert cricket analyst. Answer the user's question **only** using the given context.
If the answer is **not in the context**, say: "The answer is not available in the provided information."

---

Context:
{context}

---

Question: {question}
"""

def query_database(query_text, top_k=10, min_relevance=0.7):
    """Answer a question from the Chroma store with the Gemma chat model and print it.

    Args:
        query_text: The user's question.
        top_k: Number of chunks to retrieve from the vector store.
        min_relevance: Minimum relevance score a chunk needs to be used as context.

    Returns:
        None. The formatted response and its sources are printed.
    """
    # Load the embedding model
    embedding_function = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")

    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Search for relevant chunks and keep only those that are relevant enough
    results = db.similarity_search_with_relevance_scores(query_text, k=top_k)
    filtered_results = [doc for doc, score in results if score >= min_relevance]

    if filtered_results:
        context_text = "\n\n---\n\n".join(doc.page_content for doc in filtered_results)
    else:
        context_text = "No relevant context found."

    # Prepare chat messages
    messages = [
        {"role": "system", "content": "You are a helpful cricket analyst AI."},
        {"role": "user", "content": PROMPT_TEMPLATE.format(context=context_text, question=query_text)}
    ]

    try:
        completion = client.chat.completions.create(
            model=HF_MODEL,
            messages=messages,
            max_tokens=500
        )

        # Extract response safely
        response_text = completion.choices[0].message.content if completion.choices else "❌ Error: No response."

    except Exception as e:
        # API boundary: report the failure instead of crashing the cell.
        response_text = f"❌ API Error: {str(e)}"

    # The indexing step stores the file name under "filename", so fall back to
    # that key when "source" is absent.
    sources = [
        str(doc.metadata.get("source") or doc.metadata.get("filename", "Unknown Source"))
        for doc in filtered_results
    ]
    # De-duplicate while keeping retrieval order (a set would print in arbitrary order).
    unique_sources = list(dict.fromkeys(sources))
    formatted_response = f"📌 **Response:**\n{response_text}\n\n📚 **Sources:** {', '.join(unique_sources)}"

    print(formatted_response)

# Prompt for a question and answer it when this cell/script is executed directly.
if __name__ == "__main__":
    query = input("🔎 Enter your query: ")
    query_database(query)

