In [3]:
# index_creator.py
import pandas as pd
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pickle

# Load dataset
# NOTE(review): absolute, machine-specific Windows path — this cell only runs
# on the author's machine. A later cell reads a workbook with the same file
# name from "/content/..."; confirm which location is intended.
file_path = r"C:\Users\Dell\Downloads\CAI_RAG\DATA\Nestle_Financtial_report_till2023.xlsx"
# Read the workbook into a DataFrame; `df` is consumed by the indexing loop.
df = pd.read_excel(file_path)
# Preview the first rows as this cell's output.
df.head()

Unnamed: 0,Standalone Yearly Results,Dec '23,Dec '22,Dec '21,Dec '20,Dec '19
0,Net Sales/Income from operations,19126.3,16787.43,14633.72,13290.16,12295.27
1,Other Operating Income,--,77.63,75.69,59.87,73.63
2,Total Income From Operations,19126.3,16865.06,14709.41,13350.03,12368.9
3,EXPENDITURE,,,,,
4,Consumption of Raw Materials,8054.95,7652.11,6154.1,5554.24,5150.3


In [4]:

# Initialize SBERT model
# Sentence-embedding model later used to encode every entry of `sentences`.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Chunking mechanism: Sentence-based
# `sentences` holds one text chunk per (row, column) cell of `df`;
# `index_map` maps a chunk's position in `sentences` to its text.
# Both are filled by the indexing loop in a later cell.
sentences = []
index_map = {}


In [3]:

# Build one text chunk per (line item, period) cell of the report.
#
# The containers are rebuilt from scratch here so that re-running this cell
# never appends a second copy of every chunk to lists left over from a
# previous execution (which would also duplicate vectors in the FAISS index).
sentences = []
index_map = {}

label_col = df.columns[0]  # first column holds the line-item name
for _, row in df.iterrows():
    for col in df.columns[1:]:  # remaining columns are the reporting periods
        value = row[col]
        # Section-header rows such as "EXPENDITURE" have no figures; without
        # this check they are indexed as "... is: nan" and pollute retrieval.
        if pd.isna(value):
            continue
        text = f"{row[label_col]} - year  {col} is: {value}"
        # The key is the chunk's position in `sentences`, which is also the
        # position its embedding gets when the vectors are added in order.
        index_map[len(sentences)] = text
        sentences.append(text)


# Generate embeddings
# One embedding row per entry of `sentences`, returned as a NumPy array.
embeddings = model.encode(sentences, convert_to_numpy=True)

# Create FAISS index
# The index dimensionality is taken from the embedding width.
dim = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dim)
# Vectors are added in `sentences` order; the retriever looks up the ids
# returned by FAISS in `index_map`, which is keyed by that same position.
faiss_index.add(embeddings)

# Save index and mapping
# Both artifacts are written to the current working directory and are read
# back by the retriever cells.
faiss.write_index(faiss_index, "financial_faiss.index")
with open("index_map.pkl", "wb") as f:
    pickle.dump(index_map, f)

print("Indexing completed!")



Indexing completed!


In [7]:
# retriever.py
import faiss
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer

# Load FAISS index and index map
# Both files are looked up relative to the current working directory; they
# are the artifacts written by the index-creation cell.
faiss_index = faiss.read_index("financial_faiss.index")
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load an index_map.pkl that this project produced itself.
with open("index_map.pkl", "rb") as f:
    index_map = pickle.load(f)

# Load SBERT model
# Same model name as the one used to build the index, so query embeddings and
# indexed embeddings have the same dimensionality.
model = SentenceTransformer("all-MiniLM-L6-v2")

def query_faiss(query, top_k=5):
    """Return the indexed sentences closest to ``query``.

    Args:
        query: Free-text question or phrase to search for.
        top_k: Number of nearest neighbours requested from the FAISS index.

    Returns:
        A list of sentences from ``index_map`` in the order FAISS returned
        them. Ids that are not keys of ``index_map`` are silently dropped, so
        the list may be shorter than ``top_k``.
    """
    # The encoder works on batches, hence the single-element list.
    encoded_query = model.encode([query], convert_to_numpy=True)
    _, neighbour_ids = faiss_index.search(encoded_query, top_k)
    # Row 0 holds the neighbours of our only query vector.
    return [index_map[hit] for hit in neighbour_ids[0] if hit in index_map]

# Example usage
# The guard keeps this demo from running when the code is imported as a
# module; in a notebook cell or when run as a script it executes.
if __name__ == "__main__":
    query = "Total income is"
    # Uses query_faiss's default top_k=5.
    results = query_faiss(query)
    print("Top results:", results)


Top results: ["Total Income From Operations - year  Dec '20 is: 13350.03", "Total Income From Operations - year  Dec '22 is: 16865.06", "Total Income From Operations - year  Dec '19 is: 12368.9", "Total Income From Operations - year  Dec '21 is: 14709.41", "Other Income - year  Dec '20 is: 145.85"]


In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load Qwen model
# NOTE(review): trust_remote_code=True allows Python code shipped with the
# model repository to run locally — confirm "Qwen/Qwen-7B" is trusted.
# Model and tokenizer are loaded from the same checkpoint name so they match.
qwen_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)

# Function to generate answers
def generate_answer(context, question):
    """Answer ``question`` with the Qwen model, grounded on ``context``.

    Args:
        context: Retrieved passages already joined into a single string.
        question: The user's question.

    Returns:
        Only the text generated after the ``Answer:`` marker, with surrounding
        whitespace stripped. The prompt (context and question) is not echoed
        back.
    """
    input_text = f"Context: {context}\nQuestion: {question}\nAnswer:"
    inputs = qwen_tokenizer.encode(input_text, return_tensors="pt")
    prompt_length = inputs.shape[-1]

    # `max_length` counts the prompt tokens as well, so a prompt carrying
    # several retrieved passages could exhaust the budget before a single
    # answer token is produced. `max_new_tokens` bounds only the continuation.
    outputs = qwen_model.generate(inputs, max_new_tokens=100)

    # The generated sequence starts with the prompt ids; slice them off so the
    # caller receives the answer rather than prompt + answer.
    answer_ids = outputs[0][prompt_length:]
    return qwen_tokenizer.decode(answer_ids, skip_special_tokens=True).strip()

# Example: Answering a financial question
query = "What was the Net Sales/Income from operations in Dec'23?"
# Retrieve the nearest indexed sentences (query_faiss default top_k=5) and
# join them with spaces into one context string for the prompt.
retrieved_docs = query_faiss(query)
context = " ".join(retrieved_docs)

answer = generate_answer(context, query)
print("💡 Answer:", answer)


: 

In [None]:
import faiss
import numpy as np
import pickle
import pandas as pd
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, CrossEncoder

# Load FAISS index and index map
# Both files are looked up relative to the current working directory; they
# are the artifacts written by the index-creation cell.
faiss_index = faiss.read_index("financial_faiss.index")
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load an index_map.pkl that this project produced itself.
with open("index_map.pkl", "rb") as f:
    index_map = pickle.load(f)

# Load SBERT model
# Same embedding model name as the one used to build the FAISS index.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# The previous id "cross-encoder/ms-marco-MiniLM-L6-en" is not a published
# checkpoint, so loading it fails; use the MS MARCO MiniLM-L-6 re-ranker.
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # For re-ranking

# Load financial dataset
# NOTE(review): `df` is not read by the code visible in this cell, and this
# "/content/..." path differs from the Windows path used by the
# index-creation cell — confirm whether this load is still needed.
df = pd.read_excel("/content/Nestle_Financtial_report_till2023.xlsx")

# Tokenize text for BM25
# Rebuild the corpus in id order; this assumes the keys of index_map are
# exactly 0..len(index_map)-1 (a gap would raise KeyError here).
bm25_corpus = [index_map[i] for i in range(len(index_map))]
# Plain whitespace tokenization: case-sensitive, punctuation stays attached.
# Queries are split the same way in query_multi_stage_retrieval.
bm25_tokenized = [doc.split() for doc in bm25_corpus]
bm25 = BM25Okapi(bm25_tokenized)


def query_multi_stage_retrieval(query, top_k=5):
    """Performs multi-stage retrieval using BM25 + FAISS + Re-ranking.

    Args:
        query: Free-text question.
        top_k: Maximum number of passages to return.

    Returns:
        Up to ``top_k`` passages from ``index_map``, ordered from the highest
        to the lowest cross-encoder score. Returns an empty list when
        ``top_k`` is not positive or neither retriever yields a candidate.
    """
    if top_k <= 0:
        return []
    pool_size = top_k * 2  # over-fetch from each retriever for diversity

    # ---- Stage 1: BM25 Retrieval ----
    bm25_scores = np.asarray(bm25.get_scores(query.split()))
    top_bm25_indices = np.argsort(bm25_scores)[::-1][:pool_size]
    # A score of 0 means no query token matched; such documents are not
    # lexical hits and would only add noise to the candidate pool.
    bm25_results = [
        index_map[int(idx)] for idx in top_bm25_indices if bm25_scores[idx] > 0
    ]

    # ---- Stage 2: Embeddings-based Retrieval (FAISS) ----
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    _, faiss_indices = faiss_index.search(query_embedding, pool_size)
    # FAISS pads with -1 when the index holds fewer than `pool_size` vectors;
    # skip any id that is not a key of index_map instead of raising KeyError.
    faiss_results = [
        index_map[int(idx)] for idx in faiss_indices[0] if int(idx) in index_map
    ]

    # Combine BM25 and FAISS results. dict.fromkeys removes duplicates while
    # keeping first-seen order; the previous list(set(...))[:2k] discarded
    # arbitrary candidates in a hash-dependent order, so results could change
    # between runs and the best passage could be dropped before re-ranking.
    combined_results = list(dict.fromkeys(bm25_results + faiss_results))
    if not combined_results:
        return []

    # ---- Stage 3: Re-ranking with Cross-Encoder ----
    rerank_scores = cross_encoder.predict([(query, doc) for doc in combined_results])
    reranked_indices = np.argsort(rerank_scores)[::-1][:top_k]  # Top-K highest scores
    return [combined_results[i] for i in reranked_indices]


# Example: Retrieve information
# Uses query_multi_stage_retrieval's default top_k=5.
query = "What was the total revenue in 2023?"
results = query_multi_stage_retrieval(query)
print("Top results:", results)

5. Guardrail Implementation: Implement one guardrail:
- Input-side: Validate and filter user queries to block irrelevant or harmful inputs.
- Output-side: Filter responses to remove hallucinated or misleading answers.
6. Testing & Validation: Ask three test questions:
- A relevant financial question (high-confidence).
- A relevant financial question (low-confidence).
- An irrelevant question (e.g., "What is the capital of France?") to check system robustness.


In [None]:
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
from retriever import query_faiss

# Load Qwen model
# NOTE(review): trust_remote_code=True allows Python code shipped with the
# model repository to run locally — confirm "Qwen/Qwen-7B" is trusted.
# Model and tokenizer are loaded from the same checkpoint name so they match.
qwen_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)

# List of restricted words (to block harmful or irrelevant queries)
# Entries are lowercase because validate_query compares them against the
# lowercased query. Multi-word entries such as "capital of" are phrases.
RESTRICTED_KEYWORDS = ["hack", "attack", "cheat", "illegal", "personal data", "capital of", "politics"]

def validate_query(query):
    """Input-side guardrail: Validate and filter user queries.

    Args:
        query: The raw user question.

    Returns:
        ``False`` when the query is not a string, is blank, or contains an
        entry of ``RESTRICTED_KEYWORDS`` starting at a word boundary
        (case-insensitive); ``True`` otherwise.
    """
    # Nothing useful can be retrieved for a missing or blank query.
    if not isinstance(query, str) or not query.strip():
        return False

    query_lower = query.lower()
    for word in RESTRICTED_KEYWORDS:
        # Anchor the keyword at the start of a word: "hack" still blocks
        # "hacking"/"hackers" but no longer rejects "Shack" or "escheat",
        # which plain substring matching flagged by accident.
        if re.search(r"\b" + re.escape(word.lower()), query_lower):
            return False  # Block harmful or irrelevant queries
    return True

def filter_response(response):
    """Output-side guardrail: Remove hallucinated or misleading responses.

    Args:
        response: Text produced by the language model.

    Returns:
        A warning message when the response admits uncertainty or contains a
        run of ten or more digits; otherwise ``response`` unchanged.
    """
    # Match the uncertainty phrases regardless of letter case and also in
    # their contracted forms ("I'm not sure", "I don't know"); the previous
    # exact, case-sensitive substring test let those through.
    uncertainty = r"\bI(?:\s+am|['’]m)\s+not\s+sure|\bI\s+(?:do\s+not|don['’]t)\s+know"
    if re.search(uncertainty, response, flags=re.IGNORECASE):
        return "⚠️ The model is uncertain about this answer. Please refer to the official financial report."

    # Filter out unrealistic numbers or patterns (basic hallucination check):
    # a standalone run of 10 or more digits.
    if re.search(r"\b\d{10,}\b", response):
        return "⚠️ The generated answer contains an unrealistic value. Please verify with an official source."

    return response

def generate_answer(context, question):
    """Generate an answer using the Qwen model based on retrieved context.

    Args:
        context: Retrieved passages already joined into a single string.
        question: The user's question; checked by ``validate_query`` first.

    Returns:
        A fixed rejection message when the question fails validation.
        Otherwise the model's generated answer (prompt not included) after it
        has been passed through ``filter_response``.
    """
    if not validate_query(question):
        return "Query rejected: This question is either irrelevant or contains restricted content."

    input_text = f"Context: {context}\nQuestion: {question}\nAnswer:"
    inputs = qwen_tokenizer.encode(input_text, return_tensors="pt")
    prompt_length = inputs.shape[-1]

    # `max_length` counts the prompt tokens as well, so a prompt carrying
    # several retrieved passages could exhaust the budget before a single
    # answer token is produced. `max_new_tokens` bounds only the continuation.
    outputs = qwen_model.generate(inputs, max_new_tokens=100)

    # Decode only the newly generated ids. Decoding the whole sequence would
    # echo the context and question, and the output guardrail would then scan
    # the prompt (retrieved figures, the user's wording) instead of the answer.
    answer_ids = outputs[0][prompt_length:]
    response = qwen_tokenizer.decode(answer_ids, skip_special_tokens=True).strip()
    return filter_response(response)

# Testing & Validation 
def test_system():
    """Run the three validation questions through retrieval and generation.

    For each question the retrieved sentences are joined into a context
    string (or replaced by a fixed fallback text when nothing is retrieved),
    an answer is generated, and the question/answer pair is printed followed
    by a 50-dash separator line.
    """
    test_queries = (
        "what is Employees Cost for 2023?",  # High-confidence financial question
        "will Total Income From Operations increase this year?",  # Low-confidence financial question
        "What is the capital of France?",  # Irrelevant question
    )
    separator = "-" * 50

    for question in test_queries:
        hits = query_faiss(question)
        if hits:
            context = " ".join(hits)
        else:
            context = "No relevant financial data found."

        reply = generate_answer(context, question)
        print(f" Query: {question}\n Answer: {reply}\n{separator}")

# Run the test cases
# Guarded so the tests run when this code is executed directly (script or
# notebook cell) but not when it is imported as a module.
if __name__ == "__main__":
    test_system()
