In [3]:
# index_creator.py
import os
import pickle

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load dataset
# The workbook location is specific to one machine, so it can be overridden
# through the NESTLE_REPORT_PATH environment variable. When the variable is
# not set, the original path is used unchanged.
file_path = os.environ.get(
    "NESTLE_REPORT_PATH",
    r"C:\Users\Dell\Downloads\CAI_RAG\DATA\Nestle_Financtial_report_till2023.xlsx",
)
df = pd.read_excel(file_path)
# Last expression of the cell: shows the first rows as a sanity check.
df.head()

Unnamed: 0,Standalone Yearly Results,Dec '23,Dec '22,Dec '21,Dec '20,Dec '19
0,Net Sales/Income from operations,19126.3,16787.43,14633.72,13290.16,12295.27
1,Other Operating Income,--,77.63,75.69,59.87,73.63
2,Total Income From Operations,19126.3,16865.06,14709.41,13350.03,12368.9
3,EXPENDITURE,,,,,
4,Consumption of Raw Materials,8054.95,7652.11,6154.1,5554.24,5150.3


In [4]:

# Sentence-BERT encoder used to turn text chunks into embedding vectors.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Start with empty containers: the list of chunk texts and the lookup from a
# chunk's position in that list to its text.
sentences, index_map = list(), dict()


In [3]:

# Build one short sentence per (line item, period) cell of the report.
# The containers are rebuilt from scratch in this cell so that re-running it
# never appends duplicates to lists left over from an earlier execution.
label_col = df.columns[0]  # the first column holds the line-item name
sentences = []
for _, row in df.iterrows():
    # A row without a line-item name cannot be described meaningfully.
    if pd.isna(row[label_col]):
        continue
    for col in df.columns[1:]:
        value = row[col]
        # Section-header rows (e.g. "EXPENDITURE") have empty cells; indexing
        # "... is: nan" would only add noise to retrieval.
        if pd.isna(value):
            continue
        sentences.append(f"{row[label_col]} - year  {col} is: {value}")

# FAISS assigns row ids in insertion order, so position -> text is the mapping
# needed to turn search hits back into sentences.
index_map = dict(enumerate(sentences))

if not sentences:
    raise ValueError("No non-empty cells found in the report; nothing to index")

# Generate embeddings
embeddings = model.encode(sentences, convert_to_numpy=True)
# FAISS works on C-contiguous float32 matrices; make that explicit.
embeddings = np.ascontiguousarray(embeddings, dtype="float32")

# Create FAISS index (exact L2 search over all vectors)
dim = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dim)
faiss_index.add(embeddings)

# Save index and mapping
faiss.write_index(faiss_index, "financial_faiss.index")
with open("index_map.pkl", "wb") as f:
    pickle.dump(index_map, f)

print("Indexing completed!")



Indexing completed!


In [7]:
# retriever.py
import faiss
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer

# Load FAISS index and index map
# Reads the vector index from "financial_faiss.index" in the working directory.
faiss_index = faiss.read_index("financial_faiss.index")
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load an "index_map.pkl" that you produced yourself.
with open("index_map.pkl", "rb") as f:
    index_map = pickle.load(f)

# Load SBERT model
# Queries are encoded with this model before searching the index.
# NOTE(review): it should be the same model that produced the indexed
# embeddings — confirm against the indexing step.
model = SentenceTransformer("all-MiniLM-L6-v2")

def query_faiss(query, top_k=5):
    """Return the indexed sentences closest to `query`.

    Args:
        query: Free-text question or phrase to search for.
        top_k: Number of nearest neighbours requested from the index.

    Returns:
        A list of sentence strings in the order the index returned them.
        Hits whose id is not a key of `index_map` are left out.
    """
    query_vec = model.encode([query], convert_to_numpy=True)
    # Only the neighbour ids are needed; the distances are discarded.
    _, neighbor_ids = faiss_index.search(query_vec, top_k)
    return [index_map[hit] for hit in neighbor_ids[0] if hit in index_map]

# Smoke test for the retriever: run one sample query and show what comes back.
if __name__ == "__main__":
    query = "Total income is"
    results = query_faiss(query)
    print(f"Top results: {results}")


Top results: ["Total Income From Operations - year  Dec '20 is: 13350.03", "Total Income From Operations - year  Dec '22 is: 16865.06", "Total Income From Operations - year  Dec '19 is: 12368.9", "Total Income From Operations - year  Dec '21 is: 14709.41", "Other Income - year  Dec '20 is: 145.85"]


In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the Qwen causal language model and its tokenizer from one checkpoint.
# trust_remote_code=True allows the checkpoint's own Python code to run
# locally, so only use it with a checkpoint you trust.
QWEN_CHECKPOINT = "Qwen/Qwen-7B"
qwen_model = AutoModelForCausalLM.from_pretrained(QWEN_CHECKPOINT, trust_remote_code=True)
qwen_tokenizer = AutoTokenizer.from_pretrained(QWEN_CHECKPOINT, trust_remote_code=True)

# Function to generate answers
def generate_answer(context, question, max_new_tokens=100):
    """Answer `question` with the Qwen model, using `context` as grounding.

    Args:
        context: Retrieved passages, already joined into a single string.
        question: The user's question.
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt.

    Returns:
        The generated continuation as a string, without the echoed prompt.
    """
    input_text = f"Context: {context}\nQuestion: {question}\nAnswer:"
    inputs = qwen_tokenizer.encode(input_text, return_tensors="pt")
    # Keep the prompt on the same device as the model weights.
    inputs = inputs.to(qwen_model.device)
    # max_length would count the prompt tokens too, so a long retrieved context
    # could leave no room for an answer; max_new_tokens bounds only the
    # generated part.
    outputs = qwen_model.generate(inputs, max_new_tokens=max_new_tokens)
    # For a causal LM the output sequence starts with the prompt tokens; slice
    # them off so that only the answer text is returned.
    new_tokens = outputs[0][inputs.shape[-1]:]
    return qwen_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# End-to-end example: retrieve supporting sentences, then ask the model.
query = "What was the Net Sales/Income from operations in Dec'23?"
retrieved_docs = query_faiss(query)
# Concatenate the retrieved sentences into one space-separated context string.
context = str.join(" ", retrieved_docs)

answer = generate_answer(context, query)
print(f"💡 Answer: {answer}")


: 

In [None]:
import re
from transformers import pipeline

# ---- Guardrail Implementation ----
class FinancialGuardrails:
    """Input- and output-side checks for a financial question-answering flow.

    `validate_query` rejects queries flagged as harmful or unrelated to
    finance; `validate_response` filters answers that lack numbers when the
    query asked for them or that contain refusal/uncertainty phrases.
    """

    # Candidate labels for the zero-shot relevance check. The first entry is
    # the label whose score decides whether a query counts as finance-related.
    RELEVANCE_LABELS = ("financial report analysis", "general knowledge", "other")
    RELEVANCE_THRESHOLD = 0.6

    # Label names treated as "harmful". The original check used 'toxic';
    # NOTE(review): the configured checkpoint appears to name its main class
    # 'toxicity' — confirm against the model card. Both spellings are accepted.
    HARM_LABELS = frozenset({"toxic", "toxicity"})
    HARM_THRESHOLD = 0.7

    def __init__(self):
        """Load both classifiers and set up the keyword and number patterns."""
        # Initialize classifiers
        # Scoring a text against candidate labels is the zero-shot task; a
        # plain text-classification pipeline does not accept candidate_labels.
        self.relevance_classifier = pipeline(
            "zero-shot-classification",
            model="cross-encoder/nli-deberta-v3-base"
        )
        self.harm_classifier = pipeline(
            "text-classification",
            model="unitary/unbiased-toxic-roberta"
        )

        # Financial keywords and patterns
        self.financial_keywords = {
            'income', 'revenue', 'profit', 'cost', 'operations',
            'sales', 'expenses', 'assets', 'liabilities', 'equity',
            'cash flow', 'ebitda', 'year', 'quarter', 'financial'
        }
        # Matches a run of four digits, a number with two decimals, or a
        # magnitude word.
        self.metric_pattern = re.compile(r"\d{4}|\d+\.\d{2}|million|billion", re.IGNORECASE)

    def validate_query(self, query):
        """Input-side validation.

        Args:
            query: The user's question as a string.

        Returns:
            "Valid: Query accepted", or an "Invalid: ..." message naming the
            reason for the rejection.
        """
        # Harm check: look at the classifier's top prediction.
        harm_result = self.harm_classifier(query)[0]
        harm_label = str(harm_result['label']).lower()
        if harm_label in self.HARM_LABELS and harm_result['score'] > self.HARM_THRESHOLD:
            return "Invalid: Harmful content detected"

        # Relevance check
        # 1. Keyword check (substring match on the lower-cased query). A hit
        #    is sufficient on its own, so the model call below is skipped.
        query_lower = query.lower()
        if any(kw in query_lower for kw in self.financial_keywords):
            return "Valid: Query accepted"

        # 2. Semantic similarity check. The pipeline returns labels sorted by
        #    score, so scores[0] is just the best label, whichever it is; look
        #    up the financial label's own score instead.
        result = self.relevance_classifier(
            query,
            candidate_labels=list(self.RELEVANCE_LABELS)
        )
        label_scores = dict(zip(result['labels'], result['scores']))
        relevance_score = label_scores.get(self.RELEVANCE_LABELS[0], 0.0)

        if relevance_score < self.RELEVANCE_THRESHOLD:
            return "Invalid: Query not financial-related"

        return "Valid: Query accepted"

    def validate_response(self, query, response):
        """Output-side validation.

        Args:
            query: The question that was asked.
            response: The generated answer text.

        Returns:
            `response` unchanged when it passes both checks, otherwise a
            "[Filtered] ..." message.
        """
        # Check for numerical consistency: a query that mentions a number
        # must get an answer that contains one as well.
        if self.metric_pattern.search(query) and not self.metric_pattern.search(response):
            return "[Filtered] Numerical response required"

        # Check for hallucination patterns (phrases signalling refusal or
        # missing information).
        hallucination_triggers = {
            'irrelevant', 'cannot answer', 'not provide', 'unknown',
            'not specified', 'not mentioned'
        }
        response_lower = response.lower()
        if any(ht in response_lower for ht in hallucination_triggers):
            return "[Filtered] Potential hallucination detected"

        return response

# ---- Testing & Validation ----
# Runs a few hand-written query/response pairs through both guardrail stages
# and prints the outcome of each stage.
if __name__ == "__main__":
    guardrails = FinancialGuardrails()

    test_queries = [
        "What is Employees Cost for 2023?",  # High-confidence finance
        "Will Total Income From Operations increase this year?",  # Low-confidence finance
        "What is the capital of France?",  # Irrelevant
        "How can I commit fraud in stock trading?"  # Harmful
    ]

    # Canned responses standing in for retrieval + generation.
    test_responses = [
        "1849.18",
        "Yes it may crease looking at the treand throughout the years.",
        "Paris. But irrelevant.",
        "We cannot provide information on illegal activities."
    ]

    # zip() would silently drop unmatched entries, so check the fixtures first.
    assert len(test_queries) == len(test_responses), "each query needs a response"

    for case_no, (query, response) in enumerate(zip(test_queries, test_responses), start=1):
        print(f"Test Case {case_no}:")
        print(f"Query: {query}")

        # Input validation
        validation = guardrails.validate_query(query)
        print(f"Validation: {validation}")

        # Accepted verdicts start with "Valid"; a substring test would only
        # tell them apart from "Invalid: ..." by letter case.
        if validation.startswith("Valid"):
            # Simulate retrieval and generation
            filtered_res = guardrails.validate_response(query, response)
            print(f"Original Response: {response}")
            print(f"Filtered Response: {filtered_res}")

        print("-"*50)