In [86]:
import os
import faiss
import numpy as np
from tqdm import tqdm
import json
from huggingface_hub import InferenceClient
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util

In [69]:
# Replace HuggingFace Token
os.environ["HF_TOKEN"] = "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"

In [70]:
# Load PDF of Your Choosing
loader = PyPDFLoader("/content/US_Gov_Report.pdf")
docs = loader.load()
print(len(docs))

147


In [71]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(docs)

In [72]:
text_lines = [chunk.page_content for chunk in chunks]

In [73]:
embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")

def emb_text(text):
    return embedding_model.encode([text], normalize_embeddings=True).tolist()[0]

In [74]:
test_embedding = emb_text("This is a test")
embedding_dim = len(test_embedding)
print(f"Embedding dimension: {embedding_dim}")
print(f"Sample embedding: {test_embedding[:10]}")


Embedding dimension: 384
Sample embedding: [-0.07660680264234543, 0.025316733866930008, 0.012505539692938328, 0.004595177713781595, 0.025780005380511284, 0.038167089223861694, 0.08050810545682907, 0.00303537561558187, 0.02439219132065773, 0.004880349617451429]


In [75]:
index = faiss.IndexFlatIP(embedding_dim)
data = []

for i, line in enumerate(tqdm(text_lines, desc="Creating embeddings")):
    embedding = emb_text(line)
    embedding_np = np.array(embedding).astype('float32')
    if embedding_np.shape[0] != embedding_dim:
        raise ValueError(f"Embedding has incorrect shape {embedding_np.shape}, expected {embedding_dim}")

    # Add the embedding to the Faiss index
    index.add(np.array([embedding_np]))

    # Store the text and corresponding embedding
    data.append({"id": i, "vector": embedding, "text": line})

print(f"Added {len(data)} embeddings to the Faiss index.")


Creating embeddings: 100%|██████████| 560/560 [02:53<00:00,  3.22it/s]

Added 560 embeddings to the Faiss index.





In [76]:
question = "How is the USA Economy Based on the Report? Is It thriving"
question_embedding = emb_text(question)
question_embedding_np = np.array([question_embedding]).astype('float32')


In [77]:
D, I = index.search(question_embedding_np, 3)
retrieved_lines_with_distances = [
    (data[idx]["text"], float(D[0][i])) for i, idx in enumerate(I[0])
]

In [78]:
print(json.dumps(retrieved_lines_with_distances, indent=4))

# Build context from retrieved lines for the final prompt
context = "\n".join([line_with_distance[0] for line_with_distance in retrieved_lines_with_distances])

PROMPT = """
Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.
<context>
{context}
</context>
<question>
{question}
</question>
"""

[
    [
        "U.S. government is the largest, most diverse, most complex, and arguably the most \nimportant entity on earth today. Its services\u2014ho meland security, national defense, Social \nSecurity, mail delivery, and food inspection, to name a few\u2014directly affect the well-\nbeing of almost every American. But sound decisions on the future direction of vital \ngovernment programs and policies are made more difficult without timely, accurate, and \nuseful financial and performance information. \nUntil the problems discussed in our report are adequately addressed, they will continue to \n(1) hamper the federal government\u2019s ability to accurately report a significant portion of \nits assets, liabilities, and costs; (2) affect the federal government\u2019s ability to accurately \nmeasure the full cost as well as the financial and nonfinancial performance of certain \nprograms while effectively managing related operations; and (3) significantly impair the",
        0.7334

In [79]:
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
llm_client = InferenceClient(model=repo_id, timeout=120)

prompt = PROMPT.format(context=context, question=question)

answer = llm_client.text_generation(prompt, max_new_tokens=1000).strip()
# print(f"Answer:\n{answer}")

def calculate_llm_confidence(response):
    """Simple method to estimate LLM confidence"""
    keywords = ['yes', 'no','thriving', 'recession', 'growing']
    matches = sum(1 for word in keywords if word in response.lower())
    confidence = matches / len(keywords)
    return round(confidence, 4)

llm_confidence = calculate_llm_confidence(answer)
print(f"LLM Response:\n{answer}")
print(f"LLM Confidence Score: {llm_confidence}")

LLM Response:
The U.S. economy is strong and growing, as indicated by the accelerated growth in real GDP and productivity, as well as the stabilization and increase in employment in fiscal year 2003. However, the report does not explicitly state that the economy is "thriving." The financial report highlights the challenges faced by the U.S. government in accurately reporting its financial position, assets, liabilities, and costs, which may affect future operations. Additionally, the report mentions the impact of new fiscal policies in 2003 on the improvement of the economy, but it does not provide a detailed analysis of the long-term sustainability of this growth.
LLM Confidence Score: 0.6


In [87]:
def calculate_semantic_similarity(text1, text2):
    """
    Calculates cosine similarity between two texts using embeddings.
    """
    embedding1 = embedding_model.encode(text1, convert_to_tensor=True)
    embedding2 = embedding_model.encode(text2, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(embedding1, embedding2).item()
    return round(similarity, 4)

# Perform evaluation
query_similarity = calculate_semantic_similarity(answer, question)
context_similarity = calculate_semantic_similarity(answer, context)

# Weighted final confidence score
weight_query_similarity = 0.5
weight_context_similarity = 0.5
final_confidence = round(
    weight_query_similarity * query_similarity +
    weight_context_similarity * context_similarity,
    4
)

# Print evaluation results
print(f"Query Similarity: {query_similarity}")
print(f"Context Similarity: {context_similarity}")
print(f"Final Confidence Score: {final_confidence}")

Query Similarity: 0.8278
Context Similarity: 0.829
Final Confidence Score: 0.8284
