# Proelections Concept Testing 

## 1. Import Libraries

In [1]:
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch
import requests
from typing import List

## 2. Sample HOA File Links and Questions

In [2]:
# Remote HOA file URLs.
# Keys are lowercase HOA labels; values are the raw GitHub URLs of the
# corresponding text files. query_hoa_faiss_fixed lowercases the caller's
# label before looking it up here, so keys must stay lowercase.
FILE_MAP = {
    "arroyo park": "https://raw.githubusercontent.com/Tom-Kinstle/RAG_project/main/rag_test/arroyo_park.txt",
    "camino place": "https://raw.githubusercontent.com/Tom-Kinstle/RAG_project/main/rag_test/camino_place.txt",
    "jackson oaks": "https://raw.githubusercontent.com/Tom-Kinstle/RAG_project/main/rag_test/jackson_oaks.txt"
}

# Compliance questions.
# Callers refer to these by 1-based number (the query code reads
# QUESTIONS[question_number - 1]), so the order of this list is part of the
# interface: inserting or reordering entries renumbers every question.
QUESTIONS = [
    "How must HOA board elections be conducted under California law?",
    "What voting methods are required for board elections?",
    "What qualifications are required for board candidates?",
    "What notice requirements exist for board meetings?",
    "How are proxy votes handled in HOA elections?",
    "What constitutes a quorum for member meetings?",
    "How long do directors serve on the board?",
    "Under what circumstances can a director be removed?",
    "What are the assessment collection procedures?",
    "How are architectural review requests processed?",
    "What enforcement actions can the HOA take for violations?",
    "What are the requirements for amending CC&Rs or bylaws?"
]

## 3. Chunking Configuration

In [3]:
# Chunk configs.
# Named presets for the text splitter. "size" becomes the splitter's
# chunk_size and "overlap" its chunk_overlap; the splitter is created with
# length_function=len, so both values are measured in characters.
# Every preset here uses an overlap of one quarter of its size.
CHUNK_CONFIGS = {
    "default": {"size": 800, "overlap": 200},
    "small": {"size": 400, "overlap": 100},
    "large": {"size": 1200, "overlap": 300}
}

#### Sets up chunking strategies to slice text into overlapping segments for vectorization. Options vary in size and overlap, affecting the granularity of information retrieval.
#### Chunk size affects the granularity. Smaller chunks increase resolution but may lose context; larger chunks preserve more but might dilute precision.

#### Overlap helps preserve meaning that might be cut off at chunk boundaries — especially useful for legal text where a single sentence may span two chunks.

## 4. Embedding Model Initialization

In [4]:
# E5-Base embedding setup
def setup_embedding_model(silent: bool = False):
    """Create the E5-base embedding model used to vectorize chunks and queries.

    The model runs on CUDA when ``torch.cuda.is_available()`` reports a GPU
    and on the CPU otherwise.

    Args:
        silent: When True, do not print the "Using device" status line.

    Returns:
        A ``HuggingFaceEmbeddings`` wrapper around ``intfloat/e5-base-v2``
        configured to normalize its embeddings and encode in batches of 32,
        or ``None`` when the model could not be constructed (the error is
        printed rather than raised).
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if not silent:
        # The emoji was previously stored as mis-decoded bytes (mojibake).
        print(f"🖥️ Using device: {device}")
    try:
        return HuggingFaceEmbeddings(
            model_name="intfloat/e5-base-v2",
            model_kwargs={"device": device},
            encode_kwargs={"normalize_embeddings": True, "batch_size": 32}
        )
    except Exception as e:
        # Deliberately best-effort: loading can fail for many reasons
        # (download, missing weights, device errors). Callers receive None
        # and must check for it before using the model.
        print(f"Error loading E5: {e}")
        return None


#### Initializes our E5-base model using HuggingFace, selecting the GPU if one is available.

#### Embeddings are the semantic engine (HuggingFace) of this project. They allow the system to "understand" text by mapping it into high-dimensional space where similar meanings are nearby (KNN), regardless of exact phrasing.

## 5. Document Cleaning and Chunking

In [5]:
# Document prep (clean & chunked)
def prepare_documents_enhanced(text: str, chunk_size: int, chunk_overlap: int) -> List[Document]:
    """Clean *text*, split it into overlapping chunks, and tag each chunk.

    Cleaning removes any embedded question list: a line containing
    ``COMPLIANCE_QUESTIONS`` or ``How must HOA board elections`` is dropped and
    opens a skipped region that continues over blank lines and lines indented
    by four spaces. The first other line closes the region and is kept.

    Args:
        text: Raw document text.
        chunk_size: Maximum chunk length in characters.
        chunk_overlap: Characters shared between neighbouring chunks.

    Returns:
        One ``Document`` per chunk, carrying ``chunk_id``, ``chunk_size`` and
        boolean ``has_*`` keyword flags in its metadata. An empty list is
        returned when nothing but whitespace survives cleaning.
    """
    kept_lines = []
    inside_question_block = False
    for current in text.split('\n'):
        if 'COMPLIANCE_QUESTIONS' in current or 'How must HOA board elections' in current:
            inside_question_block = True
            continue
        if inside_question_block:
            # Blank lines and four-space-indented lines (quoted or not) still
            # belong to the question block and are discarded.
            if current.strip() == '' or current.startswith('    '):
                continue
            inside_question_block = False
        kept_lines.append(current)

    body = '\n'.join(kept_lines)
    if not body.strip():
        return []

    # Separators are tried in order, from document structure down to words.
    chunker = RecursiveCharacterTextSplitter(
        separators=["\n=== ", "\nARTICLE ", "\nSECTION ", "\n\n", "\n", ". ", " "],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )

    # Metadata flag -> substrings whose presence (case-insensitive) sets it.
    topic_terms = {
        "has_voting": ("vote", "ballot", "election", "poll", "voting", "electoral"),
        "has_proxy": ("proxy",),
        "has_director": ("director", "board", "officer", "president", "secretary", "treasurer"),
        "has_quorum": ("quorum",),
        "has_notice": ("notice", "notification", "notify", "inform"),
        "has_meeting": ("meeting", "assembly", "session", "gathering"),
    }

    tagged_docs = []
    for position, chunk in enumerate(chunker.create_documents([body])):
        lowered = chunk.page_content.lower()
        tags = {"chunk_id": position, "chunk_size": len(chunk.page_content)}
        for flag, terms in topic_terms.items():
            tags[flag] = any(term in lowered for term in terms)
        tagged_docs.append(Document(page_content=chunk.page_content, metadata=tags))

    return tagged_docs

#### This stage cleans and structures legal documents for semantic search. It strips out repeated boilerplate and irrelevant text, splits the remainder into meaningful chunks using logical breakpoints, and tags each chunk with metadata like voting rules or proxy provisions.

## 6. Relevance Score Calculator

In [6]:
# Relevance scorer (sample logic)
def calculate_relevance_score(question: str, results: List[Document]) -> float:
    """Return the average lexical overlap between *question* and *results*.

    For every document the score is the fraction of the question's distinct
    lowercase, whitespace-separated tokens that also occur among the
    document's tokens; the per-document fractions are then averaged.
    Punctuation is not stripped, so ``"law?"`` and ``"law"`` are different
    tokens.

    Args:
        question: The query text.
        results: Retrieved documents; only ``page_content`` is read.

    Returns:
        A value in ``[0.0, 1.0]``. ``0.0`` when *results* is empty or when
        *question* contains no tokens.
    """
    if not results:
        return 0.0
    q_words = set(question.lower().split())
    if not q_words:
        # An empty or whitespace-only question has nothing to overlap with;
        # without this guard the division below raises ZeroDivisionError.
        return 0.0
    total = sum(
        len(q_words & set(doc.page_content.lower().split())) / len(q_words)
        for doc in results
    )
    return total / len(results)

#### This function calculates the fraction of the query's words that also appear in each retrieved chunk, averaged over the chunks, as a basic check for lexical overlap.

## 7. HOA Document Query with FAISS

In [7]:
# FAISS query runner
def query_hoa_faiss_fixed(question_number: int, hoa_label: str, embed_model=None, chunk_config="default"):
    """Answer one predefined question against one HOA document.

    The document is downloaded, cleaned and chunked, embedded into a fresh
    in-memory FAISS index, and searched for the three chunks nearest to the
    question. Only the best match is returned.

    Args:
        question_number: 1-based index into ``QUESTIONS``.
        hoa_label: HOA name; matched case-insensitively (and ignoring
            surrounding whitespace) against the keys of ``FILE_MAP``.
        embed_model: Embedding model to use. When None, one is created with
            ``setup_embedding_model(silent=True)``.
        chunk_config: Name of a preset in ``CHUNK_CONFIGS``.

    Returns:
        On success, a dict with ``answer`` (stripped text of the top chunk),
        ``score`` (``max(0, 1 - distance)``), ``vector_distance`` (raw FAISS
        score of the top hit) and ``lexical_score`` (word overlap with the
        question). On any failure, a dict with a single ``error`` message;
        this function reports problems through that dict rather than raising.
    """
    # Validate the cheap arguments first so a bad call never pays for loading
    # the embedding model.
    hoa_key = hoa_label.strip().lower()
    if hoa_key not in FILE_MAP:
        return {"error": f"Invalid HOA label: {hoa_label}"}
    if chunk_config not in CHUNK_CONFIGS:
        return {"error": f"Unknown chunk config: {chunk_config}"}
    # Without this check 0 and negative numbers silently wrap around to
    # questions at the end of the list, and too-large numbers raise IndexError.
    if not isinstance(question_number, int) or not 1 <= question_number <= len(QUESTIONS):
        return {"error": f"Invalid question number: {question_number} (expected 1-{len(QUESTIONS)})"}

    if embed_model is None:
        embed_model = setup_embedding_model(silent=True)
    if embed_model is None:
        # setup_embedding_model returns None (after printing the cause) when
        # the model cannot be loaded.
        return {"error": "Embedding model could not be loaded."}

    question = QUESTIONS[question_number - 1]
    chunk_size = CHUNK_CONFIGS[chunk_config]["size"]
    chunk_overlap = CHUNK_CONFIGS[chunk_config]["overlap"]

    try:
        url = FILE_MAP[hoa_key]
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        docs = prepare_documents_enhanced(response.text, chunk_size, chunk_overlap)
        if not docs:
            return {"error": "No content after cleaning and chunking."}

        vector_store = FAISS.from_documents(docs, embed_model)
        results_with_scores = vector_store.similarity_search_with_score(question, k=3)

        if not results_with_scores:
            return {"error": "No vector hits returned."}

        top_doc, raw_distance = results_with_scores[0]
        # Normalise to a plain float so both numeric fields have one
        # consistent, JSON-friendly type regardless of what FAISS hands back.
        vector_distance = float(raw_distance)

        # FAISS returns a distance (lower = better); flip and clamp it so the
        # headline score is higher = better and never negative.
        # NOTE(review): if the index reports squared L2 over unit-length
        # vectors, cosine similarity would be 1 - distance / 2 - confirm
        # before treating this score as a cosine.
        semantic_score = max(0.0, 1.0 - vector_distance)

        # Word overlap is kept only as supplementary information.
        lexical_score = calculate_relevance_score(question, [top_doc])

        answer = top_doc.page_content.strip()

        return {
            "answer": answer,
            "score": semantic_score,  # Use semantic similarity as main score
            "vector_distance": vector_distance,  # Raw FAISS distance
            "lexical_score": lexical_score  # Word overlap for reference
        }
    except Exception as e:
        # Boundary handler: network, parsing and indexing failures are all
        # surfaced to the caller through the error dict.
        return {"error": str(e)}

#### The RAG loop kicks off by fetching a remote HOA document, then cleaning and chunking it into manageable pieces. Each chunk is embedded and stored in a FAISS index for fast retrieval. When a query arrives, it searches for the most similar chunks and returns the top match along with a relevance score.

#### This pipeline wraps ingestion, semantic search, and QA into a streamlined flow. FAISS makes it fast and scalable—ideal for real-time compliance tasks.

## 8. Fast Wrapper for Queries

In [8]:
# Quick wrapper
def quick_query_fixed(question_number: int, hoa_label: str, chunk_config="default", embed_model=None):
    """Run ``query_hoa_faiss_fixed`` with a more convenient argument order.

    This wrapper takes ``chunk_config`` before ``embed_model`` so the common
    call ``quick_query_fixed(3, "arroyo park", "small")`` needs no keywords.
    The result dict is passed through unchanged.
    """
    return query_hoa_faiss_fixed(
        question_number=question_number,
        hoa_label=hoa_label,
        embed_model=embed_model,
        chunk_config=chunk_config,
    )

#### This helper function streamlines FAISS pipeline execution by forwarding inputs to `query_hoa_faiss_fixed`, enabling rapid prototyping and smooth batch testing.

## 9. Display Query Features

In [9]:
print("üéØ HOA RAG Query System Ready!")

# üìù Available Questions
print("\nüìã Available Questions (1-12):")
for i, q in enumerate(QUESTIONS, 1):
    print(f" {i:2d}. {q}")

# üìÅ HOA Files
print("\nüìÅ Available HOA Files:")
for hoa in FILE_MAP.keys():
    print(f"   ‚Ä¢ {hoa}")

# ‚öôÔ∏è Chunk Configurations
print("\n‚öôÔ∏è Available Chunk Configs:")
for name, config in CHUNK_CONFIGS.items():
    print(f"   ‚Ä¢ {name:<8} ‚Üí size={config['size']} chars, overlap={config['overlap']}")


🎯 HOA RAG Query System Ready!

📋 Available Questions (1-12):
  1. How must HOA board elections be conducted under California law?
  2. What voting methods are required for board elections?
  3. What qualifications are required for board candidates?
  4. What notice requirements exist for board meetings?
  5. How are proxy votes handled in HOA elections?
  6. What constitutes a quorum for member meetings?
  7. How long do directors serve on the board?
  8. Under what circumstances can a director be removed?
  9. What are the assessment collection procedures?
 10. How are architectural review requests processed?
 11. What enforcement actions can the HOA take for violations?
 12. What are the requirements for amending CC&Rs or bylaws?

📁 Available HOA Files:
   • arroyo park
   • camino place
   • jackson oaks

⚙️ Available Chunk Configs:
   • default  → size=800 chars, overlap=200
   • small    → size=400 chars, overlap=100
   • large    → size=1200 chars, overlap=300

## 10. Run Example Queries and Print Results

In [10]:
# Test with the fixed scoring
# Load the model once and reuse it; passing it in stops the query helper from
# building a fresh one on every call.
embed_model = setup_embedding_model(silent=True)
result = quick_query_fixed(3, "arroyo park", "small", embed_model)

if "error" in result:
    # Failures come back as {"error": ...} without any metric keys, so
    # indexing result['score'] unconditionally would raise KeyError.
    print(f"Query failed: {result['error']}")
else:
    print(f"Semantic Score: {result['score']:.3f}")  # Should be high again
    print(f"Vector Distance: {result['vector_distance']:.3f}")  # Raw FAISS distance
    print(f"Lexical Score: {result['lexical_score']:.3f}")  # Word overlap
    print(f"Answer: {result['answer'][:200]}...")

Semantic Score: 0.785
Vector Distance: 0.215
Lexical Score: 0.286
Answer: 5.2 Qualifications for Candidates. Candidates for the Board: (i) must be Members in Good Standing, or, in the case of a Member in Good Standing that is an entity, an officer, director, principal, or a...


#### This final test evaluates the RAG pipeline by retrieving an answer to one of the predetermined question selections. The output includes three key metrics: a semantic score (0.785) indicating strong alignment between the query and the returned text based on shared meaning; a vector Euclidean distance (0.215) from FAISS, showing high similarity in embedding space; and a lexical score (0.286), which reflects moderate word overlap. Together, these metrics confirm that the model retrieved a semantically relevant answer, even if some phrasing differs from the original question—demonstrating the strength of embedding-based search over simple keyword matching.