In [42]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
import re
import json

In [43]:
def load_articles(file_path = 'articles.json'):
    """Parse the UTF-8 JSON file at ``file_path`` and return its contents.

    Args:
        file_path: Location of the JSON file to read.

    Returns:
        Whatever object the JSON document decodes to.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.loads(handle.read())

In [None]:
from langchain_ollama import OllamaEmbeddings

# Shared embedding client; build_tag_embeddings and filter_articles both call
# embed.embed_query on it.
embed = OllamaEmbeddings(model="nomic-embed-text:latest")

In [47]:
#testing 
def extract_query_components(query):
    """Split a free-text query into keyword tags, a part label and an article number.

    The query is lower-cased, stripped of punctuation and split on whitespace.
    The token that follows the word "article" becomes the article number and
    the token that follows the word "part" becomes the part label. Every other
    token that is not a stopword is returned as a tag.

    Args:
        query: Raw user query string.

    Returns:
        dict with three keys:
            "tags": remaining keyword tokens (lower-case, in query order).
            "part": "Part <NAME>" with NAME upper-cased (NAME itself if it
                already starts with "PART"), or None when no part is named.
            "article_num": upper-cased token after "article", or None.
        If "article" or "part" appears several times, the last occurrence wins.
    """
    # Normalize: case-fold and drop punctuation so "IXA?" becomes "ixa".
    query = query.lower().strip()
    query = re.sub(r"[^\w\s]", "", query)

    words = query.split()
    part = None
    part_name = None
    article_num = None

    # Pair each token with its successor; a trailing "article"/"part" with
    # nothing after it is simply ignored.
    for word, following in zip(words, words[1:]):
        if word == "article":
            article_num = following.upper()
        elif word == "part":
            part_name = following.upper()
            part = part_name if part_name.startswith("PART") else "Part " + part_name

    # Basic stopword filtering
    stopwords = {"what", "are", "is", "in", "of", "the", "mentioned", "and", "to", "a", "an", "on", "under", "with"}
    structural = {"article", "part"}

    # Tokens already captured as the article number or part name must not be
    # reported again as tags. Compare against the captured values rather than
    # a "PART" prefix, so ordinary keywords like "party" are kept.
    extracted = {article_num, part_name}

    tags = [
        word
        for word in words
        if word not in stopwords
        and word not in structural
        and word.upper() not in extracted
    ]

    return {
        "tags": tags,
        "part": part,
        "article_num": article_num
    }

In [48]:
def build_tag_embeddings(articles):
    """Embed each article's tags and pair the vector with the article.

    The tags under ``article['metadata']['tags']`` are joined with spaces and
    passed to ``embed.embed_query``. Articles whose joined tag text is empty
    or whitespace-only are skipped.

    Args:
        articles: Iterable of article dicts, each with a 'metadata' dict.

    Returns:
        List of ``(embedding, article)`` tuples in input order.
    """
    def _tag_text(article):
        return " ".join(article['metadata'].get("tags", []))

    # Lazy pairing keeps the original per-article order: join, check, embed.
    labelled = ((_tag_text(article), article) for article in articles)
    return [
        (embed.embed_query(text), article)
        for text, article in labelled
        if text.strip()
    ]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def filter_articles(user_query, tag_index, threshold=0.6):
    """Rank indexed articles against a user query by tag-embedding similarity.

    The query is parsed with ``extract_query_components``. Its tags are
    embedded and compared by cosine similarity with every vector in
    ``tag_index``. The similarity is multiplied by 1.2 when the query's part
    equals the article's ``metadata['part']`` and by 1.4 when the query's
    article number equals ``metadata['article']``. Articles whose final score
    is at least ``threshold`` are returned.

    Args:
        user_query: Raw query string.
        tag_index: Iterable of ``(tag_vector, article)`` pairs.
        threshold: Minimum boosted score an article needs to be kept.

    Returns:
        List of ``(score, article)`` tuples sorted by score, highest first.
        Empty when the query contains nothing to embed.
    """
    # Extract structured components from the query
    query_data = extract_query_components(user_query)
    query_tags = query_data['tags']
    part = query_data['part']
    article_num = query_data['article_num']

    PART_MATCH_WEIGHT = 1.2
    ARTICLE_MATCH_WEIGHT = 1.4

    # A query such as "Article 21" has no tags left once the reference is
    # extracted. An empty string carries nothing to embed, so fall back to
    # the full query text, and give up early if that is empty too.
    query_text = " ".join(query_tags) or user_query.strip()
    if not query_text:
        return []

    query_vector = embed.embed_query(query_text)
    qv = np.array(query_vector).reshape(1, -1)

    results = []

    for tag_vector, article in tag_index:
        # cosine_similarity expects 2-D inputs, hence the (1, n) reshape.
        tv = np.array(tag_vector).reshape(1, -1)

        try:
            similarity = cosine_similarity(qv, tv)[0][0]
        except Exception as e:
            # Best-effort: report the entry that could not be compared and
            # keep ranking the rest of the index.
            print("Similarity calculation failed:", e)
            continue

        score = similarity
        # Boost score for part and article_num match.
        # NOTE(review): these are exact equality checks, so the metadata must
        # use the same form the parser emits ("Part IXA", "243A") - confirm.
        if part and part == article["metadata"].get("part"):
            score *= PART_MATCH_WEIGHT
        if article_num and article_num == article["metadata"].get("article"):
            score *= ARTICLE_MATCH_WEIGHT

        if score >= threshold:
            results.append((score, article))

    return sorted(results, key=lambda x: x[0], reverse=True)


In [50]:
def display_top_articles(results, top_n=5):
    """Print the first ``top_n`` ranked results.

    Args:
        results: Sequence of ``(score, article)`` pairs; each article is a
            dict with 'metadata' and 'text' keys.
        top_n: Maximum number of entries to print.
    """
    for rank, (score, art) in enumerate(results[:top_n], start=1):
        print(f"\nRank {rank} | Score: {score:.2f}")
        meta = art['metadata']
        print(f"Article {meta.get('article')} | Part: {meta.get('part')}")
        print("Text:", art['text'])

In [None]:
import pickle 

def load_tag_index(path="tag_index.pkl"):
    """Unpickle and return the tag index stored at ``path``.

    Prints a confirmation message once the file has been read.

    Args:
        path: Location of the pickle file.

    Returns:
        The object stored in the pickle file.
    """
    # Unpickling can execute arbitrary code; only load files you created.
    with open(path, "rb") as handle:
        index = pickle.Unpickler(handle).load()
    print(f"Tag index loaded from {path}")
    return index


In [None]:
# Load the persisted index first, then rank it against a sample question.
tag_index = load_tag_index()

query = "What are the rights and duties mentioned in Article 243A of Part IXA?"
match = filter_articles(user_query=query, tag_index=tag_index, threshold=0.6)


In [None]:
def save_tag_index(tag_index, path="tag_index.pkl"):
    """Pickle ``tag_index`` to ``path`` and print a confirmation message.

    Any existing file at ``path`` is overwritten.

    Args:
        tag_index: Object to persist.
        path: Destination file for the pickle.
    """
    with open(path, "wb") as f:
        pickle.dump(tag_index, f)
    print(f"Tag index saved to {path}")

In [60]:
# Persist the in-memory tag index to disk.
# NOTE(review): save_tag_index must already be defined in the kernel when this cell runs.
save_tag_index(tag_index)

Tag index saved to tag_index.pkl
