In [42]:
import json
import logging
import os
import sys
from pathlib import Path

from attr import dataclass

from haystack import Document
import numpy as np
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from pydantic import BaseModel
from sklearn.metrics.pairwise import cosine_similarity
from sympy import content
from transformers import pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory.embedding_retriever import InMemoryEmbeddingRetriever
from haystack.document_stores.types import DuplicatePolicy

EFS_PATH="."
doc_embedder = SentenceTransformersDocumentEmbedder()
doc_embedder.warm_up()
document_store = InMemoryDocumentStore()

@dataclass
class RequestData:
    id: str
    description: str 

pipe = pipeline("text2text-generation", model="deep-learning-analytics/automatic-title-generation", device="cuda")
def _generate_topic(document: str) -> list[dict[str, str]]:
  return pipe(document)

def _cluster_docs(request_data: list[dict], document_embeddings: dict[str, list[Document]], retriever: InMemoryEmbeddingRetriever) -> list:
    clusters = []
    visited_documents = set()

    for doc in document_embeddings["documents"]:
        if doc.meta["id"] in visited_documents:
            continue

        cluster = [doc.meta["id"]]
        visited_documents.add(doc.meta["id"])

        similar_documents = retriever.run(doc.embedding, top_k=50)["documents"]
        for similar_doc in similar_documents:
            if similar_doc.id not in visited_documents and similar_doc.score >= 0.7:
                cluster.append(similar_doc.id)
                visited_documents.add(similar_doc.id)

        if cluster:
            clusters.append(cluster)

    return clusters


def _generate_embeddings(request_data: list[dict], document_store:InMemoryDocumentStore) ->  tuple[dict[str, list[Document]], InMemoryEmbeddingRetriever]:
    documents = [Document(content=d["description"],meta={"id":d["id"]}) for d in request_data]
    document_store.write_documents(documents=documents)

    document_embeddings = doc_embedder.run(documents)
    retriever = InMemoryEmbeddingRetriever(document_store)
    return document_embeddings, retriever

def cluster_and_generate_topics(request_data: list[dict], document_embeddings: dict[str, list[Document]], retriever: InMemoryEmbeddingRetriever) -> list:

    clusters = _cluster_docs(request_data, document_embeddings, retriever)

    id_to_description = {request["id"]: request["description"] for request in request_data}

    clustered_topics = []
    for cluster_ids in clusters:
        cluster_documents = [id_to_description[request_id] for request_id in cluster_ids]
        joined_text = " ".join(cluster_documents)
        generated_topic = _generate_topic(joined_text)[0]["generated_text"]
        topic_data = {
            "results": [],
            "topic": generated_topic,
        }

        for request_id in cluster_ids:
          topic_data["results"].append({
              "request_id": request_id,
            #   "request_sequence": 0,
              "description": id_to_description[request_id],
          })
        clustered_topics.append(topic_data)

    filepath = Path(f"{EFS_PATH}/clustered_topics.json")
    with filepath.open(mode="w") as jsonfile:
        json.dump(clustered_topics, jsonfile)

    return clustered_topics


def find_similar_topic(
    document_store: InMemoryDocumentStore, new_request: dict[str, str]
) -> dict[str, str]:
    description = new_request["description"]
    request_id = new_request["id"]

    documents = [Document(content=description, meta={"id": request_id})]
    document_store.write_documents(documents=documents, policy=DuplicatePolicy.SKIP)

    document_embeddings = doc_embedder.run(documents)

    request_embed = [document_embeddings["documents"][0].embedding]

    file_path = Path(f"{EFS_PATH}/clustered_topics.json")
    with file_path.open(mode="r") as jsondata:
        topic_data = json.loads(jsondata.read())

    max_similarity = -1
    max_topic = None
    topic_item = None

    for item in topic_data:
        request_doc = document_store.filter_documents(
            filters={
                "field": "meta.id",
                "operator": "==",
                "value": item["results"][0]["request_id"],
            }
        )
        embedding_vector = [request_doc[0].embedding]

        similarity = cosine_similarity(embedding_vector, request_embed)

        if similarity > 0.7 and similarity > max_similarity:
            max_similarity = similarity
            max_topic = item["topic"]
            topic_item = item

    if max_topic is None:
        max_topic = _generate_topic(description)[0]["generated_text"]
        topic_data.append(
            {
                "results": [
                    {
                        "request_id": request_id,
                        # "request_sequence": 0,
                        "description": description,
                    }
                ],
                "topic": max_topic,
            }
        )
    else:
        for item in topic_data:
            if (
                item["results"][0]["request_id"]
                == topic_item["results"][0]["request_id"]
            ):
                item["results"].append(
                    {
                        "request_id": request_id,
                        # "request_sequence": 0,
                        "description": description,
                    }
                )

    filepath = Path(f"{EFS_PATH}/clustered_topics.json")
    with filepath.open(mode="w") as jsonfile:
        json.dump(topic_data, jsonfile)

    return {
        "topic": max_topic,
    }

In [43]:
import pandas as pd

df = pd.read_excel("FOIALog_FY15.xlsx")

In [44]:
df = df.rename(columns={"Request ID":"id","Request Description ": "description"})

In [45]:
request_data_frame = df[["id","description"]]

In [46]:


request_data = request_data_frame.to_dict(orient="records")

In [47]:
embedings, retriever = _generate_embeddings(request_data=request_data,document_store=document_store)

Batches:   0%|          | 0/57 [00:00<?, ?it/s]

In [48]:
# cluster_and_generate_topics(request_data=request_data,document_embeddings=embedings, retriever)
clustered = _cluster_docs(request_data, embedings, retriever)

In [49]:
c=  cluster_and_generate_topics(request_data, embedings, retriever)

Token indices sequence length is longer than the specified maximum sequence length for this model (1726 > 512). Running this sequence through the model will result in indexing errors


In [None]:
c[0]

In [None]:
new_data = {"id": "0211",
            "description": "Requesting records relating to Department of Defense Instruction (DODI) 5240.02, Department of Defense Counterintelligence Program."}

In [None]:
find_similar_topic(document_store, new_data)

In [None]:
topic_json = Path("clustered_topics.json")
with topic_json.open(mode="r") as jsondata:
    topic_data = json.loads(jsondata.read())

In [None]:
topic_data[0]

In [None]:
from gliner import GLiNER

def extract_pii_entities(data: str) -> dict[str, list[dict[str, str]]]:

    model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1").to("cpu")

    labels = [
        "person",
        "age",
        "organization",
        "phone number",
        "address",
        "passport number",
        "email",
        "credit card number",
        "social security number",
        "health insurance id number",
        "date of birth",
        "mobile phone number",
        "bank account number",
        "social_security_number",
        "medication",
        "cpf",
        "tax identification number",
        "driver's license number",
        "medical condition",
        "identity card number",
        "national id number",
        "ip address",
        "email address",
        "credit card expiration date",
        "username",
        "health insurance number",
        "registration number",
        "student id number",
        "insurance number",
        "flight number",
        "landline phone number",
        "blood type",
        "cvv",
        "reservation number",
        "digital signature",
        "social media handle",
        "license plate number",
        "cnpj",
        "postal code",
        "passport_number",
        "serial number",
        "vehicle registration number",
        "credit card brand",
        "fax number",
        "visa number",
        "insurance company",
        "identity document number",
        "transaction number",
        "national health insurance number",
        "cvc",
        "birth certificate number",
        "train ticket number",
        "passport expiration date",
        "biometric data",
    ]

    entities = model.predict_entities(data, labels)

    return {
        "entities": [
            {"entity_group": entity["label"], "start": entity["start"], "end": entity["end"], "text": entity["text"]}
            for entity in entities
        ],
    }