In [None]:
import tkinter as tk
from tkinter import filedialog, messagebox, scrolledtext
from tkinter import ttk
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from datetime import datetime
import re
import threading
from sentence_transformers import SentenceTransformer, util
import ollama
from openpyxl import Workbook

# ------------------------------
# Helper Functions
# ------------------------------

def extract_records(doc_path):
    """Read a .docx file and return its text split into "Record N" chunks.

    Non-empty paragraphs are stripped and joined with newlines, em/en dashes
    are replaced by plain hyphens, and the text is then cut in front of every
    line that begins with ``record <number>`` (case-insensitive). Text that
    precedes the first such line is discarded.

    Args:
        doc_path: Path of the Word document, passed to ``docx.Document``.

    Returns:
        A list of stripped record strings, in document order.
    """
    lines = []
    for para in Document(doc_path).paragraphs:
        stripped = para.text.strip()
        if stripped:
            lines.append(stripped)

    normalized = "\n".join(lines).replace("—", "-").replace("–", "-")

    # Zero-width split: each chunk keeps its own "Record N" heading line.
    chunks = re.split(
        r'(?im)(?=^\s*record\s+\d+\b)',
        normalized
    )

    # Only chunks that actually start with a record heading are kept.
    return [
        chunk.strip()
        for chunk in chunks
        if re.match(r'(?im)^\s*record\s+\d+\b', chunk)
    ]


def extract_sections(record_text):
    """Pull the mitigation, prevention and resources parts out of one record.

    Mitigation and prevention text is everything following the respective
    keyword (case-insensitive, optional ``:``/``-``/whitespace separator) up
    to the next numbered item (a newline followed by ``<digits>.``) or the end
    of the record. Resources are the framework acronyms NIST, HIPAA, GDPR and
    HHS mentioned anywhere in the record.

    Args:
        record_text: Text of a single record; may be an empty string.

    Returns:
        A dict with the keys ``"mitigation"``, ``"prevention"`` and
        ``"resources"``. Multiple matches for a section are joined with
        newlines. ``"resources"`` is a comma-separated list of upper-cased
        acronyms, de-duplicated and sorted so the result is deterministic.
    """
    mitigation = re.findall(
        r'Mitigation[:\s-]*(.*?)(?=\n\d+\.|$)',
        record_text,
        re.DOTALL | re.IGNORECASE,
    )
    prevention = re.findall(
        r'Prevention[:\s-]*(.*?)(?=\n\d+\.|$)',
        record_text,
        re.DOTALL | re.IGNORECASE,
    )

    # Word boundaries stop the acronyms from matching inside ordinary words
    # (e.g. "NIST" inside "administrator").
    resource_hits = re.findall(
        r'\b(NIST|HIPAA|GDPR|HHS)\b',
        record_text,
        re.IGNORECASE,
    )
    # Normalise case before de-duplicating, and sort: a bare set() would keep
    # "HIPAA" and "hipaa" as separate entries and yield an arbitrary order.
    resources = sorted({hit.upper() for hit in resource_hits})

    return {
        "mitigation": "\n".join(mitigation).strip(),
        "prevention": "\n".join(prevention).strip(),
        "resources": ", ".join(resources)
    }


def get_llm_insight(section, text1, text2):
    """Ask the local ``llama3`` model (via ollama) to compare two section texts.

    When one or both texts are empty no model call is made and a fixed
    explanatory sentence is returned instead.

    Args:
        section: Human-readable section name inserted into the prompt.
        text1: Section text from document 1 (may be empty).
        text2: Section text from document 2 (may be empty).

    Returns:
        The model's answer with surrounding whitespace removed, a fixed
        message for missing content, or ``"LLM insight unavailable: ..."``
        if the ollama call raises.
    """
    has_first = bool(text1)
    has_second = bool(text2)

    # Nothing to compare unless both sides have content.
    if not (has_first or has_second):
        return "Both documents have no content for this section."
    if not has_first:
        return "Only Document 2 has content in this section."
    if not has_second:
        return "Only Document 1 has content in this section."

    prompt = f"""
Compare the following two {section} sections.
Explain similarities and differences. Highlight common themes and missing elements.

Document 1 {section}:
{text1}

Document 2 {section}:
{text2}
"""

    chat_messages = [{"role": "user", "content": prompt}]

    # Best-effort: any failure of the LLM call is reported as text so the
    # caller always receives a string.
    try:
        reply = ollama.chat(model="llama3", messages=chat_messages)
        answer = reply["message"]["content"]
        return answer.strip()
    except Exception as e:
        return f"LLM insight unavailable: {e}"


# ------------------------------
# GUI Application
# ------------------------------

class BreachComparerApp(tk.Tk):
    """Main window: load two Word documents, compare their records, export.

    Records are paired by position. For every pair the mitigation, prevention
    and resources sections are scored with a sentence-embedding cosine
    similarity and described by an LLM. Results are shown in the window and
    can be exported to Excel or Word.

    The embedding model is loaded and the comparison is run on background
    threads. Every widget update made from those threads is scheduled with
    ``after()`` so that it is executed by the Tk event loop.
    """

    def __init__(self):
        """Create the window, initialise state and start loading the model."""
        super().__init__()
        self.title("Healthcare Breach Semantic Comparator with Insights")
        self.geometry("1350x820")

        self.doc1_records = []
        self.doc2_records = []

        self.comparison_results = []
        self.overall_averages = {}

        self.model = None
        # Exception raised while loading the model, or None.
        self.model_error = None

        self.setup_ui()
        threading.Thread(target=self.load_model, daemon=True).start()

    def load_model(self):
        """Load the sentence-embedding model (runs on a background thread).

        A failure is stored in ``self.model_error`` instead of being lost with
        the thread, so it can be reported when a comparison is requested.
        """
        try:
            self.model = SentenceTransformer('all-MiniLM-L6-v2')
        except Exception as exc:
            self.model_error = exc

    def semantic_similarity(self, text1, text2):
        """Return the cosine similarity of two texts as a percentage.

        Returns ``0.0`` when either text is empty or the model is not loaded;
        otherwise the similarity multiplied by 100 and rounded to 2 decimals.
        """
        if not text1 or not text2 or self.model is None:
            return 0.0

        emb1 = self.model.encode(text1, convert_to_tensor=True)
        emb2 = self.model.encode(text2, convert_to_tensor=True)
        sim = util.cos_sim(emb1, emb2).item()
        return round(sim * 100, 2)

    # ---------------- UI ----------------

    def setup_ui(self):
        """Create and lay out all widgets on the window's grid."""

        self.btn_load1 = tk.Button(self, text="Load Document 1", command=self.load_doc1)
        self.btn_load1.grid(row=0, column=0, padx=5, pady=5)

        self.doc1_label = tk.Label(self, text="No file loaded")
        self.doc1_label.grid(row=0, column=1, sticky='w')

        self.btn_load2 = tk.Button(self, text="Load Document 2", command=self.load_doc2)
        self.btn_load2.grid(row=1, column=0, padx=5, pady=5)

        self.doc2_label = tk.Label(self, text="No file loaded")
        self.doc2_label.grid(row=1, column=1, sticky='w')

        self.btn_compare = tk.Button(self, text="Compare Records", command=self.start_comparison_thread)
        self.btn_compare.grid(row=2, column=0, pady=10)

        self.btn_export_excel = tk.Button(self, text="Download Excel Report", command=self.export_excel)
        self.btn_export_excel.grid(row=2, column=1, pady=10)

        self.btn_export_word = tk.Button(self, text="Download Word Report", command=self.export_word_report)
        self.btn_export_word.grid(row=2, column=2, pady=10)

        self.progress_label = tk.Label(self, text="Ready")
        self.progress_label.grid(row=2, column=3, padx=10)

        self.progress_bar = ttk.Progressbar(self, length=300, mode='determinate')
        self.progress_bar.grid(row=2, column=4, padx=10)

        self.result_text = scrolledtext.ScrolledText(self, width=160, height=18, wrap=tk.WORD)
        self.result_text.grid(row=3, column=0, columnspan=5, padx=10, pady=10)

        # Per-record similarity table.
        self.tree = ttk.Treeview(
            self,
            columns=("Record", "Mitigation", "Prevention", "Resources", "Overall"),
            show="headings"
        )

        for col in self.tree["columns"]:
            self.tree.heading(col, text=col)

        self.tree.grid(row=4, column=0, columnspan=5, padx=10, pady=10)

        # Per-section averages over all records.
        self.overall_table = ttk.Treeview(
            self,
            columns=("Section", "Average Similarity"),
            show="headings"
        )

        self.overall_table.heading("Section", text="Section")
        self.overall_table.heading("Average Similarity", text="Average Similarity (%)")

        self.overall_table.grid(row=5, column=0, columnspan=5, padx=10, pady=10)

    def _set_action_buttons_state(self, state):
        """Set the state of the compare and both export buttons."""
        self.btn_compare.config(state=state)
        self.btn_export_excel.config(state=state)
        self.btn_export_word.config(state=state)

    # ---------------- File Load ----------------

    def _load_records(self, label):
        """Ask for a .docx file, parse it and update *label* with the count.

        Returns the list of records, or ``None`` when the dialog was cancelled
        or the file could not be read (an error dialog is shown in that case).
        """
        path = filedialog.askopenfilename(filetypes=[("Word Documents", "*.docx")])
        if not path:
            return None

        # UI boundary: report an unreadable file instead of letting the
        # exception escape the Tk callback.
        try:
            records = extract_records(path)
        except Exception as exc:
            messagebox.showerror("Error", f"Could not read the document:\n{exc}")
            return None

        label.config(text=f"{len(records)} records loaded")
        return records

    def load_doc1(self):
        """Load the records of document 1 from a user-selected file."""
        records = self._load_records(self.doc1_label)
        if records is not None:
            self.doc1_records = records

    def load_doc2(self):
        """Load the records of document 2 from a user-selected file."""
        records = self._load_records(self.doc2_label)
        if records is not None:
            self.doc2_records = records

    # ---------------- Comparison ----------------

    def start_comparison_thread(self):
        """Validate preconditions, reset the views and start the comparison."""

        if not self.doc1_records or not self.doc2_records:
            messagebox.showwarning("Warning", "Please load both documents first.")
            return

        # Without the model every similarity would silently come out as 0.0.
        if self.model is None:
            if self.model_error is not None:
                messagebox.showerror(
                    "Error",
                    f"The similarity model could not be loaded:\n{self.model_error}"
                )
            else:
                messagebox.showwarning(
                    "Warning",
                    "The similarity model is still loading. Please try again in a moment."
                )
            return

        self.result_text.delete(1.0, tk.END)
        self.tree.delete(*self.tree.get_children())
        self.overall_table.delete(*self.overall_table.get_children())
        self.comparison_results.clear()

        total_records = max(len(self.doc1_records), len(self.doc2_records))

        self.progress_bar["value"] = 0
        self.progress_bar["maximum"] = total_records
        self.progress_label.config(text=f"Processing 0 of {total_records}")

        self._set_action_buttons_state("disabled")

        threading.Thread(target=self.compare_records, daemon=True).start()

    def update_progress(self, current, total):
        """Show progress and re-enable the buttons once *current* == *total*."""
        self.progress_bar["value"] = current
        self.progress_label.config(text=f"Processing {current} of {total}")

        if current == total:
            self.progress_label.config(text="Processing Complete")
            self._set_action_buttons_state("normal")

    def compare_records(self):
        """Compare all record pairs; intended to run on a worker thread.

        Any exception is forwarded to the event loop so the buttons are
        re-enabled and the user is told, rather than the UI staying locked.
        """
        try:
            self._compare_all_records()
        except Exception as exc:
            self.after(0, self._comparison_failed, exc)

    def _compare_all_records(self):
        """Score every record pair, store the results and schedule UI updates."""

        total_records = max(len(self.doc1_records), len(self.doc2_records))

        mitigation_scores = []
        prevention_scores = []
        resource_scores = []

        for i in range(total_records):

            # The shorter document is padded with empty records.
            rec1 = self.doc1_records[i] if i < len(self.doc1_records) else ""
            rec2 = self.doc2_records[i] if i < len(self.doc2_records) else ""

            sec1 = extract_sections(rec1)
            sec2 = extract_sections(rec2)

            mitigation_sim = self.semantic_similarity(sec1['mitigation'], sec2['mitigation'])
            prevention_sim = self.semantic_similarity(sec1['prevention'], sec2['prevention'])
            resources_sim = self.semantic_similarity(sec1['resources'], sec2['resources'])

            # Only positive section scores contribute to the record's overall score.
            valid = [s for s in [mitigation_sim, prevention_sim, resources_sim] if s > 0]
            overall_sim = round(sum(valid) / len(valid), 2) if valid else 0

            mitigation_scores.append(mitigation_sim)
            prevention_scores.append(prevention_sim)
            resource_scores.append(resources_sim)

            result = {
                "record": i + 1,
                "mitigation": mitigation_sim,
                "prevention": prevention_sim,
                "resources": resources_sim,
                "overall": overall_sim,
                "mitigation_insight": get_llm_insight("Mitigation", sec1['mitigation'], sec2['mitigation']),
                "prevention_insight": get_llm_insight("Prevention", sec1['prevention'], sec2['prevention']),
                "resources_insight": get_llm_insight("Resources", sec1['resources'], sec2['resources'])
            }
            self.comparison_results.append(result)

            # Widgets are only touched from callbacks run by the event loop.
            self.after(0, self._show_record_result, result, total_records)

        avg_mit = self._average(mitigation_scores)
        avg_prev = self._average(prevention_scores)
        avg_res = self._average(resource_scores)
        avg_overall = round((avg_mit + avg_prev + avg_res) / 3, 2)

        self.overall_averages = {
            "Mitigation": avg_mit,
            "Prevention": avg_prev,
            "Resources": avg_res,
            "Overall": avg_overall
        }

        self.after(0, self._show_overall_averages)

    @staticmethod
    def _average(scores):
        """Return the mean of *scores* rounded to 2 decimals, or 0 if empty."""
        return round(sum(scores) / len(scores), 2) if scores else 0

    def _show_record_result(self, result, total_records):
        """Add one record's scores and insights to the views, then update progress."""
        number = result["record"]

        self.tree.insert("", "end",
                         values=(f"Record {number}", result["mitigation"],
                                 result["prevention"], result["resources"],
                                 result["overall"]))

        self.result_text.insert(tk.END,
            f"=== Record {number} ===\n"
            f"Mitigation ({result['mitigation']}%):\n{result['mitigation_insight']}\n\n"
            f"Prevention ({result['prevention']}%):\n{result['prevention_insight']}\n\n"
            f"Resources ({result['resources']}%):\n{result['resources_insight']}\n\n"
            f"Overall Similarity: {result['overall']}%\n"
            + "=" * 100 + "\n\n"
        )

        self.update_progress(number, total_records)

    def _show_overall_averages(self):
        """Fill the averages table from ``self.overall_averages``."""
        for k, v in self.overall_averages.items():
            self.overall_table.insert("", "end", values=(k, v))

    def _comparison_failed(self, error):
        """Unlock the UI and report a comparison that ended with an exception."""
        self.progress_label.config(text="Comparison failed")
        self._set_action_buttons_state("normal")
        messagebox.showerror("Error", f"Comparison failed:\n{error}")

    # ---------------- Excel Export ----------------

    def export_excel(self):
        """Write the per-record scores and the averages to an .xlsx workbook."""

        if not self.comparison_results:
            messagebox.showwarning("Warning", "No results to export.")
            return

        file_path = filedialog.asksaveasfilename(
            defaultextension=".xlsx",
            filetypes=[("Excel Files", "*.xlsx")]
        )

        if not file_path:
            return

        wb = Workbook()

        ws1 = wb.active
        ws1.title = "Record Comparison"
        ws1.append(["Record", "Mitigation", "Prevention", "Resources", "Overall"])

        for r in self.comparison_results:
            ws1.append([r["record"], r["mitigation"],
                        r["prevention"], r["resources"], r["overall"]])

        ws2 = wb.create_sheet("Overall Averages")
        ws2.append(["Section", "Average Similarity (%)"])

        for k, v in self.overall_averages.items():
            ws2.append([k, v])

        # Saving fails e.g. when the target is read-only or open elsewhere.
        try:
            wb.save(file_path)
        except OSError as exc:
            messagebox.showerror("Error", f"Could not save the Excel report:\n{exc}")
            return

        messagebox.showinfo("Success", "Excel report exported successfully!")

    # ---------------- Word Export ----------------

    def export_word_report(self):
        """Write a Word report: summary page, then one page per record."""

        if not self.comparison_results:
            messagebox.showwarning("Warning", "No results to export.")
            return

        file_path = filedialog.asksaveasfilename(
            defaultextension=".docx",
            filetypes=[("Word Documents", "*.docx")]
        )

        if not file_path:
            return

        doc = Document()

        doc.add_heading("Healthcare Breach Semantic Comparison Report", level=0)
        doc.add_paragraph(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        doc.add_paragraph("=" * 60)

        doc.add_heading("Overall Similarity Summary", level=1)
        for section, value in self.overall_averages.items():
            doc.add_paragraph(f"{section}: {value}%", style='List Bullet')

        doc.add_page_break()

        for record in self.comparison_results:

            doc.add_heading(f"Record {record['record']}", level=1)

            doc.add_paragraph(f"Mitigation Similarity: {record['mitigation']}%")
            doc.add_paragraph(f"Prevention Similarity: {record['prevention']}%")
            doc.add_paragraph(f"Resources Similarity: {record['resources']}%")
            doc.add_paragraph(f"Overall Similarity: {record['overall']}%")

            doc.add_heading("Mitigation Insight", level=2)
            doc.add_paragraph(record["mitigation_insight"])

            doc.add_heading("Prevention Insight", level=2)
            doc.add_paragraph(record["prevention_insight"])

            doc.add_heading("Resources Insight", level=2)
            doc.add_paragraph(record["resources_insight"])

            doc.add_page_break()

        # Saving fails e.g. when the target is read-only or open elsewhere.
        try:
            doc.save(file_path)
        except OSError as exc:
            messagebox.showerror("Error", f"Could not save the Word report:\n{exc}")
            return

        messagebox.showinfo("Success", "Word report exported successfully!")


# ------------------------------
# Run App
# ------------------------------

if __name__ == "__main__":
    # Script entry point: build the main window (its constructor also starts
    # loading the embedding model on a background thread) and hand control to
    # the Tk event loop.
    app = BreachComparerApp()
    app.mainloop()


In [1]:
pip install python-docx sentence-transformers ollama openpyxl

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Prerequisite: before running the app, download the LLM in a shell with:
#   ollama pull llama3
