# In [None]:
import random
import re
import threading
import time
import tkinter as tk
from tkinter import filedialog, ttk

import pandas as pd
from docx import Document
from docx.shared import Pt
from ollama import chat
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Shared module state
# Both model objects stay None until load_and_train_classifier() assigns them.
vectorizer = classifier = None
# One dict per processed record; appended to by process_file() and read by save_output().
output_df = list()

# Train classifier
def load_and_train_classifier(df):
    """Fit the module-level TF-IDF vectorizer and Naive Bayes classifier.

    Rows missing either "Web Description" or "Type of Breach" are dropped
    before fitting. The fitted objects are stored in the ``vectorizer`` and
    ``classifier`` globals; nothing is returned.
    """
    global vectorizer, classifier

    labelled = df.dropna(subset=["Web Description", "Type of Breach"])
    descriptions = labelled["Web Description"].astype(str)
    breach_types = labelled["Type of Breach"].astype(str)

    vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
    features = vectorizer.fit_transform(descriptions)

    classifier = MultinomialNB()
    classifier.fit(features, breach_types)

# Predict breach type
def predict_breach_type(desc):
    """Return the label the trained classifier assigns to *desc*.

    Reads the module-level ``vectorizer`` and ``classifier``, which
    ``load_and_train_classifier`` must have set beforehand.
    """
    predict = classifier.predict
    features = vectorizer.transform([desc])
    return predict(features)[0]

# Ollama LLM
def generate_ollama_response(system_prompt, user_prompt):
    """Send a system/user prompt pair to the ``llama3`` model via ``chat``.

    Returns the reply text with surrounding whitespace removed. Any exception
    raised while calling the model or reading its reply is caught and
    returned as an ``"Ollama Error: ..."`` string rather than propagated.
    """
    conversation = [
        dict(role="system", content=system_prompt),
        dict(role="user", content=user_prompt),
    ]
    try:
        reply = chat(model="llama3", messages=conversation)
        text = reply.message.content
        return text.strip()
    except Exception as err:
        return f"Ollama Error: {err}"

# Extract cause of breach from LLM response (simple heuristic)
# "cause" must start a word so that "because" is not treated as a match;
# "causes", "caused", "root cause" and "likely cause" still match.
_CAUSE_WORD = re.compile(r"\bcause", re.IGNORECASE)


def extract_cause_of_breach(llm_response):
    """Pick the line of an LLM reply that most likely states the breach cause.

    Args:
        llm_response: Full text returned by the model (may be empty or None).

    Returns:
        The first line, stripped, that contains a word starting with "cause"
        (case-insensitive). If no line matches, the first non-blank line,
        stripped. An empty string when there is no usable text.
    """
    if not llm_response:
        return ""

    stripped_lines = [line.strip() for line in llm_response.splitlines()]
    for line in stripped_lines:
        if _CAUSE_WORD.search(line):
            return line

    # Fall back to the first line that has content, not a leading blank line.
    return next((line for line in stripped_lines if line), "")

# Update UI safely from thread
def update_ui(i, entity, desc, breach, response):
    """Append one record's details to the results pane and scroll to the end.

    ``i`` is displayed as ``i + 1`` in the record header.
    """
    segments = (
        f"\n--- Record {i+1} ---\n",
        f"Covered Entity: {entity}\n",
        f"Web Description: {desc}\n",
        f"Predicted Breach Type: {breach}\n",
        f"LLM Insight (Mitigation / Prevention / Online Resources):\n{response}\n",
        "-" * 80 + "\n",
    )
    for segment in segments:
        result_text.insert(tk.END, segment)
    result_text.see(tk.END)

# File processing
def _cell_text(value, default=""):
    """Return a spreadsheet cell as stripped text, or *default* if blank/NaN."""
    if value is None or pd.isna(value):
        return default
    return str(value).strip() or default


def process_file(path):
    """Read the Excel file at *path*, train the classifier, and analyse each row.

    For every row with a non-blank "Web Description" this predicts the breach
    type, asks the LLM for mitigation/prevention/resources, schedules a UI
    update via ``root.after``, and appends a result dict to the module-level
    ``output_df`` list. ``browse`` runs this function on a worker thread.

    Args:
        path: Location of the workbook passed to ``pd.read_excel``.

    Returns:
        None. Read, schema and training failures are reported in the results
        pane and end processing early.
    """
    global output_df
    output_df = []  # Clear old results

    try:
        df = pd.read_excel(path)
    except Exception as e:
        # Build the message now: ``e`` is unbound as soon as this except block
        # exits, so a lambda that reads it later in the Tk loop would fail.
        root.after(0, result_text.insert, tk.END, f"Failed to read file: {e}\n")
        return

    required_cols = {"Web Description", "Type of Breach", "Name of Covered Entity"}
    if not required_cols.issubset(df.columns):
        missing = required_cols - set(df.columns)
        root.after(0, result_text.insert, tk.END, f"Missing required columns: {missing}\n")
        return

    try:
        load_and_train_classifier(df)
    except Exception as e:
        # Without this the worker thread would die with nothing shown to the user.
        root.after(0, result_text.insert, tk.END, f"Failed to train classifier: {e}\n")
        return

    # --- Updated system prompt ---
    sys_prompt = (
        "You are a cybersecurity research assistant. "
        "For each healthcare breach description, write a structured analysis with three sections:\n"
        "1. Mitigation steps — practical actions an organization can take to reduce current impact.\n"
        "2. Prevention strategies — proactive measures to avoid similar incidents in the future.\n"
        "3. Online resources — list only freely available online cybersecurity or privacy resources "
        "(such as security blogs, industry articles, open-source tools, awareness training sites, or reputable tech forums). "
        "Avoid citing or referencing official frameworks, standards, or regulatory sites such as NIST, HIPAA, or HHS. "
        "Make the resource list appear varied and random each time. "
        "Do not summarize or quote guidelines — only refer to open online materials."
    )

    # Number records by position so a non-default index cannot break ``+ 1``.
    for position, (_, row) in enumerate(df.iterrows()):
        # NaN is truthy, so ``value or ""`` would turn a blank cell into "nan".
        desc = _cell_text(row.get("Web Description"))
        if not desc:
            continue
        entity = _cell_text(row.get("Name of Covered Entity"), "Unknown Entity")

        breach = predict_breach_type(desc)

        variation_prompt = f"Include a few randomly chosen online sources. Random tag: {random.randint(1000, 9999)}"

        # NOTE(review): the description text itself is not part of this prompt;
        # only the entity and predicted type are sent. Confirm this is intended.
        user_prompt = f"""
Covered Entity: {entity}
Predicted Breach Type: {breach}

Please provide:
1. Mitigation steps
2. Prevention strategies
3. Online resources (exclude NIST, HIPAA, HHS, and government frameworks)

{variation_prompt}
"""

        # Call the Ollama LLM to get the response
        response = generate_ollama_response(sys_prompt, user_prompt)

        # Remove random tag line if echoed by the model
        response = "\n".join(
            line for line in response.splitlines() if "random tag" not in line.lower()
        )

        cause = extract_cause_of_breach(response)

        root.after(0, update_ui, position, entity, desc, breach, response)

        output_df.append({
            "Record #": position + 1,
            "Covered Entity": entity,
            "Web Description": desc,
            "Predicted Breach Type": breach,
            "Cause of Breach": cause,
            "LLM Insight": response,
        })

        time.sleep(0.2)  # To avoid hammering the LLM

# Save output to Word document
def save_output():
    """Export the collected results to a Word document chosen by the user.

    Does nothing beyond a notice when ``output_df`` is empty, and returns
    silently if the save dialog is cancelled. Any error while building or
    saving the document is reported in the results pane.
    """
    if not output_df:
        result_text.insert(tk.END, "\nNo data to save. Process a file first.\n")
        return

    save_path = filedialog.asksaveasfilename(
        defaultextension=".docx",
        filetypes=[("Word Documents", "*.docx")],
        title="Save Output As",
    )
    if not save_path:
        return

    try:
        report = Document()
        report.add_heading("Breach Insight Report", level=1)

        for entry in output_df:
            report.add_heading(f"Record {entry['Record #']}", level=2)

            summary_lines = (
                f"Covered Entity: {entry['Covered Entity']}",
                f"Web Description:\n{entry['Web Description']}",
                f"Predicted Breach Type: {entry['Predicted Breach Type']}",
                f"Cause of Breach: {entry['Cause of Breach']}",
                "LLM Insight (Mitigation / Prevention / Online Resources):",
            )
            for text in summary_lines:
                report.add_paragraph(text, style="Normal")

            # Extra spacing after the insight body, before the separator rule.
            insight_par = report.add_paragraph(entry["LLM Insight"], style="Normal")
            insight_par.paragraph_format.space_after = Pt(12)

            report.add_paragraph("-" * 80, style="Normal")

        report.save(save_path)
        result_text.insert(tk.END, f"\nOutput saved to: {save_path}\n")
    except Exception as err:
        result_text.insert(tk.END, f"\nFailed to save file: {err}\n")

# File browser trigger
def browse():
    """Ask for an .xlsx file and process it on a background thread.

    Returns without side effects if the dialog is cancelled. Otherwise the
    results pane and ``output_df`` are cleared before ``process_file`` starts
    on a daemon thread.
    """
    global output_df

    chosen = filedialog.askopenfilename(filetypes=[("Excel files", "*.xlsx")])
    if not chosen:
        return

    result_text.delete("1.0", tk.END)
    output_df = []
    worker = threading.Thread(target=process_file, args=(chosen,), daemon=True)
    worker.start()

# GUI Setup
# Builds the main window at import time. `root` and `result_text` are
# module-level names that the functions above read when they run.
root = tk.Tk()
root.title("Breach Insight (Online Resources Only)")

# Top row: label plus the two action buttons.
frame = ttk.Frame(root, padding=10)
frame.grid(row=0, column=0, sticky=(tk.W, tk.E))

ttk.Label(frame, text="Upload Excel File:").grid(column=0, row=0, sticky=tk.W)
ttk.Button(frame, text="Browse", command=browse).grid(column=1, row=0, padx=5)
ttk.Button(frame, text="Download Output", command=save_output).grid(column=2, row=0, padx=5)

# Second row: results pane with a vertical scrollbar.
text_frame = ttk.Frame(root)
text_frame.grid(row=1, column=0, padx=10, pady=10, sticky="nsew")

result_text = tk.Text(text_frame, wrap="word", width=100, height=40)
scrollbar = ttk.Scrollbar(text_frame, orient="vertical", command=result_text.yview)
# Link the two widgets both ways: the scrollbar drives the text view (above)
# and the text view reports its position back to the scrollbar.
result_text.configure(yscrollcommand=scrollbar.set)
result_text.grid(row=0, column=0, sticky="nsew")
scrollbar.grid(row=0, column=1, sticky="ns")

# Give the extra space to the text area (grid row 1 / column 0) on resize.
text_frame.columnconfigure(0, weight=1)
text_frame.rowconfigure(0, weight=1)
root.columnconfigure(0, weight=1)
root.rowconfigure(1, weight=1)

# Blocks here until the window is closed.
root.mainloop()
