# In [None]:  (Jupyter notebook cell marker, kept as a comment so the file parses as Python)
import os
import tkinter as tk
from tkinter import filedialog, ttk, messagebox
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations
import fitz  # PyMuPDF for PDFs
from docx import Document
import re
from collections import defaultdict

# -------------------------------
# Backend
# -------------------------------
def read_file(filepath):
    """Return the text content of a .txt, .docx or .pdf file.

    The file type is chosen from the (case-insensitive) end of *filepath*.
    Any other extension yields an empty string.
    """
    lowered = filepath.lower()

    if lowered.endswith(".txt"):
        # errors="ignore": bytes that are not valid UTF-8 are dropped
        # instead of raising a decode error.
        with open(filepath, "r", encoding="utf-8", errors="ignore") as handle:
            return handle.read()

    if lowered.endswith(".docx"):
        # One line per paragraph of the document.
        paragraphs = Document(filepath).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)

    if lowered.endswith(".pdf"):
        # Page texts are concatenated with no separator between pages.
        with fitz.open(filepath) as pdf:
            return "".join(page.get_text() for page in pdf)

    return ""

def clean_text(text):
    """Collapse every run of whitespace in *text* to one space and trim the ends."""
    single_spaced = re.sub(r"\s+", " ", text)
    return single_spaced.strip()

def chunk_text(text, chunk_size=150):
    """Split *text* into consecutive chunks of at most *chunk_size* words.

    Words are whitespace-separated tokens; each chunk is re-joined with single
    spaces. The last chunk may be shorter than *chunk_size*.

    Args:
        text: the string to split.
        chunk_size: maximum number of words per chunk (must be positive).

    Returns:
        A list of chunk strings; empty when *text* contains no words.

    Raises:
        ValueError: if *chunk_size* is zero or negative.
    """
    # A negative step would make range() empty and silently discard the whole
    # text; zero would fail with an unrelated-looking range() error.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

def plagiarism_check(folder_path, flag_threshold=0.2):
    """Compare every pair of documents in *folder_path* by embedding similarity.

    Each supported file (.txt, .docx, .pdf) directly inside the folder is read,
    whitespace-normalised and split into word chunks. The chunks are embedded
    with the "all-MiniLM-L6-v2" sentence-transformer model, and the similarity
    of two files is the mean of the cosine-similarity matrix between their
    chunk embeddings, expressed as a percentage.

    Args:
        folder_path: directory to scan (sub-directories are not entered).
        flag_threshold: fraction (0.2 means 20 %) at or above which a score is
            marked as flagged.

    Returns:
        A tuple ``(empty_files, pairwise_results, summary_results)``:
        - empty_files: full paths of supported files that yielded no text.
        - pairwise_results: rows ``[file1, file2, similarity_pct, status]``.
        - summary_results: rows ``[file, max_similarity_pct, top_match,
          status]``; a file that was compared with nothing gets ``0`` and
          ``"-"``.
    """
    supported_exts = (".txt", ".docx", ".pdf")

    # os.listdir returns entries in arbitrary order; sorting keeps the result
    # tables stable between runs. isfile() skips directories whose name merely
    # looks like a document.
    files = []
    for name in sorted(os.listdir(folder_path)):
        path = os.path.join(folder_path, name)
        if name.lower().endswith(supported_exts) and os.path.isfile(path):
            files.append(path)

    file_texts, empty_files = {}, []
    for file in files:
        text = clean_text(read_file(file))  # already stripped by clean_text
        if text:
            file_texts[file] = chunk_text(text)
        else:
            empty_files.append(file)

    threshold_pct = flag_threshold * 100

    def status_for(score):
        return "❌ Flagged" if score >= threshold_pct else "✅ Acceptable"

    # Embed each file exactly once instead of once per pair, and only load the
    # model when there is at least one pair to compare.
    embeddings = {}
    if len(file_texts) >= 2:
        model = SentenceTransformer("all-MiniLM-L6-v2")
        embeddings = {
            path: model.encode(chunks, convert_to_tensor=False)
            for path, chunks in file_texts.items()
        }

    pairwise_results = []
    # path -> (best score, basename of the best-matching file). A plain dict
    # replaces a numeric "-1" sentinel, which could collide with a genuinely
    # negative similarity percentage.
    best_match = {}

    for f1, f2 in combinations(file_texts, 2):
        sim_matrix = cosine_similarity(embeddings[f1], embeddings[f2])
        # float(): hand back plain Python numbers rather than NumPy scalars.
        avg_sim = float(sim_matrix.mean()) * 100

        pairwise_results.append(
            [os.path.basename(f1), os.path.basename(f2), round(avg_sim, 2), status_for(avg_sim)]
        )

        for path, other in ((f1, f2), (f2, f1)):
            if path not in best_match or avg_sim > best_match[path][0]:
                best_match[path] = (avg_sim, os.path.basename(other))

    summary_results = []
    for path in file_texts:
        max_score, top_match = best_match.get(path, (0, "-"))
        summary_results.append(
            [os.path.basename(path), round(max_score, 2), top_match, status_for(max_score)]
        )

    return empty_files, pairwise_results, summary_results

# -------------------------------
# GUI
# -------------------------------
class PlagiarismApp:
    """Tkinter front end for ``plagiarism_check``.

    The window lets the user pick a folder and a flag threshold (in percent),
    runs the detection, and shows the pairwise and per-file results in two
    colour-coded tables.
    """

    # Default value shown in the threshold spinbox, in percent. The spinbox
    # value is divided by 100 before being passed to plagiarism_check, so 20
    # here corresponds to that function's default of 0.2.
    DEFAULT_THRESHOLD_PERCENT = 20

    def __init__(self, root):
        """Attach the application to *root* (a ``tk.Tk`` window) and build the UI."""
        self.root = root
        self.root.title("Plagiarism Detector - Phase 1.5 (Color Coded)")

        self.folder_path = tk.StringVar()
        self.threshold = tk.DoubleVar(value=self.DEFAULT_THRESHOLD_PERCENT)

        self.build_ui()

    @staticmethod
    def _row_tag(row):
        """Return the Treeview tag name for a result row (status is column 3)."""
        return "flagged" if row[3] == "❌ Flagged" else "acceptable"

    def _add_results_table(self, parent, columns, row, height):
        """Create a headings-only Treeview with a vertical scrollbar on grid *row*."""
        tree = ttk.Treeview(parent, columns=columns, show="headings", height=height)
        for col in columns:
            tree.heading(col, text=col)
            tree.column(col, width=150, anchor="center")
        tree.grid(row=row, column=0, columnspan=3, sticky="nsew", pady=5)

        scrollbar = ttk.Scrollbar(parent, orient="vertical", command=tree.yview)
        tree.configure(yscrollcommand=scrollbar.set)
        scrollbar.grid(row=row, column=3, sticky="ns")

        # Tag styles used to colour rows by status.
        tree.tag_configure("flagged", foreground="red")
        tree.tag_configure("acceptable", foreground="green")
        return tree

    def build_ui(self):
        """Create and lay out all widgets inside the root window."""
        frame = ttk.Frame(self.root, padding=10)
        frame.pack(fill="both", expand=True)

        # Folder selection
        ttk.Label(frame, text="Select Folder:").grid(row=0, column=0, sticky="w")
        ttk.Entry(frame, textvariable=self.folder_path, width=40).grid(row=0, column=1, padx=5)
        ttk.Button(frame, text="Browse", command=self.browse_folder).grid(row=0, column=2)

        # Threshold
        ttk.Label(frame, text="Flag Threshold (%):").grid(row=1, column=0, sticky="w", pady=5)
        threshold_spin = ttk.Spinbox(frame, from_=0, to=100, increment=1,
                                     textvariable=self.threshold, width=5)
        threshold_spin.grid(row=1, column=1, sticky="w")

        # Run Button
        ttk.Button(frame, text="Run Detection", command=self.run_detection).grid(
            row=2, column=0, columnspan=3, pady=10)

        # Pairwise Results
        ttk.Label(frame, text="Pairwise Similarities:").grid(row=3, column=0, columnspan=3, sticky="w")
        self.tree_pairs = self._add_results_table(
            frame, ("File1", "File2", "Similarity", "Status"), row=4, height=10)

        # Summary Results
        ttk.Label(frame, text="Per-File Summary:").grid(
            row=5, column=0, columnspan=3, sticky="w", pady=(10, 0))
        self.tree_summary = self._add_results_table(
            frame, ("File", "Max Similarity", "Top Match", "Status"), row=6, height=8)

        # Let the two tables and the entry column absorb extra space.
        frame.rowconfigure(4, weight=1)
        frame.rowconfigure(6, weight=1)
        frame.columnconfigure(1, weight=1)

    def browse_folder(self):
        """Open a directory chooser and store the selection, if any."""
        folder = filedialog.askdirectory()
        if folder:
            self.folder_path.set(folder)

    def run_detection(self):
        """Validate the inputs, run ``plagiarism_check`` and fill both tables."""
        folder = self.folder_path.get()
        if not folder or not os.path.isdir(folder):
            messagebox.showerror("Error", "Please select a valid folder.")
            return

        # Read the threshold before clearing anything: a non-numeric spinbox
        # value makes DoubleVar.get() raise TclError, and the previous results
        # should stay visible in that case.
        try:
            threshold_percent = self.threshold.get()
        except tk.TclError:
            messagebox.showerror("Error", "Flag threshold must be a number between 0 and 100.")
            return

        self.tree_pairs.delete(*self.tree_pairs.get_children())
        self.tree_summary.delete(*self.tree_summary.get_children())

        # Top-level UI boundary: any failure is reported in a dialog.
        try:
            empty_files, results, summary = plagiarism_check(
                folder, flag_threshold=threshold_percent / 100)

            if empty_files:
                messagebox.showinfo(
                    "Empty Files",
                    "Skipped empty files:\n" + "\n".join(os.path.basename(f) for f in empty_files))

            for row in results:
                self.tree_pairs.insert("", "end", values=row, tags=(self._row_tag(row),))

            for row in summary:
                self.tree_summary.insert("", "end", values=row, tags=(self._row_tag(row),))

        except Exception as e:
            messagebox.showerror("Error", str(e))


# -------------------------------
# Run GUI
# -------------------------------
if __name__ == "__main__":
    # Build the top-level window, attach the application to it, then hand
    # control to Tk's event loop until the window is closed.
    main_window = tk.Tk()
    application = PlagiarismApp(main_window)
    main_window.mainloop()
