In [18]:
import os
import itertools
import csv
import fitz  # PyMuPDF
from docx import Document
from sentence_transformers import SentenceTransformer, util
from tabulate import tabulate
import networkx as nx  # For clustering (graph-based)

# ---------- CONFIG ----------
# Folder that main() scans for documents to compare.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
FOLDER_PATH = r"C:\Users\HP\Desktop\New folder (2)\Check Plag"
# A pair is flagged when its similarity percentage is strictly greater than this value.
SIMILARITY_THRESHOLD = 40  # percent
# Report file name; relative, so it is created in the current working directory.
OUTPUT_CSV = "plagiarism_report.csv"

# Load Sentence-BERT model
# Instantiated once when this cell runs and reused by calculate_similarity_sbert().
MODEL = SentenceTransformer('all-MiniLM-L6-v2')

# ---------- FUNCTIONS ----------
def read_file(filepath):
    """Extract the plain text of a TXT, DOCX, or PDF file.

    The file type is chosen from the (case-insensitive) filename suffix.

    Args:
        filepath: Path to the document.

    Returns:
        The extracted text as a single string, or "" when the suffix is not
        one of .txt / .docx / .pdf.

    Raises:
        Whatever open(), docx.Document() or fitz.open() raise for a missing
        or unreadable file; nothing is caught here.
    """
    lowered = filepath.lower()
    if lowered.endswith(".txt"):
        # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also drops a
        # leading byte-order mark, which would otherwise survive as "\ufeff"
        # glued to the first word. Undecodable bytes are still ignored.
        with open(filepath, "r", encoding="utf-8-sig", errors="ignore") as f:
            return f.read()
    if lowered.endswith(".docx"):
        doc = Document(filepath)
        # NOTE(review): only doc.paragraphs is read; text held in tables or
        # headers/footers is presumably not included — confirm if that matters.
        return "\n".join(p.text for p in doc.paragraphs)
    if lowered.endswith(".pdf"):
        with fitz.open(filepath) as pdf:
            # Pages are concatenated with no separator, in page order.
            return "".join(page.get_text() for page in pdf)
    return ""

def clean_text(text):
    """Normalise text: trim it, lowercase it, and squeeze whitespace runs to one space."""
    lowered = text.strip().lower()
    words = lowered.split()  # no-arg split() breaks on any run of whitespace
    return " ".join(words)

def scan_folder(folder):
    """List the .txt, .docx and .pdf files directly inside a folder.

    Entries whose name starts with "~$" or "." (temp/hidden files) are skipped,
    as are directories, even when their name ends in a supported extension.
    Sub-folders are not searched.

    Args:
        folder: Directory to scan.

    Returns:
        A list of paths (folder joined with each file name), sorted by file
        name so that downstream reports come out in a reproducible order.

    Raises:
        OSError: propagated from os.listdir() if the folder cannot be listed.
    """
    valid_ext = ('.txt', '.docx', '.pdf')
    files = []
    # os.listdir() returns entries in arbitrary order; sort for stable output.
    for name in sorted(os.listdir(folder)):
        if name.startswith(('~$', '.')):  # skip temp/hidden
            continue
        if not name.lower().endswith(valid_ext):
            continue
        path = os.path.join(folder, name)
        # A directory called e.g. "notes.txt" would otherwise be handed to open().
        if os.path.isfile(path):
            files.append(path)
    return files

def calculate_similarity_sbert(texts):
    """Return the pairwise cosine-similarity matrix of the given texts.

    Each text is embedded with the module-level MODEL; entry [i, j] of the
    result is the cosine similarity between texts[i] and texts[j], returned
    as a NumPy array.

    NOTE(review): the model presumably truncates inputs beyond its maximum
    sequence length, so long documents may be compared on their opening
    portion only — confirm against the model card.
    """
    vectors = MODEL.encode(texts, convert_to_tensor=True)
    pairwise = util.cos_sim(vectors, vectors)
    # Move to host memory before converting to NumPy.
    return pairwise.cpu().numpy()

# ---------- MAIN ----------
def main():
    """Run the plagiarism check over FOLDER_PATH and report the results.

    Steps:
      1. Collect the supported documents with scan_folder(), then read and
         clean each one.
      2. Score every unordered pair of non-empty documents (cosine similarity
         scaled to a percentage) and flag pairs above SIMILARITY_THRESHOLD.
      3. Print the pairwise table, write it to OUTPUT_CSV, list the flagged
         pairs, print a clustering summary, and list the empty files.

    The clustering summary shows the connected components of a graph whose
    nodes are the non-empty documents and whose edges are the flagged pairs;
    a document with no flagged pair forms a single-member group. Empty
    documents are not scored and are reported separately.

    Returns:
        None. Results are printed and written to OUTPUT_CSV.
    """
    if not os.path.isdir(FOLDER_PATH):
        print(f" Folder not found: '{FOLDER_PATH}'")
        return

    files = scan_folder(FOLDER_PATH)
    if not files:
        print(" No files found in the folder.")
        return

    # Read and clean texts
    texts = [clean_text(read_file(f)) for f in files]

    # Separate empty and non-empty files (pairs keep file/text aligned)
    empty_files = [f for f, t in zip(files, texts) if not t.strip()]
    non_empty = [(f, t) for f, t in zip(files, texts) if t.strip()]
    non_empty_files = [f for f, _ in non_empty]
    non_empty_texts = [t for _, t in non_empty]

    results = []
    flagged = []
    G = nx.Graph()  # Graph for clustering

    # Register every scored document up front so that documents without any
    # flagged pair still appear in the clustering summary as singletons.
    G.add_nodes_from(os.path.basename(f) for f in non_empty_files)

    # If at least 2 non-empty files exist, compute similarities
    if len(non_empty_files) > 1:
        similarity_matrix = calculate_similarity_sbert(non_empty_texts)

        for (i, f1), (j, f2) in itertools.combinations(enumerate(non_empty_files), 2):
            name1, name2 = os.path.basename(f1), os.path.basename(f2)
            sim = similarity_matrix[i, j] * 100
            is_flagged = sim > SIMILARITY_THRESHOLD
            status = " Flagged" if is_flagged else " Acceptable"
            results.append([name1, name2, f"{sim:.2f}%", status])

            if is_flagged:
                flagged.append((f1, f2, sim))
                # Add edge in graph for clustering
                G.add_edge(name1, name2, weight=sim)

    # Mark empty files in the table. They are deliberately NOT added to the
    # graph: they were never compared, so labelling them "unique" would be wrong.
    for f in empty_files:
        results.append([os.path.basename(f), "-", "Empty File", " Skipped"])

    # Display pairwise table
    print("\n Pairwise Plagiarism Report")
    print(tabulate(results, headers=["File 1", "File 2", "Similarity", "Status"], tablefmt="grid"))

    # Save CSV
    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["File 1", "File 2", "Similarity (%)", "Status"])
        writer.writerows(results)

    print(f"\n Report saved to '{OUTPUT_CSV}'")

    # Display flagged pairs
    if flagged:
        print("\n Flagged pairs (above threshold):")
        for f1, f2, sim in flagged:
            print(f"  - {os.path.basename(f1)} & {os.path.basename(f2)} → {sim:.2f}%")

    # Clustering summary (connected components)
    if len(G.nodes) > 0:
        print("\n Clustering Summary:")
        for idx, cluster in enumerate(nx.connected_components(G), 1):
            # Components are sets; sort so the listing is stable between runs.
            members = ', '.join(sorted(cluster))
            if len(cluster) > 1:
                print(f"  Group {idx}: {members}")
            else:
                print(f"  Group {idx}: {members} (unique/no strong matches)")

    # Empty files summary
    if empty_files:
        print("\n Empty files skipped:")
        for f in empty_files:
            print(f"  - {os.path.basename(f)}")

# Entry point: runs the report when this code is executed directly
# (as a script or as a notebook cell), but not when it is imported.
if __name__ == "__main__":
    main()



 Pairwise Plagiarism Report
+-----------------+-----------------+--------------+------------+
| File 1          | File 2          | Similarity   | Status     |
| Doc 1.docx      | Doc 2.docx      | 80.45%       | Flagged    |
+-----------------+-----------------+--------------+------------+
| Doc 1.docx      | Doc 3.docx      | 86.90%       | Flagged    |
+-----------------+-----------------+--------------+------------+
| Doc 1.docx      | Doc 4.docx      | 70.99%       | Flagged    |
+-----------------+-----------------+--------------+------------+
| Doc 1.docx      | Doc 5.docx      | 88.18%       | Flagged    |
+-----------------+-----------------+--------------+------------+
| Doc 1.docx      | Text File 1.txt | 89.29%       | Flagged    |
+-----------------+-----------------+--------------+------------+
| Doc 1.docx      | Text File 2.txt | 19.94%       | Acceptable |
+-----------------+-----------------+--------------+------------+
| Doc 1.docx      | Text File 3.txt | 75.32%   

In [3]:
pip install python-docx tabulate

Defaulting to user installation because normal site-packages is not writeable
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
Installing collected packages: python-docx
Successfully installed python-docx-1.2.0
Note: you may need to restart the kernel to use updated packages.


In [8]:
pip install scikit-learn PyMuPDF python-docx tabulate

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Collecting PyMuPDF
  Downloading pymupdf-1.26.4-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.4-cp39-abi3-win_amd64.whl (18.7 MB)
   ---------------------------------------- 0.0/18.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/18.7 MB ? eta -:--:--
    --------------------------------------- 0.3/18.7 MB ? eta -:--:--
    --------------------------------------- 0.3/18.7 MB ? eta -:--:--
   - -------------------------------------- 0.5/18.7 MB 607.7 kB/s eta 0:00:30
   - -------------------------------------- 0.5/18.7 MB 607.7 kB/s eta 0:00:30
   - -------------------------------------- 0.8/18.7 MB 594.3 kB/s eta 0:00:31
   - -------------------------------------- 0.8/18.7 MB 594.3 kB/s eta 0:00:31
   -- ------------------------------------- 1.0/18.7 MB 598.0 kB/s eta 0:00:30
   -- ------------------------



In [12]:
pip install sentence-transformers PyMuPDF python-docx tabulate

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.1-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.8.0-cp313-cp313-win_amd64.whl.metadata (30 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.35.1-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading safetensors-0.6.2-cp38-abi3-win_amd64.whl.metadata (4.1 kB)
Downloading sentence




   ------ --------------------------------- 1/6 [torch]
   ------ --------------------------------- 1/6 [torch]
   ------ --------------------------------- 1/6 [torch]
   ------ --------------------------------- 1/6 [torch]
   ------ --------------------------------- 1/6 [torch]
   ------------- -------------------------- 2/6 [huggingface-hub]
   ------------- -------------------------- 2/6 [huggingface-hub]
   ------------- -------------------------- 2/6 [huggingface-hub]
   ------------- -------------------------- 2/6 [huggingface-hub]
   ------------- -------------------------- 2/6 [huggingface-hub]
   ------------- -------------------------- 2/6 [huggingface-hub]
   ------------- -------------------------- 2/6 [huggingface-hub]
   ------------- -------------------------- 2/6 [huggingface-hub]
   ------------- -------------------------- 2/6 [huggingface-hub]
   ------------- -------------------------- 2/6 [huggingface-hub]
   ------------- -------------------------- 2/6 [huggingfac