In [8]:
pip install scikit-learn PyMuPDF python-docx tabulate

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.

Collecting PyMuPDF
  Downloading pymupdf-1.26.4-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.4-cp39-abi3-win_amd64.whl (18.7 MB)
   ---------------------------------------- 0.0/18.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/18.7 MB ? eta -:--:--
    --------------------------------------- 0.3/18.7 MB ? eta -:--:--
    --------------------------------------- 0.3/18.7 MB ? eta -:--:--
   - -------------------------------------- 0.5/18.7 MB 607.7 kB/s eta 0:00:30
   - -------------------------------------- 0.5/18.7 MB 607.7 kB/s eta 0:00:30
   - -------------------------------------- 0.8/18.7 MB 594.3 kB/s eta 0:00:31
   - -------------------------------------- 0.8/18.7 MB 594.3 kB/s eta 0:00:31
   -- ------------------------------------- 1.0/18.7 MB 598.0 kB/s eta 0:00:30
   -- ------------------------



In [3]:
pip install python-docx tabulate

Defaulting to user installation because normal site-packages is not writeable
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
Installing collected packages: python-docx
Successfully installed python-docx-1.2.0
Note: you may need to restart the kernel to use updated packages.


In [10]:
import os
import itertools
import csv
import fitz  # PyMuPDF
from docx import Document
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tabulate import tabulate

# ---------- CONFIG ----------
# Folder holding the documents to compare. Set the PLAGIARISM_FOLDER
# environment variable to point the notebook at another folder without
# editing this cell; the original machine-specific path remains the default.
FOLDER_PATH = os.environ.get(
    "PLAGIARISM_FOLDER",
    r"C:\Users\HP\Desktop\New folder (2)\Check Plag",
)
# Pairs whose similarity is strictly greater than this percentage are flagged.
SIMILARITY_THRESHOLD = 20  # percent
# Path the CSV report is written to.
OUTPUT_CSV = "plagiarism_report.csv"

# ---------- FUNCTIONS ----------
def read_file(filepath):
    """Return the text content of a TXT, DOCX, or PDF file.

    The format is picked from the file extension, compared
    case-insensitively. Any other extension yields an empty string.
    """
    lowered = filepath.lower()

    if lowered.endswith(".txt"):
        # Bytes that are not valid UTF-8 are dropped instead of raising.
        with open(filepath, "r", encoding="utf-8", errors="ignore") as handle:
            return handle.read()

    if lowered.endswith(".docx"):
        # One line per entry in the document's paragraph list.
        paragraphs = Document(filepath).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)

    if lowered.endswith(".pdf"):
        # Concatenate the text of each page in order, with no separator.
        with fitz.open(filepath) as pdf:
            return "".join(page.get_text() for page in pdf)

    return ""

def clean_text(text):
    """Lower-case the text and collapse every whitespace run to one space."""
    # split() with no argument already discards leading/trailing whitespace.
    words = text.lower().split()
    return " ".join(words)

def scan_folder(folder):
    """Return paths of the .txt, .docx and .pdf files directly inside ``folder``.

    Temp/hidden entries (names starting with ``~$`` or ``.``) are skipped, and
    so are directories, even when their name ends in a supported extension.
    Names are sorted so the report rows come out in the same order on every
    run; ``os.listdir`` itself returns entries in arbitrary order.

    Args:
        folder: Directory to scan. Sub-directories are not searched.

    Returns:
        List of ``os.path.join(folder, name)`` paths, sorted by file name.

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder cannot be listed.
    """
    valid_ext = ('.txt', '.docx', '.pdf')
    files = []
    for name in sorted(os.listdir(folder)):
        # Skip temp or hidden files
        if name.startswith(('~$', '.')):
            continue
        if not name.lower().endswith(valid_ext):
            continue
        path = os.path.join(folder, name)
        # A directory called e.g. "old.pdf" cannot be read as a document.
        if os.path.isfile(path):
            files.append(path)
    return files

def calculate_similarity_tfidf(texts):
    """Return the pairwise cosine-similarity matrix of the texts' TF-IDF vectors."""
    # Fit a default-configured vectorizer on the texts and vectorize them in one step.
    weighted_terms = TfidfVectorizer().fit_transform(texts)
    # Entry [i, j] is the similarity between texts[i] and texts[j].
    return cosine_similarity(weighted_terms)

# ---------- MAIN ----------
def main():
    """Compare every pair of documents in FOLDER_PATH and report similarity.

    Reads and normalizes each file found by ``scan_folder``, computes the
    TF-IDF cosine similarity for all pairs, prints a grid table, writes the
    same rows to OUTPUT_CSV, and lists the pairs whose similarity is strictly
    above SIMILARITY_THRESHOLD.

    Files that yield no text are reported and left out of the comparison
    rather than silently scoring 0% against everything. The function returns
    early, without writing a report, when fewer than two comparable documents
    remain or when the TF-IDF step raises ``ValueError``.
    """
    files = scan_folder(FOLDER_PATH)
    if len(files) < 2:
        print("⚠️ Need at least 2 files in the folder.")
        return

    # Read and clean texts, keeping each text paired with its file.
    documents = [(f, clean_text(read_file(f))) for f in files]

    # A document with no extractable text would otherwise show up as
    # "Acceptable" at 0%, which hides the fact that it was never checked.
    unreadable = [f for f, text in documents if not text]
    if unreadable:
        print("⚠️ Skipping files with no extractable text:")
        for f in unreadable:
            print(f"  - {os.path.basename(f)}")

    documents = [(f, text) for f, text in documents if text]
    if len(documents) < 2:
        print("⚠️ Need at least 2 files with readable text to compare.")
        return

    files = [f for f, _ in documents]
    texts = [text for _, text in documents]

    # TfidfVectorizer raises ValueError when no usable token is found in any
    # document (empty vocabulary); report that instead of a traceback.
    try:
        similarity_matrix = calculate_similarity_tfidf(texts)
    except ValueError as exc:
        print(f"⚠️ Could not compute similarity: {exc}")
        return

    results = []
    flagged = []

    # Compare all pairs
    for (i, f1), (j, f2) in itertools.combinations(enumerate(files), 2):
        sim = similarity_matrix[i, j] * 100
        is_flagged = sim > SIMILARITY_THRESHOLD
        status = "❌ Flagged" if is_flagged else "✅ Acceptable"
        results.append([os.path.basename(f1), os.path.basename(f2), f"{sim:.2f}%", status])
        if is_flagged:
            flagged.append((f1, f2, sim))

    # Display table
    print("\nPlagiarism Report")
    print(tabulate(results, headers=["File 1", "File 2", "Similarity", "Status"], tablefmt="grid"))

    # Save CSV
    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["File 1", "File 2", "Similarity (%)", "Status"])
        writer.writerows(results)

    print(f"\n📄 Report saved to '{OUTPUT_CSV}'")
    if flagged:
        print("\n🚨 Flagged pairs (above threshold):")
        for f1, f2, sim in flagged:
            print(f"  - {os.path.basename(f1)} & {os.path.basename(f2)} → {sim:.2f}%")

# Run the report only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()



Plagiarism Report
+------------+-----------+--------------+---------------+
| File 1     | File 2    | Similarity   | Status        |
+============+===========+==============+===============+
| Doc 1.docx | file1.txt | 19.94%       | ✅ Acceptable |
+------------+-----------+--------------+---------------+
| Doc 1.docx | file2.txt | 25.49%       | ❌ Flagged    |
+------------+-----------+--------------+---------------+
| file1.txt  | file2.txt | 50.31%       | ❌ Flagged    |
+------------+-----------+--------------+---------------+

📄 Report saved to 'plagiarism_report.csv'

🚨 Flagged pairs (above threshold):
  - Doc 1.docx & file2.txt → 25.49%
  - file1.txt & file2.txt → 50.31%
