In [5]:
!pip install /kaggle/input/pymupdf-wheels/pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl

Processing /kaggle/input/pymupdf-wheels/pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl
Installing collected packages: pymupdf
Successfully installed pymupdf-1.26.4


In [6]:
!pip install pillow



In [7]:
import os
import fitz  # PyMuPDF

def convert_pdf_folder_to_images(pdf_folder, out_folder, img_size=(256,256)):
    """Render every page of every PDF directly inside ``pdf_folder`` to a PNG file.

    Output files are written to ``out_folder`` (created if missing) and named
    ``<pdf name without extension>_<page index>.png``.

    Args:
        pdf_folder: Directory that is scanned (non-recursively) for ``*.pdf`` files.
        out_folder: Directory that receives the rendered PNG files.
        img_size: Currently unused. Pages are saved at their rendered size; the
            parameter is kept so existing callers keep working.

    Returns:
        None. One status line is printed per PDF. A failure while converting
        one PDF is reported and does not stop the remaining files.
    """
    os.makedirs(out_folder, exist_ok=True)

    # Zoom factor applied on both axes; PyMuPDF renders at 72 dpi by default,
    # so 200/72 yields roughly 200 dpi output.
    zoom = 200/72

    # Sorted so files are processed (and reported) in a deterministic order.
    for pdf_file in sorted(os.listdir(pdf_folder)):
        if not pdf_file.lower().endswith('.pdf'):
            continue

        pdf_path = os.path.join(pdf_folder, pdf_file)
        stem = os.path.splitext(pdf_file)[0]
        try:
            mat = fitz.Matrix(zoom, zoom)
            # Context manager closes the document even if a page fails to render.
            with fitz.open(pdf_path) as doc:
                for i, page in enumerate(doc):
                    pix = page.get_pixmap(matrix=mat)
                    pix.save(os.path.join(out_folder, f"{stem}_{i}.png"))
            print(f"✅ Converted: {pdf_file}")
        except Exception as e:
            # Best effort: report the failure and continue with the next PDF.
            print("❌ Failed to convert:", pdf_file, "|", e)

# Example usage
# Source folders holding the untouched PDFs; each one is mirrored as a
# sub-folder of /kaggle/working/Original carrying the same folder name.
original_folders = [
    "/kaggle/input/ai-trace-finder/data/Originals/official",
    "/kaggle/input/ai-trace-finder/data/Originals/wikipedia",
]

for folder in original_folders:
    subfolder_name = os.path.basename(folder)
    out_folder = os.path.join("/kaggle/working/Original", subfolder_name)
    convert_pdf_folder_to_images(pdf_folder=folder, out_folder=out_folder)


✅ Converted: 75.pdf
✅ Converted: 23.pdf
✅ Converted: 73.pdf
✅ Converted: 41.pdf
✅ Converted: 25.pdf
✅ Converted: 31.pdf
✅ Converted: 45.pdf
✅ Converted: 98.pdf
✅ Converted: 30.pdf
✅ Converted: 49.pdf
✅ Converted: 24.pdf
✅ Converted: 63.pdf
✅ Converted: 36.pdf
✅ Converted: 15.pdf
✅ Converted: 89.pdf
✅ Converted: 5.pdf
✅ Converted: 93.pdf
✅ Converted: 56.pdf
✅ Converted: 29.pdf
✅ Converted: 8.pdf
✅ Converted: 86.pdf
✅ Converted: 14.pdf
✅ Converted: 17.pdf
✅ Converted: 27.pdf
✅ Converted: 77.pdf
✅ Converted: 20.pdf
✅ Converted: 81.pdf
✅ Converted: 58.pdf
✅ Converted: 55.pdf
✅ Converted: 39.pdf
✅ Converted: 26.pdf
✅ Converted: 79.pdf
✅ Converted: 64.pdf
✅ Converted: 95.pdf
✅ Converted: 91.pdf
✅ Converted: 38.pdf
✅ Converted: 12.pdf
✅ Converted: 9.pdf
✅ Converted: 7.pdf
✅ Converted: 47.pdf
✅ Converted: 40.pdf
✅ Converted: 66.pdf
✅ Converted: 4.pdf
✅ Converted: 74.pdf
✅ Converted: 78.pdf
✅ Converted: 54.pdf
✅ Converted: 44.pdf
✅ Converted: 19.pdf
✅ Converted: 100.pdf
✅ Converted: 6.pdf
✅ Con

In [8]:
import fitz  # PyMuPDF for PDFs
from PIL import Image, ImageSequence
import os

def convert_tampered_to_images(tampered_folder, out_folder, img_size=(256, 256)):
    """
    Converts all PDFs, TIFFs, and image files in tampered_folder (including subfolders)
    into RGB PNG images resized to img_size. Preserves folder structure.

    Args:
        tampered_folder: Root directory that is walked recursively.
        out_folder: Root directory for the output; every sub-directory of
            ``tampered_folder`` is recreated below it.
        img_size: ``(width, height)`` passed to ``Image.resize`` for every output image.

    Output naming:
        * PDF / TIFF pages -> ``<name without extension>_<page index>.png``
        * PNG / JPG / JPEG -> ``<name without extension>.png``

    Files with any other extension (or none) are reported and skipped. A
    failure on one file is printed and does not stop the remaining files.
    """
    for root, dirs, files in os.walk(tampered_folder):
        # Preserve folder structure in the output
        relative_path = os.path.relpath(root, tampered_folder)
        target_dir = os.path.join(out_folder, relative_path)
        os.makedirs(target_dir, exist_ok=True)

        for fname in files:
            fpath = os.path.join(root, fname)
            # splitext handles extensions of any length ("tiff", "jpeg") and
            # yields an empty extension for names without a dot.
            stem, dot_ext = os.path.splitext(fname)
            ext = dot_ext.lower().lstrip(".")

            try:
                # ---- Case 1: PDF using fitz ----
                if ext == "pdf":
                    # Context manager closes the document even if a page fails.
                    with fitz.open(fpath) as doc:
                        for i, page in enumerate(doc):
                            pix = page.get_pixmap()
                            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                            img = img.resize(img_size)
                            img.save(os.path.join(target_dir, f"{stem}_{i}.png"))

                # ---- Case 2: TIF / TIFF (possibly multi-frame) ----
                elif ext in ("tif", "tiff"):
                    with Image.open(fpath) as img:
                        for i, page in enumerate(ImageSequence.Iterator(img)):
                            page = page.convert("RGB").resize(img_size)
                            page.save(os.path.join(target_dir, f"{stem}_{i}.png"))

                # ---- Case 3: PNG / JPG / JPEG ----
                elif ext in ("png", "jpg", "jpeg"):
                    with Image.open(fpath) as img:
                        converted = img.convert("RGB").resize(img_size)
                    converted.save(os.path.join(target_dir, stem + ".png"))

                else:
                    print("Skipping unsupported file:", fname)

            except Exception as e:
                # Best effort: report the failure and continue with the next file.
                print("Failed to convert:", fname, "|", e)

# Example usage
# Input tree with the tampered samples and the root that receives the PNGs.
tampered_folder, out_tampered = (
    "/kaggle/input/ai-trace-finder/data/tampered",
    "/kaggle/working/Tampered",
)

convert_tampered_to_images(tampered_folder=tampered_folder, out_folder=out_tampered)




In [10]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Image transformations (resize, normalize, etc.)
# Every image is resized to img_size x img_size and then converted to a tensor.
img_size = 256
transform = transforms.Compose(
    [transforms.Resize((img_size, img_size)), transforms.ToTensor()]
)

# ---- Original Dataset ----
# ImageFolder is pointed at the folder of converted originals.
orig_dataset = datasets.ImageFolder("/kaggle/working/Original", transform=transform)
orig_loader = DataLoader(dataset=orig_dataset, shuffle=True, batch_size=32)

# ---- Tampered Dataset ----
# Same pipeline, pointed at the folder of converted tampered samples.
tampered_dataset = datasets.ImageFolder("/kaggle/working/Tampered", transform=transform)
tampered_loader = DataLoader(dataset=tampered_dataset, shuffle=True, batch_size=32)


In [11]:
import os
import glob
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Use the GPU when torch reports one as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)


Using device: cuda


In [12]:
class ImageDataset(Dataset):
    """Dataset over parallel sequences of image file paths and labels.

    Item ``idx`` is the image at ``paths[idx]`` opened with PIL, converted to
    RGB, resized to ``img_size`` and (if a transform was given) passed through
    ``transform``, paired with ``labels[idx]``.
    """

    def __init__(self, paths, labels, transform=None, img_size=(256,256)):
        # paths and labels are indexed with the same idx in __getitem__.
        self.paths, self.labels = paths, labels
        self.transform, self.img_size = transform, img_size

    def __len__(self):
        # One sample per stored path.
        return len(self.paths)

    def __getitem__(self, idx):
        rgb_image = Image.open(self.paths[idx]).convert("RGB")
        sample = rgb_image.resize(self.img_size)
        if self.transform:
            sample = self.transform(sample)
        return sample, self.labels[idx]


In [13]:
# Folders
orig_folder = "/kaggle/working/Original"
tampered_folder = "/kaggle/working/Tampered"


def _list_subdirs(parent):
    """Return the sorted names of the immediate sub-directories of ``parent``."""
    return sorted(
        name for name in os.listdir(parent)
        if os.path.isdir(os.path.join(parent, name))
    )


# Get subfolders and map to labels.
# Only directories are considered, so a stray file next to the class folders
# no longer crashes the listing below. dict.fromkeys drops duplicate folder
# names (keeping first-seen order) so the labels stay contiguous: 0..n-1.
all_folders = _list_subdirs(orig_folder) + _list_subdirs(tampered_folder)
label_map = {name: i for i, name in enumerate(dict.fromkeys(all_folders))}

# Collect image paths and labels (one label per sub-folder name).
all_paths = []
all_labels = []

for parent in (orig_folder, tampered_folder):
    for folder in _list_subdirs(parent):
        folder_path = os.path.join(parent, folder)
        # Sorted for a reproducible sample order; nested directories are
        # skipped because only regular files can be opened as images later.
        for f in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, f)
            if os.path.isfile(file_path):
                all_paths.append(file_path)
                all_labels.append(label_map[folder])

# Train/test split: 80/20, stratified by label, fixed seed.
train_paths, test_paths, train_labels, test_labels = train_test_split(
    all_paths, all_labels, test_size=0.2, random_state=42, stratify=all_labels
)

# Transform for CNN: tensor conversion followed by per-channel normalisation
# (these are the commonly used ImageNet mean/std values).
cnn_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])

train_dataset = ImageDataset(train_paths, train_labels, transform=cnn_transform)
test_dataset = ImageDataset(test_paths, test_labels, transform=cnn_transform)

# Only the training loader shuffles; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

print("Number of classes:", len(label_map))


Number of classes: 5


In [14]:
def train_cnn(model, dataloader, criterion, optimizer, epochs=5, device=None):
    """Train ``model`` for ``epochs`` passes over ``dataloader``.

    Args:
        model: Module to train; it is switched to training mode.
        dataloader: Iterable yielding ``(imgs, labels)`` tensor batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer whose ``zero_grad``/``step`` are called once per batch.
        epochs: Number of passes over ``dataloader``.
        device: Device the batches are moved to. When ``None`` (default) the
            device of the model's first parameter is used, so the function
            does not depend on a notebook-level ``device`` variable.

    Returns:
        None. The mean batch loss of every epoch is printed (``nan`` when the
        dataloader yielded no batches).
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else "cpu"

    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        num_batches = 0
        for imgs, labels in dataloader:
            imgs, labels = imgs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(imgs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1
        # Guard against an empty dataloader instead of dividing by zero.
        avg_loss = running_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

def evaluate_cnn(model, dataloader, device=None):
    """Print the classification accuracy of ``model`` over ``dataloader``.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        dataloader: Iterable yielding ``(imgs, labels)`` tensor batches.
        device: Device the image batches are moved to. When ``None`` (default)
            the device of the model's first parameter is used, so the function
            does not depend on a notebook-level ``device`` variable.

    Returns:
        None. The accuracy (computed with ``accuracy_score``) is printed.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else "cpu"

    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for imgs, labels in dataloader:
            imgs = imgs.to(device)
            outputs = model(imgs)
            # Predicted class = index of the largest output per sample.
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            all_preds.extend(preds)
            # .cpu() keeps this working if a loader yields labels on another device.
            all_labels.extend(labels.cpu().numpy())
    acc = accuracy_score(all_labels, all_preds)
    print("CNN Accuracy:", acc)


In [20]:
from torch.utils.data import Dataset
from PIL import Image
import os

class ImagePathDataset(Dataset):
    """Dataset of every image file found recursively below a set of root folders.

    All images below one root folder share a single label, looked up in
    ``label_map`` by the root folder's own name.
    """

    # File extensions (matched case-insensitively) that count as images.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff')

    def __init__(self, root_folders, label_map, transform=None, img_size=(256,256)):
        """
        root_folders: list of top-level folders, e.g., ["/kaggle/working/Original", "/kaggle/working/Tampered"]
        label_map: dict mapping top-level folder names to labels, e.g., {"Original":0, "Tampered":1}
        transform: optional callable applied to the resized RGB image in __getitem__
        img_size: (width, height) passed to Image.resize in __getitem__

        Raises KeyError if a root folder's name is missing from label_map.
        """
        self.paths = []
        self.labels = []
        self.transform = transform
        self.img_size = img_size

        for root_folder in root_folders:
            # normpath drops a trailing separator, which would otherwise make
            # basename() return '' and the label lookup fail.
            folder_name = os.path.basename(os.path.normpath(root_folder))
            top_label = label_map[folder_name]
            # Walk recursively
            for dirpath, dirnames, filenames in os.walk(root_folder):
                # Sorting in place makes os.walk descend in a deterministic
                # order, so index-based splits are reproducible.
                dirnames.sort()
                for f in sorted(filenames):
                    if f.lower().endswith(self.IMAGE_EXTENSIONS):
                        self.paths.append(os.path.join(dirpath, f))
                        self.labels.append(top_label)

    def __len__(self):
        """Number of image files collected."""
        return len(self.paths)

    def __getitem__(self, idx):
        """Return (image, label): RGB image resized to img_size, transformed if a transform is set."""
        img_path = self.paths[idx]
        img = Image.open(img_path).convert("RGB").resize(self.img_size)
        if self.transform:
            img = self.transform(img)
        label = self.labels[idx]
        return img, label


In [22]:
# Binary labelling by top-level folder name.
label_map = dict(Original=0, Tampered=1)

# One dataset over both output trees; each root folder's name selects the label.
dataset = ImagePathDataset(
    ["/kaggle/working/Original", "/kaggle/working/Tampered"],
    label_map,
    transform=cnn_transform,
    img_size=(256, 256),
)

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Subset

# Split sample indices 80/20, stratified on the dataset labels, with a fixed seed.
all_indices = list(range(len(dataset)))
train_idx, test_idx = train_test_split(
    all_indices,
    test_size=0.2,
    random_state=42,
    stratify=dataset.labels,
)

# Index-based views onto the single underlying dataset.
train_dataset, test_dataset = Subset(dataset, train_idx), Subset(dataset, test_idx)

# Only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=16)


In [23]:
# ResNet18 built without pretrained weights (weights=None); its final
# fully connected layer is replaced so it outputs one score per label.
num_classes = len(label_map)
resnet18_model = models.resnet18(weights=None)
resnet18_model.fc = nn.Linear(
    in_features=resnet18_model.fc.in_features,
    out_features=num_classes,
)
resnet18_model = resnet18_model.to(device)

# Cross-entropy loss optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=resnet18_model.parameters(), lr=1e-4)

print("Training ResNet18...")
train_cnn(
    model=resnet18_model,
    dataloader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=5,
)

print("Evaluating ResNet18...")
evaluate_cnn(model=resnet18_model, dataloader=test_loader)


Training ResNet18...
Epoch 1, Loss: 0.4320
Epoch 2, Loss: 0.1778
Epoch 3, Loss: 0.1035
Epoch 4, Loss: 0.0308
Epoch 5, Loss: 0.0246
Evaluating ResNet18...
CNN Accuracy: 0.8446601941747572


In [24]:
def extract_features(model, dataloader, device=None):
    """Run ``model`` up to (and including) its ``avgpool`` stage and collect the outputs.

    The stages ``conv1, bn1, relu, maxpool, layer1..layer4, avgpool`` are
    applied in that order; the model's final ``fc`` layer is not applied. The
    result is flattened to one vector per sample.

    Args:
        model: Module exposing the stage attributes listed above; it is
            switched to eval mode.
        dataloader: Iterable yielding ``(imgs, labels)`` tensor batches.
        device: Device the image batches are moved to. When ``None`` (default)
            the device of the model's first parameter is used, so the function
            does not depend on a notebook-level ``device`` variable.

    Returns:
        ``(features, labels)``: two lists of equal length holding one NumPy
        feature vector and one NumPy label scalar per sample, in the order the
        dataloader produced them.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else "cpu"

    model.eval()
    features = []
    labels = []
    with torch.no_grad():
        for imgs, lbls in dataloader:
            x = imgs.to(device)
            # Apply the stages one by one, stopping before the classifier head.
            for stage in (
                model.conv1, model.bn1, model.relu, model.maxpool,
                model.layer1, model.layer2, model.layer3, model.layer4,
                model.avgpool,
            ):
                x = stage(x)
            x = torch.flatten(x, 1).cpu().numpy()
            features.extend(x)
            # .cpu() keeps this working if a loader yields labels on another device.
            labels.extend(lbls.cpu().numpy())
    return features, labels

# Features and labels come back together per batch, so they stay aligned
# even though train_loader shuffles.
train_feats, train_lbls = extract_features(model=resnet18_model, dataloader=train_loader)
test_feats, test_lbls = extract_features(model=resnet18_model, dataloader=test_loader)


In [25]:
# SVM with a linear kernel, fitted on the extracted training features.
svm_model = SVC(kernel='linear')
svm_model.fit(train_feats, train_lbls)
svm_preds = svm_model.predict(test_feats)
svm_accuracy = accuracy_score(test_lbls, svm_preds)
print("SVM Accuracy:", svm_accuracy)

# Random Forest: 100 trees, fixed seed, same features.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(train_feats, train_lbls)
rf_preds = rf_model.predict(test_feats)
rf_accuracy = accuracy_score(test_lbls, rf_preds)
print("Random Forest Accuracy:", rf_accuracy)


SVM Accuracy: 1.0
Random Forest Accuracy: 0.9902912621359223


In [29]:
import joblib
# Persist the fitted SVM to the Kaggle working directory.
joblib.dump(value=svm_model, filename="/kaggle/working/svm_model.pkl")

['/kaggle/working/svm_model.pkl']

In [30]:
# Two artifacts are written, both under /kaggle/working:
#   1) only the weights (the model's state_dict)
resnet18_state = resnet18_model.state_dict()
torch.save(obj=resnet18_state, f="/kaggle/working/resnet18_weights.pth")

#   2) the entire model object (bigger file, includes architecture)
torch.save(obj=resnet18_model, f="/kaggle/working/resnet18_full.pth")
