In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Summarise the input tree per directory instead of echoing every single path:
# image datasets contain thousands of files and would flood the cell output.
MAX_FILES_SHOWN = 5  # number of example paths printed for each directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    print(f"{dirname}/ ({len(filenames)} files)")
    # Sorted so the examples shown do not depend on filesystem listing order.
    for filename in sorted(filenames)[:MAX_FILES_SHOWN]:
        print("   ", os.path.join(dirname, filename))
    if len(filenames) > MAX_FILES_SHOWN:
        print(f"    ... and {len(filenames) - MAX_FILES_SHOWN} more")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# -------------------------
# Step 1: Import Libraries
# -------------------------
import copy
import os
import random
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from tqdm.auto import tqdm


In [None]:
# -------------------------
# Step 2: Configurations
# -------------------------
class CFG:
    """Central place for every tunable setting used by this notebook."""

    seed = 42            # shared by the Python/NumPy/PyTorch RNGs and the train/valid split
    img_size = 128       # a small input size is a good starting point when training a CNN from scratch
    batch_size = 32
    lr = 1e-3            # initial learning rate handed to the optimizer
    epochs = 10
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # The dataset root is spelled once so the train and test paths cannot drift apart.
    data_root = "/kaggle/input/recodai-luc-scientific-image-forgery-detection"
    train_dir = f"{data_root}/train_images"
    test_dir = f"{data_root}/test_images"
    valid_frac = 0.15    # fraction of the training images held out for validation

In [None]:
# reproducibility
def set_seed(seed=CFG.seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed to every generator; defaults to ``CFG.seed``.
    """
    # The three CPU-side generators all take the seed as their only argument.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # The CUDA generators are only seeded when a GPU is actually available.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


# Seed everything once, before any dataset split or model is created.
set_seed()

print("Device:", CFG.device)


In [None]:
# Step 3: Custom Dataset
# -------------------------
class ForgeryDataset(Dataset):
    """Binary image dataset read from one sub-folder per class.

    Every entry found in ``<root_dir>/authentic`` becomes a sample with label 0
    and every entry found in ``<root_dir>/forged`` a sample with label 1.

    Args:
        root_dir: Directory that holds the two class sub-folders.
        transform: Optional callable applied to the RGB ``PIL.Image`` before it
            is returned from ``__getitem__``.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.images = []   # file paths, parallel to self.labels
        self.labels = []   # 0 = authentic, 1 = forged
        for label, cls in enumerate(["authentic", "forged"]):
            cls_dir = os.path.join(root_dir, cls)
            # glob() yields entries in arbitrary filesystem order; sorting makes
            # the sample order (and therefore any seeded split) reproducible.
            for img_path in sorted(glob(os.path.join(cls_dir, "*"))):
                self.images.append(img_path)
                self.labels.append(label)

    def __len__(self):
        """Return the number of samples that were discovered."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``, with the transform applied if set."""
        img_path = self.images[idx]
        label = self.labels[idx]
        # Open inside a context manager so the file handle is released right
        # away; convert() loads the pixels and returns a separate image object.
        with Image.open(img_path) as img_file:
            img = img_file.convert("RGB")
        if self.transform:
            img = self.transform(img)
        return img, label

In [None]:
# -------------------------
# Step 4: Transform & DataLoader
# -------------------------
# Training pipeline: resize, light augmentation, then tensor conversion and
# per-channel normalisation.
train_transform = transforms.Compose([
    transforms.Resize((CFG.img_size,CFG.img_size)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
    transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
])

# Validation pipeline: identical preprocessing but without any random augmentation.
valid_transform = transforms.Compose([
    transforms.Resize((CFG.img_size,CFG.img_size)),
    transforms.ToTensor(),
    transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
])

dataset = ForgeryDataset(CFG.train_dir, transform=train_transform)
total = len(dataset)
val_size = int(total*CFG.valid_frac)
train_size = total - val_size
train_ds, valid_split = random_split(dataset, [train_size, val_size],
                                     generator=torch.Generator().manual_seed(CFG.seed))

# Both subsets returned by random_split wrap the SAME `dataset` object, so
# assigning `valid_ds.dataset.transform = valid_transform` would also remove the
# augmentations from the training subset. Instead, give validation its own
# shallow copy (it shares the image/label lists, so the indices stay valid) and
# set the transform on that copy only.
valid_view = copy.copy(dataset)
valid_view.transform = valid_transform
valid_ds = torch.utils.data.Subset(valid_view, valid_split.indices)

# Guard against the two splits ever sharing a transform again.
assert train_ds.dataset.transform is train_transform
assert valid_ds.dataset.transform is valid_transform
assert len(train_ds) + len(valid_ds) == total

train_loader = DataLoader(train_ds, batch_size=CFG.batch_size, shuffle=True, num_workers=2, pin_memory=True)
valid_loader = DataLoader(valid_ds, batch_size=CFG.batch_size, shuffle=False, num_workers=2, pin_memory=True)

print("Train size:", len(train_ds), "| Valid size:", len(valid_ds))

In [None]:
# -------------------------
# Step 5: Visualize 20 Images
# -------------------------
def show_samples(loader, n=20):
    """Plot up to ``n`` images from the first batch of ``loader`` with their class names.

    Args:
        loader: Iterable yielding ``(images, labels)`` batches. Each image is
            treated as a channel-first tensor that was normalised with the
            per-channel mean/std used by this notebook's transforms.
        n: Maximum number of images to draw; capped at the size of the batch.
    """
    images, labels = next(iter(loader))
    # Never index past the end of the batch (e.g. when batch_size < n).
    n = min(n, len(images))
    if n <= 0:
        return

    # Undo Normalize() with the statistics of every channel, not just the first.
    mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    std = np.array([0.229, 0.224, 0.225], dtype=np.float32)

    cols = 10
    rows = -(-n // cols)  # ceiling division so any n fits in the grid
    plt.figure(figsize=(15, 3 * rows))
    for i in range(n):
        plt.subplot(rows, cols, i + 1)
        img = images[i].permute(1,2,0).numpy()  # channel-first -> channel-last for imshow
        img = np.clip(img * std + mean, 0, 1)   # broadcasts over the trailing channel axis
        plt.imshow(img)
        plt.axis('off')
        plt.title("authentic" if labels[i]==0 else "forged",fontsize=8)
    plt.show()

show_samples(train_loader)

In [None]:
# -------------------------
# Step 6: CNN Model (from scratch)
# -------------------------
class SimpleCNN(nn.Module):
    """Three conv -> ReLU -> max-pool stages followed by a two-layer classifier head.

    ``forward`` returns one raw logit per input image; no sigmoid is applied
    inside the network.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.dropout = nn.Dropout(p=0.3)
        # Each of the three 2x2 poolings halves the spatial size, hence // 8.
        side = CFG.img_size // 8
        self.fc1 = nn.Linear(128 * side * side, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x):
        """Map a batch of images to a batch of single logits."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = self.pool(F.relu(conv(x)))
        # Collapse everything except the batch dimension for the linear layers.
        x = x.view(x.shape[0], -1)
        hidden = self.dropout(F.relu(self.fc1(x)))
        return self.fc2(hidden)


model = SimpleCNN().to(CFG.device)
print(model)


In [None]:
# -------------------------
# Step 7: Loss & Optimizer
# -------------------------
# The model emits raw logits, so the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=CFG.lr)
# mode='max' because the scheduler is stepped with a score where higher is better.
# The `verbose` argument is deliberately not passed: it is deprecated in
# torch.optim.lr_scheduler (and may be rejected outright by newer releases).
# The current learning rate can be read from optimizer.param_groups[0]["lr"].
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)

# -------------------------

In [None]:
# Step 8: Training & Validation Functions
# -------------------------
def train_epoch(model, loader, optimizer, criterion, device):
    """Run one optimisation pass over ``loader`` and report training metrics.

    Args:
        model: Network to train; called on each image batch to produce logits.
        loader: Iterable of ``(imgs, labels)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batches are moved to.

    Returns:
        ``(mean of the per-batch losses, accuracy, F1)``, where predictions are
        the sigmoid outputs thresholded at 0.5.
    """
    model.train()
    batch_losses = []
    probs, targets = [], []
    for imgs, labels in tqdm(loader, leave=False):
        imgs = imgs.to(device)
        # Float labels with an added axis at position 1, matching what the loss is fed.
        labels = labels.float().unsqueeze(1).to(device)

        optimizer.zero_grad()
        logits = model(imgs)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        probs.extend(torch.sigmoid(logits).detach().cpu().numpy().ravel())
        targets.extend(labels.detach().cpu().numpy().ravel())

    hard_preds = [int(p > 0.5) for p in probs]
    return (
        np.mean(batch_losses),
        accuracy_score(targets, hard_preds),
        f1_score(targets, hard_preds),
    )

def valid_epoch(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without computing gradients.

    Args:
        model: Network to evaluate; switched to eval mode first.
        loader: Iterable of ``(imgs, labels)`` batches.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batches are moved to.

    Returns:
        ``(mean of the per-batch losses, accuracy, F1, targets, predictions)``.
        ``targets`` and ``predictions`` are flat lists covering every sample
        seen, with predictions being the sigmoid outputs thresholded at 0.5.
    """
    model.eval()
    batch_losses = []
    probs, targets = [], []
    with torch.no_grad():
        for imgs, labels in tqdm(loader, leave=False):
            imgs = imgs.to(device)
            # Float labels with an added axis at position 1, matching what the loss is fed.
            labels = labels.float().unsqueeze(1).to(device)

            logits = model(imgs)
            batch_losses.append(criterion(logits, labels).item())
            probs.extend(torch.sigmoid(logits).cpu().numpy().ravel())
            targets.extend(labels.cpu().numpy().ravel())

    hard_preds = [int(p > 0.5) for p in probs]
    accuracy = accuracy_score(targets, hard_preds)
    f1 = f1_score(targets, hard_preds)
    return np.mean(batch_losses), accuracy, f1, targets, hard_preds

In [None]:
# Step 9: Training Loop
# -------------------------
# Per-epoch history consumed by the plotting cells further down.
train_losses, valid_losses, train_f1s, valid_f1s = [],[],[],[]
# Start below any achievable score so the first epoch always writes a
# checkpoint, even when the validation F1 is exactly 0.
best_f1 = float("-inf")

for epoch in range(CFG.epochs):
    print(f"\nEpoch {epoch+1}/{CFG.epochs}")
    # Log the learning rate in effect for this epoch so scheduler reductions are visible.
    print(f"LR: {optimizer.param_groups[0]['lr']:.2e}")
    train_loss, train_acc, train_f1 = train_epoch(model, train_loader, optimizer, criterion, CFG.device)
    valid_loss, valid_acc, valid_f1, valid_targets, valid_preds = valid_epoch(model, valid_loader, criterion, CFG.device)
    # The scheduler is driven by the validation F1 of the epoch that just finished.
    scheduler.step(valid_f1)

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    train_f1s.append(train_f1)
    valid_f1s.append(valid_f1)

    print(f"Train Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}")
    print(f"Valid Loss: {valid_loss:.4f} | Acc: {valid_acc:.4f} | F1: {valid_f1:.4f}")

    # Keep only the weights of the best-scoring epoch on disk.
    if valid_f1>best_f1:
        best_f1 = valid_f1
        torch.save(model.state_dict(),"best_model_cnn.pth")
        print(">> Saved best model!")

In [None]:
# Step 10: Loss Curve & F1 Curve
# -------------------------
# One figure per metric; each entry is (figure title, ((legend label, per-epoch values), ...)).
curve_specs = [
    ("Loss Curve", (("train_loss", train_losses), ("valid_loss", valid_losses))),
    ("F1 Curve", (("train_f1", train_f1s), ("valid_f1", valid_f1s))),
]

for curve_title, curve_series in curve_specs:
    plt.figure(figsize=(8, 4))
    for series_label, series_values in curve_series:
        plt.plot(series_values, label=series_label)
    plt.legend()
    plt.title(curve_title)
    plt.show()

In [None]:
# -------------------------
# Step 11: Confusion Matrix (Validation)
# -------------------------
# valid_targets / valid_preds are whatever the most recent valid_epoch call returned.
class_names = ["authentic", "forged"]
tick_positions = [0.5, 1.5]  # centres of the two heatmap cells along each axis

cm = confusion_matrix(valid_targets, valid_preds)
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_xticks(tick_positions)
ax.set_xticklabels(class_names)
ax.set_yticks(tick_positions)
ax.set_yticklabels(class_names)
ax.set_title("Confusion Matrix (Validation)")
plt.show()