unzipping

In [3]:
import zipfile
import os

# --- 1. Set your paths ---

# This is the name of the file you downloaded from the hackathon.
# (e.g., "Synergy25_dataset.zip")
zip_file_path = '/content/drive/MyDrive/fake_cifake_images.zip'

# This is the name of the folder where you want all the files to go.
# (e.t., "dataset/")
destination_folder = 'hackathon_dataset'

# --- 2. Create the destination folder if it doesn't exist ---
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)
    print(f"Created directory: {destination_folder}")

# --- 3. Unzip the file ---
try:
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        print(f"Unzipping '{zip_file_path}'...")
        zip_ref.extractall(destination_folder)
        print(f"Successfully unzipped all files to '{destination_folder}'")

        # Optional: List the files you unzipped
        print("\nUnzipped contents:")
        print(zip_ref.namelist())

except zipfile.BadZipFile:
    print(f"Error: The file '{zip_file_path}' is not a valid zip file or is corrupted.")
except FileNotFoundError:
    print(f"Error: The file '{zip_file_path}' was not found.")
    print("Please make sure the file is in the same directory as this script, or provide the full path.")

Unzipping '/content/drive/MyDrive/fake_cifake_images.zip'...
Successfully unzipped all files to 'hackathon_dataset'

Unzipped contents:
['fake_cifake_images/995.png', 'fake_cifake_images/996.png', 'fake_cifake_images/957.png', 'fake_cifake_images/1000.png', 'fake_cifake_images/997.png', 'fake_cifake_images/998.png', 'fake_cifake_images/999.png', 'fake_cifake_images/948.png', 'fake_cifake_images/959.png', 'fake_cifake_images/969.png', 'fake_cifake_images/984.png', 'fake_cifake_images/991.png', 'fake_cifake_images/966.png', 'fake_cifake_images/955.png', 'fake_cifake_images/994.png', 'fake_cifake_images/978.png', 'fake_cifake_images/928.png', 'fake_cifake_images/953.png', 'fake_cifake_images/945.png', 'fake_cifake_images/954.png', 'fake_cifake_images/980.png', 'fake_cifake_images/939.png', 'fake_cifake_images/982.png', 'fake_cifake_images/986.png', 'fake_cifake_images/942.png', 'fake_cifake_images/929.png', 'fake_cifake_images/937.png', 'fake_cifake_images/964.png', 'fake_cifake_images/97

DATA PREPARATION

In [4]:
import os
import json
from PIL import Image
import collections

# --- Configuration ---
# This should be the path to the folder where you unzipped everything.
# It should contain the 5 subfolders ('real images', 'fake images', etc.)
BASE_DATA_DIR = 'hackathon_dataset'

# Define the paths to your folders and files
REAL_IMG_DIR = os.path.join(BASE_DATA_DIR, '/content/hackathon_dataset/real_cifake_images')
FAKE_IMG_DIR = os.path.join(BASE_DATA_DIR, '/content/hackathon_dataset/fake_cifake_images')
TEST_IMG_DIR = os.path.join(BASE_DATA_DIR, '/content/hackathon_dataset/test')
REAL_JSON_PATH = os.path.join(BASE_DATA_DIR, '/content/drive/MyDrive/real_cifake_preds.json', '/content/drive/MyDrive/real_cifake_preds.json') # Assuming file is named this
FAKE_JSON_PATH = os.path.join(BASE_DATA_DIR, '/content/drive/MyDrive/fake_cifake_preds.json', '/content/drive/MyDrive/fake_cifake_preds.json') # Assuming file is named this

print("--- Starting Dataset Verification ---")

# ==============================================================================
# CHECK 1: File Count Sanity Check
# ==============================================================================
print("\n[CHECK 1: File Count Sanity Check]")
try:
    num_real_images = len(os.listdir(REAL_IMG_DIR))
    num_fake_images = len(os.listdir(FAKE_IMG_DIR))
    num_test_images = len(os.listdir(TEST_IMG_DIR))

    print(f"Found {num_real_images} images in 'real images' folder.")
    print(f"Found {num_fake_images} images in 'fake images' folder.")
    print(f"Found {num_test_images} images in 'test image' folder.")

    if num_real_images == 1000 and num_fake_images == 1000:
        print("✅ STATUS: Correct number of training images found (1000 real, 1000 fake).")
    else:
        print("⚠️ WARNING: Image counts do not match the expected 1000/1000 split.")

except FileNotFoundError as e:
    print(f"❌ ERROR: A folder was not found. Please check your paths. Details: {e}")
    exit() # Stop the script if basic folders are missing

# ==============================================================================
# CHECK 2: The "Imperfect Model" Check (JSON Analysis)
# ==============================================================================
print("\n[CHECK 2: JSON Prediction Analysis]")
try:
    with open(REAL_JSON_PATH, 'r') as f:
        real_json_data = json.load(f)
    with open(FAKE_JSON_PATH, 'r') as f:
        fake_json_data = json.load(f)

    # Count predictions in the JSON for REAL images
    real_json_counts = collections.Counter(item['prediction'] for item in real_json_data)
    print("Proprietary model's predictions on REAL images:")
    print(f"  - Predicted 'real': {real_json_counts.get('real', 0)}")
    print(f"  - Predicted 'fake': {real_json_counts.get('fake', 0)}")

    # Count predictions in the JSON for FAKE images
    fake_json_counts = collections.Counter(item['prediction'] for item in fake_json_data)
    print("Proprietary model's predictions on FAKE images:")
    print(f"  - Predicted 'fake': {fake_json_counts.get('fake', 0)}")
    print(f"  - Predicted 'real': {fake_json_counts.get('real', 0)}")

    # --- The CRITICAL VERDICT ---
    if real_json_counts.get('fake', 0) == 0 and fake_json_counts.get('real', 0) == 0:
        print("✅ STATUS: The proprietary model is 'perfect' on the training set.")
        print("   Our task is a standard, balanced binary classification.")
    else:
        print("⚠️ STATUS: The proprietary model is 'imperfect'. It makes mistakes.")
        print("   This is an imbalanced/noisy-label problem. Our goal is to MIMIC THESE MISTAKES.")

except FileNotFoundError as e:
    print(f"❌ ERROR: A JSON file was not found. Please check your JSON file names and paths. Details: {e}")
    exit()
except json.JSONDecodeError:
    print("❌ ERROR: Could not parse a JSON file. It might be corrupted.")
    exit()


# ==============================================================================
# CHECK 3: Image Format & Integrity Check
# ==============================================================================
print("\n[CHECK 3: Image Integrity Check (testing a sample of 10 from each folder)]")
image_sizes = set()
image_modes = set()
corrupted_files = []

def check_images(directory, num_to_check=10):
    files = os.listdir(directory)
    for i, filename in enumerate(files):
        if i >= num_to_check:
            break
        try:
            with Image.open(os.path.join(directory, filename)) as img:
                image_sizes.add(img.size)
                image_modes.add(img.mode)
        except Exception as e:
            corrupted_files.append(os.path.join(directory, filename))

try:
    check_images(REAL_IMG_DIR)
    check_images(FAKE_IMG_DIR)

    print(f"Found image sizes: {image_sizes}")
    print(f"Found image modes (e.g., RGB, L): {image_modes}")

    if len(image_sizes) == 1:
        print("✅ STATUS: All tested images have a consistent size.")
    else:
        print("⚠️ WARNING: Images have varying sizes. We will need to resize them all.")

    if len(image_modes) == 1 and 'RGB' in image_modes:
        print("✅ STATUS: All tested images are in consistent 'RGB' mode.")
    else:
        print("⚠️ WARNING: Images have varying modes (e.g., Grayscale 'L') or are not RGB.")

    if not corrupted_files:
        print("✅ STATUS: No corrupted images found in the sample.")
    else:
        print(f"⚠️ WARNING: Found {len(corrupted_files)} corrupted images: {corrupted_files}")

except Exception as e:
    print(f"❌ ERROR: An unexpected error occurred during image check. Details: {e}")

print("\n--- Verification Complete ---")


--- Starting Dataset Verification ---

[CHECK 1: File Count Sanity Check]
Found 1000 images in 'real images' folder.
Found 1000 images in 'fake images' folder.
Found 500 images in 'test image' folder.
✅ STATUS: Correct number of training images found (1000 real, 1000 fake).

[CHECK 2: JSON Prediction Analysis]
Proprietary model's predictions on REAL images:
  - Predicted 'real': 976
  - Predicted 'fake': 24
Proprietary model's predictions on FAKE images:
  - Predicted 'fake': 988
  - Predicted 'real': 12
⚠️ STATUS: The proprietary model is 'imperfect'. It makes mistakes.
   This is an imbalanced/noisy-label problem. Our goal is to MIMIC THESE MISTAKES.

[CHECK 3: Image Integrity Check (testing a sample of 10 from each folder)]
Found image sizes: {(32, 32)}
Found image modes (e.g., RGB, L): {'RGB'}
✅ STATUS: All tested images have a consistent size.
✅ STATUS: All tested images are in consistent 'RGB' mode.
✅ STATUS: No corrupted images found in the sample.

--- Verification Complete ---

In [5]:
import os
import json
import pandas as pd

# --- Configuration ---
# This should be the path to the folder where you unzipped everything.
# BASE_DATA_DIR = 'hackathon_dataset' # No longer needed with absolute paths

# Define the paths to your folders and files
REAL_IMG_DIR = '/content/hackathon_dataset/real_cifake_images'
FAKE_IMG_DIR = '/content/hackathon_dataset/fake_cifake_images'
REAL_JSON_PATH = '/content/drive/MyDrive/real_cifake_preds.json'
FAKE_JSON_PATH = '/content/drive/MyDrive/fake_cifake_preds.json'

# Output file name
OUTPUT_CSV_PATH = 'master_labels.csv'

def process_data(image_dir, json_path, data_list):
    """
    Reads a JSON file and an image directory, and populates a list with
    image paths and their corresponding target labels.
    """
    print(f"Processing data from: {os.path.basename(json_path)}")

    # --- Load the JSON prediction data ---
    try:
        with open(json_path, 'r') as f:
            predictions = json.load(f)
    except FileNotFoundError:
        print(f"❌ ERROR: JSON file not found at {json_path}. Please check the path and filename.")
        return False
    except json.JSONDecodeError:
        print(f"❌ ERROR: Could not decode JSON from {json_path}. The file might be corrupted.")
        return False

    # --- Create a dictionary for quick lookup: {index: prediction} ---
    prediction_map = {item['index']: item['prediction'] for item in predictions}

    # --- Iterate through images and create the master list ---
    image_files = os.listdir(image_dir)
    for filename in image_files:
        # Assumes image filenames are like "1.jpg", "2.png", etc.
        # We extract the number to use as the index.
        try:
            # Get the base name without extension (e.g., "1") and convert to integer
            file_index = int(os.path.splitext(filename)[0])
        except ValueError:
            print(f"⚠️ Warning: Could not parse index from filename '{filename}'. Skipping.")
            continue

        if file_index in prediction_map:
            prediction_str = prediction_map[file_index]

            # Encode labels: "real" -> 0, "fake" -> 1
            target_label = 1 if prediction_str == 'fake' else 0

            # Get the full path to the image
            image_path = os.path.join(image_dir, filename)

            data_list.append({
                'image_path': image_path,
                'target_label': target_label
            })
        else:
            print(f"⚠️ Warning: No prediction found in JSON for image index {file_index} ('{filename}').")

    return True


def main():
    """Main function to run the data preparation process."""
    print("--- Starting Step 1: Data Preparation ---")

    master_data_list = []

    # Process the "real" images and their corresponding JSON predictions
    if not process_data(REAL_IMG_DIR, REAL_JSON_PATH, master_data_list):
        return # Stop if there was an error

    # Process the "fake" images and their corresponding JSON predictions
    if not process_data(FAKE_IMG_DIR, FAKE_JSON_PATH, master_data_list):
        return # Stop if there was an error

    # --- Convert the list to a pandas DataFrame ---
    if not master_data_list:
        print("❌ ERROR: No data was processed. The master list is empty. Halting.")
        return

    df = pd.DataFrame(master_data_list)

    # --- Shuffle the DataFrame to mix real and fake samples ---
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # --- Save the final DataFrame to a CSV file ---
    try:
        df.to_csv(OUTPUT_CSV_PATH, index=False)
        print(f"\n✅ Success! Created master dataset with {len(df)} entries.")
        print(f"   Saved to '{OUTPUT_CSV_PATH}'.")

        # Display the first few rows and the class distribution
        print("\n--- Dataset Preview ---")
        print(df.head())
        print("\n--- Final Label Distribution ---")
        print(df['target_label'].value_counts())

    except Exception as e:
        print(f"❌ ERROR: Could not save the CSV file. Details: {e}")


if __name__ == '__main__':
    main()




--- Starting Step 1: Data Preparation ---
Processing data from: real_cifake_preds.json
Processing data from: fake_cifake_preds.json

✅ Success! Created master dataset with 2000 entries.
   Saved to 'master_labels.csv'.

--- Dataset Preview ---
                                          image_path  target_label
0  /content/hackathon_dataset/fake_cifake_images/...             1
1  /content/hackathon_dataset/real_cifake_images/...             0
2  /content/hackathon_dataset/fake_cifake_images/...             1
3  /content/hackathon_dataset/real_cifake_images/...             0
4  /content/hackathon_dataset/fake_cifake_images/...             1

--- Final Label Distribution ---
target_label
1    1012
0     988
Name: count, dtype: int64


TRAINING v1

In [6]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os

# --- Configuration ---
MASTER_CSV_PATH = 'master_labels.csv'
MODEL_SAVE_PATH = 'best_model.pth'
NUM_EPOCHS = 30
BATCH_SIZE = 64
LEARNING_RATE = 0.001
IMAGE_SIZE = 32 # Based on our verification step

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# --- 1. Custom Dataset Definition ---
class DeepfakeDataset(Dataset):
    """Custom Dataset for loading images from the master CSV file."""
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = self.dataframe.iloc[idx]['image_path']
        label = int(self.dataframe.iloc[idx]['target_label'])

        # Load image
        try:
            image = Image.open(img_path).convert('RGB')
        except FileNotFoundError:
            print(f"Error: Image not found at {img_path}")
            # Return a dummy image and label if file is missing
            return torch.zeros(3, IMAGE_SIZE, IMAGE_SIZE), -1

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label, dtype=torch.float32)

# --- 2. Data Transforms and Splitting ---
# Define augmentations for the training set
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
    transforms.RandomRotation(5),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Define transforms for the validation set (no augmentation)
val_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load the master CSV
try:
    df = pd.read_csv(MASTER_CSV_PATH)
except FileNotFoundError:
    print(f"❌ ERROR: '{MASTER_CSV_PATH}' not found. Please run the data preparation script first.")
    exit()

# Stratified split into training and validation sets
train_df, val_df = train_test_split(
    df,
    test_size=0.2,       # 80% training, 20% validation
    random_state=42,
    stratify=df['target_label'] # CRITICAL for maintaining label distribution
)

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

# Create Datasets and DataLoaders
train_dataset = DeepfakeDataset(train_df, transform=train_transform)
val_dataset = DeepfakeDataset(val_df, transform=val_transform)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


# --- 3. Model Definition (ResNet18) ---
model = models.resnet18(weights='IMAGENET1K_V1')

# Modify the final layer for our binary classification task
num_ftrs = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(num_ftrs, 256),
    nn.ReLU(),
    nn.Dropout(0.4),
    nn.Linear(256, 1) # Output is a single value
)

model = model.to(device)


# --- 4. Loss Function, Optimizer, Scheduler ---
criterion = nn.BCEWithLogitsLoss() # Handles the sigmoid activation internally
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=3, factor=0.1)


# --- 5. Training Loop ---
best_val_accuracy = 0.0

for epoch in range(NUM_EPOCHS):
    print(f"\n--- Epoch {epoch+1}/{NUM_EPOCHS} ---")

    # --- Training Phase ---
    model.train()
    running_loss = 0.0
    correct_train_preds = 0
    total_train_samples = 0

    for images, labels in tqdm(train_loader, desc="Training"):
        images, labels = images.to(device), labels.to(device).unsqueeze(1)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * images.size(0)

        # Calculate accuracy
        preds = torch.sigmoid(outputs) > 0.5
        correct_train_preds += (preds == labels).sum().item()
        total_train_samples += labels.size(0)

    train_loss = running_loss / total_train_samples
    train_accuracy = correct_train_preds / total_train_samples

    # --- Validation Phase ---
    model.eval()
    running_val_loss = 0.0
    correct_val_preds = 0
    total_val_samples = 0

    with torch.no_grad():
        for images, labels in tqdm(val_loader, desc="Validation"):
            images, labels = images.to(device), labels.to(device).unsqueeze(1)

            outputs = model(images)
            loss = criterion(outputs, labels)
            running_val_loss += loss.item() * images.size(0)

            preds = torch.sigmoid(outputs) > 0.5
            correct_val_preds += (preds == labels).sum().item()
            total_val_samples += labels.size(0)

    val_loss = running_val_loss / total_val_samples
    val_accuracy = correct_val_preds / total_val_samples

    print(f"Epoch {epoch+1} Summary:")
    print(f"  Train Loss: {train_loss:.4f} | Train Accuracy: {train_accuracy:.4f}")
    print(f"  Valid Loss: {val_loss:.4f} | Valid Accuracy: {val_accuracy:.4f}")

    # Announce LR change manually if it happens
    old_lr = optimizer.param_groups[0]['lr']
    scheduler.step(val_accuracy)
    new_lr = optimizer.param_groups[0]['lr']
    if new_lr < old_lr:
        print(f"Learning rate reduced from {old_lr} to {new_lr}")

    # Save the best model based on validation accuracy
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print(f"✅ New best model saved with validation accuracy: {best_val_accuracy:.4f}")

print("\n--- Training Complete ---")
print(f"Best validation accuracy achieved: {best_val_accuracy:.4f}")
print(f"Best model saved to '{MODEL_SAVE_PATH}'")



Using device: cuda
Training set size: 1600
Validation set size: 400
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 127MB/s]



--- Epoch 1/30 ---


Training: 100%|██████████| 25/25 [00:04<00:00,  5.90it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 18.17it/s]


Epoch 1 Summary:
  Train Loss: 0.5449 | Train Accuracy: 0.7369
  Valid Loss: 0.7272 | Valid Accuracy: 0.7000
✅ New best model saved with validation accuracy: 0.7000

--- Epoch 2/30 ---


Training: 100%|██████████| 25/25 [00:03<00:00,  7.75it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 24.95it/s]


Epoch 2 Summary:
  Train Loss: 0.3644 | Train Accuracy: 0.8594
  Valid Loss: 0.4144 | Valid Accuracy: 0.8200
✅ New best model saved with validation accuracy: 0.8200

--- Epoch 3/30 ---


Training: 100%|██████████| 25/25 [00:02<00:00,  9.30it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 25.27it/s]


Epoch 3 Summary:
  Train Loss: 0.3124 | Train Accuracy: 0.8781
  Valid Loss: 0.4805 | Valid Accuracy: 0.8150

--- Epoch 4/30 ---


Training: 100%|██████████| 25/25 [00:02<00:00,  9.27it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 20.80it/s]


Epoch 4 Summary:
  Train Loss: 0.2695 | Train Accuracy: 0.8925
  Valid Loss: 0.4720 | Valid Accuracy: 0.8425
✅ New best model saved with validation accuracy: 0.8425

--- Epoch 5/30 ---


Training: 100%|██████████| 25/25 [00:02<00:00,  9.26it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 24.52it/s]


Epoch 5 Summary:
  Train Loss: 0.2052 | Train Accuracy: 0.9256
  Valid Loss: 0.5468 | Valid Accuracy: 0.8150

--- Epoch 6/30 ---


Training: 100%|██████████| 25/25 [00:03<00:00,  7.76it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 21.49it/s]


Epoch 6 Summary:
  Train Loss: 0.2309 | Train Accuracy: 0.9250
  Valid Loss: 0.3803 | Valid Accuracy: 0.8425

--- Epoch 7/30 ---


Training: 100%|██████████| 25/25 [00:02<00:00,  9.23it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 23.66it/s]


Epoch 7 Summary:
  Train Loss: 0.2322 | Train Accuracy: 0.9244
  Valid Loss: 0.6239 | Valid Accuracy: 0.7350

--- Epoch 8/30 ---


Training: 100%|██████████| 25/25 [00:02<00:00,  9.13it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 23.09it/s]


Epoch 8 Summary:
  Train Loss: 0.1948 | Train Accuracy: 0.9231
  Valid Loss: 0.6202 | Valid Accuracy: 0.7925
Learning rate reduced from 0.001 to 0.0001

--- Epoch 9/30 ---


Training: 100%|██████████| 25/25 [00:02<00:00,  9.50it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 24.06it/s]


Epoch 9 Summary:
  Train Loss: 0.1590 | Train Accuracy: 0.9537
  Valid Loss: 0.4380 | Valid Accuracy: 0.8500
✅ New best model saved with validation accuracy: 0.8500

--- Epoch 10/30 ---


Training: 100%|██████████| 25/25 [00:03<00:00,  8.09it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 18.79it/s]


Epoch 10 Summary:
  Train Loss: 0.1108 | Train Accuracy: 0.9663
  Valid Loss: 0.4632 | Valid Accuracy: 0.8400

--- Epoch 11/30 ---


Training: 100%|██████████| 25/25 [00:02<00:00,  9.06it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 24.90it/s]


Epoch 11 Summary:
  Train Loss: 0.0860 | Train Accuracy: 0.9719
  Valid Loss: 0.4872 | Valid Accuracy: 0.8400

--- Epoch 12/30 ---


Training: 100%|██████████| 25/25 [00:02<00:00,  8.55it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 15.92it/s]


Epoch 12 Summary:
  Train Loss: 0.0780 | Train Accuracy: 0.9731
  Valid Loss: 0.4958 | Valid Accuracy: 0.8500

--- Epoch 13/30 ---


Training: 100%|██████████| 25/25 [00:06<00:00,  4.13it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00,  7.96it/s]


Epoch 13 Summary:
  Train Loss: 0.0645 | Train Accuracy: 0.9788
  Valid Loss: 0.4941 | Valid Accuracy: 0.8500
Learning rate reduced from 0.0001 to 1e-05

--- Epoch 14/30 ---


Training: 100%|██████████| 25/25 [00:05<00:00,  4.99it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 23.56it/s]


Epoch 14 Summary:
  Train Loss: 0.0570 | Train Accuracy: 0.9819
  Valid Loss: 0.4947 | Valid Accuracy: 0.8500

--- Epoch 15/30 ---


Training: 100%|██████████| 25/25 [00:03<00:00,  7.60it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 24.76it/s]


Epoch 15 Summary:
  Train Loss: 0.0631 | Train Accuracy: 0.9812
  Valid Loss: 0.4888 | Valid Accuracy: 0.8525
✅ New best model saved with validation accuracy: 0.8525

--- Epoch 16/30 ---


Training: 100%|██████████| 25/25 [00:02<00:00,  9.33it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 24.56it/s]


Epoch 16 Summary:
  Train Loss: 0.0573 | Train Accuracy: 0.9812
  Valid Loss: 0.4788 | Valid Accuracy: 0.8625
✅ New best model saved with validation accuracy: 0.8625

--- Epoch 17/30 ---


Training: 100%|██████████| 25/25 [00:03<00:00,  8.00it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 18.42it/s]


Epoch 17 Summary:
  Train Loss: 0.0559 | Train Accuracy: 0.9788
  Valid Loss: 0.4802 | Valid Accuracy: 0.8600

--- Epoch 18/30 ---


Training: 100%|██████████| 25/25 [00:02<00:00,  9.06it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 11.57it/s]


Epoch 18 Summary:
  Train Loss: 0.0426 | Train Accuracy: 0.9888
  Valid Loss: 0.4825 | Valid Accuracy: 0.8550

--- Epoch 19/30 ---


Training: 100%|██████████| 25/25 [00:02<00:00,  9.35it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 25.32it/s]


Epoch 19 Summary:
  Train Loss: 0.0499 | Train Accuracy: 0.9844
  Valid Loss: 0.4866 | Valid Accuracy: 0.8550

--- Epoch 20/30 ---


Training: 100%|██████████| 25/25 [00:02<00:00,  9.50it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 22.75it/s]


Epoch 20 Summary:
  Train Loss: 0.0553 | Train Accuracy: 0.9794
  Valid Loss: 0.4781 | Valid Accuracy: 0.8600
Learning rate reduced from 1e-05 to 1.0000000000000002e-06

--- Epoch 21/30 ---


Training: 100%|██████████| 25/25 [00:03<00:00,  8.04it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 19.49it/s]


Epoch 21 Summary:
  Train Loss: 0.0389 | Train Accuracy: 0.9875
  Valid Loss: 0.4856 | Valid Accuracy: 0.8550

--- Epoch 22/30 ---


Training: 100%|██████████| 25/25 [00:02<00:00,  8.69it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 23.38it/s]


Epoch 22 Summary:
  Train Loss: 0.0518 | Train Accuracy: 0.9825
  Valid Loss: 0.4831 | Valid Accuracy: 0.8575

--- Epoch 23/30 ---


Training: 100%|██████████| 25/25 [00:02<00:00,  9.29it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 24.21it/s]


Epoch 23 Summary:
  Train Loss: 0.0399 | Train Accuracy: 0.9881
  Valid Loss: 0.4841 | Valid Accuracy: 0.8600

--- Epoch 24/30 ---


Training: 100%|██████████| 25/25 [00:02<00:00,  9.04it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 24.25it/s]


Epoch 24 Summary:
  Train Loss: 0.0450 | Train Accuracy: 0.9869
  Valid Loss: 0.4835 | Valid Accuracy: 0.8575
Learning rate reduced from 1.0000000000000002e-06 to 1.0000000000000002e-07

--- Epoch 25/30 ---


Training: 100%|██████████| 25/25 [00:02<00:00,  8.36it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 16.05it/s]


Epoch 25 Summary:
  Train Loss: 0.0424 | Train Accuracy: 0.9856
  Valid Loss: 0.4910 | Valid Accuracy: 0.8575

--- Epoch 26/30 ---


Training: 100%|██████████| 25/25 [00:03<00:00,  7.72it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 22.36it/s]


Epoch 26 Summary:
  Train Loss: 0.0539 | Train Accuracy: 0.9825
  Valid Loss: 0.4842 | Valid Accuracy: 0.8550

--- Epoch 27/30 ---


Training: 100%|██████████| 25/25 [00:02<00:00,  9.08it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 22.91it/s]


Epoch 27 Summary:
  Train Loss: 0.0332 | Train Accuracy: 0.9912
  Valid Loss: 0.4827 | Valid Accuracy: 0.8575

--- Epoch 28/30 ---


Training: 100%|██████████| 25/25 [00:02<00:00,  9.03it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 23.48it/s]


Epoch 28 Summary:
  Train Loss: 0.0508 | Train Accuracy: 0.9838
  Valid Loss: 0.4878 | Valid Accuracy: 0.8550
Learning rate reduced from 1.0000000000000002e-07 to 1.0000000000000004e-08

--- Epoch 29/30 ---


Training: 100%|██████████| 25/25 [00:02<00:00,  8.59it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 17.97it/s]


Epoch 29 Summary:
  Train Loss: 0.0496 | Train Accuracy: 0.9831
  Valid Loss: 0.4936 | Valid Accuracy: 0.8500

--- Epoch 30/30 ---


Training: 100%|██████████| 25/25 [00:03<00:00,  8.09it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 23.96it/s]

Epoch 30 Summary:
  Train Loss: 0.0475 | Train Accuracy: 0.9862
  Valid Loss: 0.5031 | Valid Accuracy: 0.8475

--- Training Complete ---
Best validation accuracy achieved: 0.8625
Best model saved to 'best_model.pth'





v1 PREDICTION

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
import os
import json
from tqdm import tqdm

# --- Configuration ---
# Update these paths if they are different in your environment
TEST_IMG_DIR = '/content/hackathon_dataset/test'
MODEL_PATH = 'best_model.pth'
OUTPUT_JSON_PATH = 'teamname_prediction.json' # IMPORTANT: Rename this with your team name

# Model and data settings (must match the training script)
IMAGE_SIZE = 32
BATCH_SIZE = 64 # Can be larger for inference

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# --- 1. Custom Dataset for Test Images ---
class TestDataset(Dataset):
    """Dataset for loading test images."""
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.image_files = [f for f in os.listdir(root_dir) if os.path.isfile(os.path.join(root_dir, f))]

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        filename = self.image_files[idx]
        img_path = os.path.join(self.root_dir, filename)
        image = Image.open(img_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        # Extract index from filename (e.g., "501.jpg" -> 501)
        index = int(os.path.splitext(filename)[0])
        return image, index

# --- 2. Load Model ---
print(f"Loading model from '{MODEL_PATH}'...")
# Re-create the model architecture
model = models.resnet18()
num_ftrs = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(num_ftrs, 256),
    nn.ReLU(),
    nn.Dropout(0.4),
    nn.Linear(256, 1)
)

try:
    # Load the saved weights
    model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
except FileNotFoundError:
    print(f"❌ ERROR: Model file not found at '{MODEL_PATH}'.")
    exit()

model = model.to(device)
model.eval() # CRITICAL: Set model to evaluation mode

# --- 3. Prepare Test Data ---
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

test_dataset = TestDataset(root_dir=TEST_IMG_DIR, transform=test_transform)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"Found {len(test_dataset)} images in the test directory.")

# --- 4. Generate Predictions ---
predictions = []
with torch.no_grad(): # Disable gradient calculation for speed
    for images, indices in tqdm(test_loader, desc="Predicting"):
        images = images.to(device)

        outputs = model(images)

        # Apply sigmoid and threshold at 0.5 to get final predictions
        preds = (torch.sigmoid(outputs) > 0.5).squeeze().cpu().numpy().astype(int)
        indices = indices.cpu().numpy()

        for index, pred in zip(indices, preds):
            # Decode label: 1 -> "fake", 0 -> "real"
            prediction_str = "fake" if pred == 1 else "real"
            predictions.append({"index": int(index), "prediction": prediction_str})

# --- 5. Save Output JSON ---
# Sort predictions by index for a clean, ordered output file
predictions.sort(key=lambda x: x['index'])

try:
    with open(OUTPUT_JSON_PATH, 'w') as f:
        json.dump(predictions, f, indent=4)
    print(f"\n✅ Success! Predictions saved to '{OUTPUT_JSON_PATH}'")
    # Print a sample of the output
    print("\n--- Prediction Sample ---")
    print(json.dumps(predictions[:5], indent=4))
except Exception as e:
    print(f"❌ ERROR: Could not write JSON file. Details: {e}")


Using device: cpu
Loading model from 'best_model.pth'...
Found 500 images in the test directory.


Predicting: 100%|██████████| 8/8 [00:01<00:00,  6.51it/s]


✅ Success! Predictions saved to 'teamname_prediction.json'

--- Prediction Sample ---
[
    {
        "index": 1,
        "prediction": "fake"
    },
    {
        "index": 2,
        "prediction": "real"
    },
    {
        "index": 3,
        "prediction": "fake"
    },
    {
        "index": 4,
        "prediction": "fake"
    },
    {
        "index": 5,
        "prediction": "fake"
    }
]





V2 TRAINING

In [8]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os

# --- Configuration ---
MASTER_CSV_PATH = 'master_labels.csv'
MODEL_SAVE_PATH = 'best_model_v2.pth' # Saving to a new file to avoid overwriting the original
NUM_EPOCHS = 30
BATCH_SIZE = 128
LEARNING_RATE = 0.001
IMAGE_SIZE = 32

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# --- 1. Custom Dataset Definition ---
class DeepfakeDataset(Dataset):
    """Custom Dataset for loading images from the master CSV file."""
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = self.dataframe.iloc[idx]['image_path']
        label = int(self.dataframe.iloc[idx]['target_label'])

        try:
            image = Image.open(img_path).convert('RGB')
        except FileNotFoundError:
            print(f"Error: Image not found at {img_path}")
            return torch.zeros(3, IMAGE_SIZE, IMAGE_SIZE), -1

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label, dtype=torch.float32)

# --- 2. Data Transforms and Splitting ---
# Define augmentations for the training set with RandomErasing
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
    transforms.RandomRotation(5),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    transforms.RandomErasing(p=0.5, scale=(0.02, 0.2)) # TECHNIQUE 3: Stronger Augmentation
])

# Define transforms for the validation set (no augmentation)
val_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load the master CSV
df = pd.read_csv(MASTER_CSV_PATH)

# Stratified split into training and validation sets
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['target_label']
)

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

# Create Datasets and DataLoaders
train_dataset = DeepfakeDataset(train_df, transform=train_transform)
val_dataset = DeepfakeDataset(val_df, transform=val_transform)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


# --- 3. Model Definition (ResNet18) ---
model = models.resnet18(weights='IMAGENET1K_V1')

# Modify the final layer for our binary classification task
num_ftrs = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(num_ftrs, 256),
    nn.ReLU(),
    nn.Dropout(0.5), # TECHNIQUE 1: Increased Dropout
    nn.Linear(256, 1)
)

model = model.to(device)


# --- 4. Loss Function, Optimizer, Scheduler ---
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4) # TECHNIQUE 2: Added Weight Decay
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=3, factor=0.1)


# --- 5. Training Loop ---
best_val_accuracy = 0.0

for epoch in range(NUM_EPOCHS):
    print(f"\n--- Epoch {epoch+1}/{NUM_EPOCHS} ---")

    # --- Training Phase ---
    model.train()
    running_loss = 0.0
    correct_train_preds = 0
    total_train_samples = 0

    for images, labels in tqdm(train_loader, desc="Training"):
        images, labels = images.to(device), labels.to(device).unsqueeze(1)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * images.size(0)

        preds = torch.sigmoid(outputs) > 0.5
        correct_train_preds += (preds == labels).sum().item()
        total_train_samples += labels.size(0)

    train_loss = running_loss / total_train_samples
    train_accuracy = correct_train_preds / total_train_samples

    # --- Validation Phase ---
    model.eval()
    running_val_loss = 0.0
    correct_val_preds = 0
    total_val_samples = 0

    with torch.no_grad():
        for images, labels in tqdm(val_loader, desc="Validation"):
            images, labels = images.to(device), labels.to(device).unsqueeze(1)

            outputs = model(images)
            loss = criterion(outputs, labels)
            running_val_loss += loss.item() * images.size(0)

            preds = torch.sigmoid(outputs) > 0.5
            correct_val_preds += (preds == labels).sum().item()
            total_val_samples += labels.size(0)

    val_loss = running_val_loss / total_val_samples
    val_accuracy = correct_val_preds / total_val_samples

    print(f"Epoch {epoch+1} Summary:")
    print(f"  Train Loss: {train_loss:.4f} | Train Accuracy: {train_accuracy:.4f}")
    print(f"  Valid Loss: {val_loss:.4f} | Valid Accuracy: {val_accuracy:.4f}")

    old_lr = optimizer.param_groups[0]['lr']
    scheduler.step(val_accuracy)
    new_lr = optimizer.param_groups[0]['lr']
    if new_lr < old_lr:
        print(f"Learning rate reduced from {old_lr} to {new_lr}")

    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print(f"✅ New best model saved with validation accuracy: {best_val_accuracy:.4f}")

print("\n--- Training Complete ---")
print(f"Best validation accuracy achieved: {best_val_accuracy:.4f}")
print(f"Best model saved to '{MODEL_SAVE_PATH}'")


Using device: cuda
Training set size: 1600
Validation set size: 400

--- Epoch 1/30 ---


Training: 100%|██████████| 13/13 [00:03<00:00,  3.72it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 14.47it/s]


Epoch 1 Summary:
  Train Loss: 0.5674 | Train Accuracy: 0.7094
  Valid Loss: 1.9166 | Valid Accuracy: 0.5150
✅ New best model saved with validation accuracy: 0.5150

--- Epoch 2/30 ---


Training: 100%|██████████| 13/13 [00:02<00:00,  4.35it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 13.90it/s]


Epoch 2 Summary:
  Train Loss: 0.3883 | Train Accuracy: 0.8456
  Valid Loss: 1.3026 | Valid Accuracy: 0.7425
✅ New best model saved with validation accuracy: 0.7425

--- Epoch 3/30 ---


Training: 100%|██████████| 13/13 [00:02<00:00,  4.72it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 13.77it/s]


Epoch 3 Summary:
  Train Loss: 0.3116 | Train Accuracy: 0.8744
  Valid Loss: 0.4294 | Valid Accuracy: 0.8425
✅ New best model saved with validation accuracy: 0.8425

--- Epoch 4/30 ---


Training: 100%|██████████| 13/13 [00:03<00:00,  3.98it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 11.75it/s]


Epoch 4 Summary:
  Train Loss: 0.2712 | Train Accuracy: 0.8950
  Valid Loss: 0.4314 | Valid Accuracy: 0.8625
✅ New best model saved with validation accuracy: 0.8625

--- Epoch 5/30 ---


Training: 100%|██████████| 13/13 [00:02<00:00,  4.75it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 14.85it/s]


Epoch 5 Summary:
  Train Loss: 0.2576 | Train Accuracy: 0.8944
  Valid Loss: 0.4255 | Valid Accuracy: 0.8625

--- Epoch 6/30 ---


Training: 100%|██████████| 13/13 [00:02<00:00,  4.86it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 14.61it/s]


Epoch 6 Summary:
  Train Loss: 0.2349 | Train Accuracy: 0.9087
  Valid Loss: 0.4250 | Valid Accuracy: 0.8550

--- Epoch 7/30 ---


Training: 100%|██████████| 13/13 [00:02<00:00,  4.79it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 14.93it/s]


Epoch 7 Summary:
  Train Loss: 0.2097 | Train Accuracy: 0.9331
  Valid Loss: 0.4044 | Valid Accuracy: 0.8675
✅ New best model saved with validation accuracy: 0.8675

--- Epoch 8/30 ---


Training: 100%|██████████| 13/13 [00:03<00:00,  4.08it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 10.74it/s]


Epoch 8 Summary:
  Train Loss: 0.2109 | Train Accuracy: 0.9163
  Valid Loss: 0.3998 | Valid Accuracy: 0.8400

--- Epoch 9/30 ---


Training: 100%|██████████| 13/13 [00:02<00:00,  4.59it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 14.16it/s]


Epoch 9 Summary:
  Train Loss: 0.1406 | Train Accuracy: 0.9494
  Valid Loss: 0.4432 | Valid Accuracy: 0.8600

--- Epoch 10/30 ---


Training: 100%|██████████| 13/13 [00:02<00:00,  4.71it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 14.94it/s]


Epoch 10 Summary:
  Train Loss: 0.1778 | Train Accuracy: 0.9300
  Valid Loss: 0.9059 | Valid Accuracy: 0.7250

--- Epoch 11/30 ---


Training: 100%|██████████| 13/13 [00:02<00:00,  4.76it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 14.84it/s]


Epoch 11 Summary:
  Train Loss: 0.1775 | Train Accuracy: 0.9450
  Valid Loss: 0.3864 | Valid Accuracy: 0.8625
Learning rate reduced from 0.001 to 0.0001

--- Epoch 12/30 ---


Training: 100%|██████████| 13/13 [00:03<00:00,  3.67it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00,  7.47it/s]


Epoch 12 Summary:
  Train Loss: 0.1167 | Train Accuracy: 0.9625
  Valid Loss: 0.3554 | Valid Accuracy: 0.8675

--- Epoch 13/30 ---


Training: 100%|██████████| 13/13 [00:03<00:00,  4.09it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 14.62it/s]


Epoch 13 Summary:
  Train Loss: 0.0970 | Train Accuracy: 0.9681
  Valid Loss: 0.3792 | Valid Accuracy: 0.8550

--- Epoch 14/30 ---


Training: 100%|██████████| 13/13 [00:02<00:00,  4.84it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 14.70it/s]


Epoch 14 Summary:
  Train Loss: 0.0778 | Train Accuracy: 0.9744
  Valid Loss: 0.4053 | Valid Accuracy: 0.8675

--- Epoch 15/30 ---


Training: 100%|██████████| 13/13 [00:02<00:00,  5.22it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 14.40it/s]


Epoch 15 Summary:
  Train Loss: 0.0694 | Train Accuracy: 0.9788
  Valid Loss: 0.3844 | Valid Accuracy: 0.8575
Learning rate reduced from 0.0001 to 1e-05

--- Epoch 16/30 ---


Training: 100%|██████████| 13/13 [00:03<00:00,  4.20it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 11.25it/s]


Epoch 16 Summary:
  Train Loss: 0.0681 | Train Accuracy: 0.9788
  Valid Loss: 0.3973 | Valid Accuracy: 0.8600

--- Epoch 17/30 ---


Training: 100%|██████████| 13/13 [00:02<00:00,  4.41it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 15.22it/s]


Epoch 17 Summary:
  Train Loss: 0.0792 | Train Accuracy: 0.9738
  Valid Loss: 0.4031 | Valid Accuracy: 0.8600

--- Epoch 18/30 ---


Training: 100%|██████████| 13/13 [00:02<00:00,  4.91it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 14.90it/s]


Epoch 18 Summary:
  Train Loss: 0.0787 | Train Accuracy: 0.9706
  Valid Loss: 0.3994 | Valid Accuracy: 0.8675

--- Epoch 19/30 ---


Training: 100%|██████████| 13/13 [00:02<00:00,  4.88it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 15.11it/s]


Epoch 19 Summary:
  Train Loss: 0.0546 | Train Accuracy: 0.9825
  Valid Loss: 0.4077 | Valid Accuracy: 0.8675
Learning rate reduced from 1e-05 to 1.0000000000000002e-06

--- Epoch 20/30 ---


Training: 100%|██████████| 13/13 [00:02<00:00,  4.40it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 10.79it/s]


Epoch 20 Summary:
  Train Loss: 0.0692 | Train Accuracy: 0.9781
  Valid Loss: 0.4166 | Valid Accuracy: 0.8675

--- Epoch 21/30 ---


Training: 100%|██████████| 13/13 [00:03<00:00,  4.21it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 13.90it/s]


Epoch 21 Summary:
  Train Loss: 0.0589 | Train Accuracy: 0.9812
  Valid Loss: 0.4040 | Valid Accuracy: 0.8675

--- Epoch 22/30 ---


Training: 100%|██████████| 13/13 [00:02<00:00,  4.75it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 13.87it/s]


Epoch 22 Summary:
  Train Loss: 0.0702 | Train Accuracy: 0.9762
  Valid Loss: 0.4011 | Valid Accuracy: 0.8625

--- Epoch 23/30 ---


Training: 100%|██████████| 13/13 [00:02<00:00,  4.73it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 14.36it/s]


Epoch 23 Summary:
  Train Loss: 0.0782 | Train Accuracy: 0.9769
  Valid Loss: 0.4017 | Valid Accuracy: 0.8675
Learning rate reduced from 1.0000000000000002e-06 to 1.0000000000000002e-07

--- Epoch 24/30 ---


Training: 100%|██████████| 13/13 [00:02<00:00,  4.83it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 11.46it/s]


Epoch 24 Summary:
  Train Loss: 0.0702 | Train Accuracy: 0.9788
  Valid Loss: 0.4029 | Valid Accuracy: 0.8625

--- Epoch 25/30 ---


Training: 100%|██████████| 13/13 [00:03<00:00,  4.04it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 13.68it/s]


Epoch 25 Summary:
  Train Loss: 0.0653 | Train Accuracy: 0.9769
  Valid Loss: 0.3992 | Valid Accuracy: 0.8600

--- Epoch 26/30 ---


Training: 100%|██████████| 13/13 [00:02<00:00,  4.74it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 14.15it/s]


Epoch 26 Summary:
  Train Loss: 0.0793 | Train Accuracy: 0.9756
  Valid Loss: 0.4013 | Valid Accuracy: 0.8625

--- Epoch 27/30 ---


Training: 100%|██████████| 13/13 [00:02<00:00,  4.71it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 14.45it/s]


Epoch 27 Summary:
  Train Loss: 0.0788 | Train Accuracy: 0.9756
  Valid Loss: 0.4098 | Valid Accuracy: 0.8625
Learning rate reduced from 1.0000000000000002e-07 to 1.0000000000000004e-08

--- Epoch 28/30 ---


Training: 100%|██████████| 13/13 [00:02<00:00,  4.84it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 12.79it/s]


Epoch 28 Summary:
  Train Loss: 0.0688 | Train Accuracy: 0.9769
  Valid Loss: 0.4107 | Valid Accuracy: 0.8675

--- Epoch 29/30 ---


Training: 100%|██████████| 13/13 [00:03<00:00,  3.94it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 15.08it/s]


Epoch 29 Summary:
  Train Loss: 0.0753 | Train Accuracy: 0.9738
  Valid Loss: 0.4082 | Valid Accuracy: 0.8650

--- Epoch 30/30 ---


Training: 100%|██████████| 13/13 [00:02<00:00,  4.77it/s]
Validation: 100%|██████████| 4/4 [00:00<00:00, 13.44it/s]

Epoch 30 Summary:
  Train Loss: 0.0731 | Train Accuracy: 0.9756
  Valid Loss: 0.4066 | Valid Accuracy: 0.8675

--- Training Complete ---
Best validation accuracy achieved: 0.8675
Best model saved to 'best_model_v2.pth'





V2 PREDICTION

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
import os
import json
from tqdm import tqdm

# --- Configuration ---
# Update these paths if they are different in your environment
TEST_IMG_DIR = '/content/hackathon_dataset/test'
MODEL_PATH = 'best_model_v2.pth' # <-- THE ONLY CHANGE NEEDED
OUTPUT_JSON_PATH = 'teamname_prediction_v2.json' # Saving to a new file

# Model and data settings (must match the training script)
IMAGE_SIZE = 32
BATCH_SIZE = 64

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# --- 1. Custom Dataset for Test Images ---
class TestDataset(Dataset):
    """Dataset for loading test images."""
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.image_files = [f for f in os.listdir(root_dir) if os.path.isfile(os.path.join(root_dir, f))]

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        filename = self.image_files[idx]
        img_path = os.path.join(self.root_dir, filename)
        image = Image.open(img_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        index = int(os.path.splitext(filename)[0])
        return image, index

# --- 2. Load Model ---
print(f"Loading model from '{MODEL_PATH}'...")
# Re-create the model architecture to match the one we trained
model = models.resnet18()
num_ftrs = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(num_ftrs, 256),
    nn.ReLU(),
    nn.Dropout(0.5), # IMPORTANT: Must match the saved model's architecture
    nn.Linear(256, 1)
)

try:
    model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
except FileNotFoundError:
    print(f"❌ ERROR: Model file not found at '{MODEL_PATH}'.")
    exit()

model = model.to(device)
model.eval() # CRITICAL: Set model to evaluation mode

# --- 3. Prepare Test Data ---
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

test_dataset = TestDataset(root_dir=TEST_IMG_DIR, transform=test_transform)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"Found {len(test_dataset)} images in the test directory.")

# --- 4. Generate Predictions ---
predictions = []
with torch.no_grad():
    for images, indices in tqdm(test_loader, desc="Predicting"):
        images = images.to(device)
        outputs = model(images)
        preds = (torch.sigmoid(outputs) > 0.5).squeeze().cpu().numpy().astype(int)
        indices = indices.cpu().numpy()

        for index, pred in zip(indices, preds):
            prediction_str = "fake" if pred == 1 else "real"
            predictions.append({"index": int(index), "prediction": prediction_str})

# --- 5. Save Output JSON ---
predictions.sort(key=lambda x: x['index'])

try:
    with open(OUTPUT_JSON_PATH, 'w') as f:
        json.dump(predictions, f, indent=4)
    print(f"\n✅ Success! Predictions saved to '{OUTPUT_JSON_PATH}'")
    print("\n--- Prediction Sample ---")
    print(json.dumps(predictions[:5], indent=4))
except Exception as e:
    print(f"❌ ERROR: Could not write JSON file. Details: {e}")


Using device: cuda
Loading model from 'best_model_v2.pth'...
Found 500 images in the test directory.


Predicting: 100%|██████████| 8/8 [00:00<00:00, 24.99it/s]


✅ Success! Predictions saved to 'teamname_prediction_v2.json'

--- Prediction Sample ---
[
    {
        "index": 1,
        "prediction": "fake"
    },
    {
        "index": 2,
        "prediction": "real"
    },
    {
        "index": 3,
        "prediction": "fake"
    },
    {
        "index": 4,
        "prediction": "fake"
    },
    {
        "index": 5,
        "prediction": "fake"
    }
]





STRATIFIED RANDOM SPLITTING /train_epoch_split.py script

In [9]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os

# --- Configuration ---
MASTER_CSV_PATH = 'master_labels.csv'
MODEL_SAVE_PATH = 'best_model_v4_epoch_split.pth'
NUM_EPOCHS = 30 # Let's run for 30 epochs
BATCH_SIZE = 64
LEARNING_RATE = 0.001
IMAGE_SIZE = 32

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- 1. Custom Dataset Definition (Same as before) ---
class DeepfakeDataset(Dataset):
    """Custom Dataset for loading images from the master CSV file."""
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = self.dataframe.iloc[idx]['image_path']
        label = int(self.dataframe.iloc[idx]['target_label'])

        try:
            image = Image.open(img_path).convert('RGB')
        except FileNotFoundError:
            print(f"Error: Image not found at {img_path}")
            return torch.zeros(3, IMAGE_SIZE, IMAGE_SIZE), -1

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label, dtype=torch.float32)

# --- 2. Data Transforms (Same as v2) ---
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
    transforms.RandomRotation(5),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    transforms.RandomErasing(p=0.5, scale=(0.02, 0.2))
])

val_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# --- 3. Load Master DataFrame (ONCE) ---
try:
    df_master = pd.read_csv(MASTER_CSV_PATH)
    print(f"Loaded master dataset with {len(df_master)} samples.")
except FileNotFoundError:
    print(f"❌ ERROR: '{MASTER_CSV_PATH}' not found. Please run the data preparation script first.")
    exit()


# --- 4. Model Definition (Same as v2) ---
model = models.resnet18(weights='IMAGENET1K_V1')
num_ftrs = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(num_ftrs, 256),
    nn.ReLU(),
    nn.Dropout(0.5), # Using our best 0.5 dropout
    nn.Linear(256, 1)
)
model = model.to(device)


# --- 5. Loss Function, Optimizer (Same as v2) ---
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4) # With weight decay
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=3, factor=0.1)


# --- 6. Training Loop (with Per-Epoch Splitting) ---
best_val_accuracy = 0.0

for epoch in range(NUM_EPOCHS):
    print(f"\n--- Epoch {epoch+1}/{NUM_EPOCHS} ---")

    # --- THIS IS YOUR STRATEGY ---
    # Re-split the data at the start of every epoch
    print(f"Creating new stratified 80/20 split for Epoch {epoch+1}...")
    train_df, val_df = train_test_split(
        df_master,
        test_size=0.2,       # 80% training, 20% validation
        random_state=epoch,  # Use epoch number as random_state to ensure a NEW split
        stratify=df_master['target_label']
    )

    # Create new Datasets and DataLoaders for this epoch
    train_dataset = DeepfakeDataset(train_df, transform=train_transform)
    val_dataset = DeepfakeDataset(val_df, transform=val_transform)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
    # --- END OF NEW LOGIC ---


    # --- Training Phase ---
    model.train()
    running_loss = 0.0
    correct_train_preds = 0
    total_train_samples = 0

    for images, labels in tqdm(train_loader, desc="Training"):
        images, labels = images.to(device), labels.to(device).unsqueeze(1)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * images.size(0)
        preds = torch.sigmoid(outputs) > 0.5
        correct_train_preds += (preds == labels).sum().item()
        total_train_samples += labels.size(0)

    train_loss = running_loss / total_train_samples
    train_accuracy = correct_train_preds / total_train_samples

    # --- Validation Phase ---
    model.eval()
    running_val_loss = 0.0
    correct_val_preds = 0
    total_val_samples = 0

    with torch.no_grad():
        for images, labels in tqdm(val_loader, desc="Validation"):
            images, labels = images.to(device), labels.to(device).unsqueeze(1)
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_val_loss += loss.item() * images.size(0)
            preds = torch.sigmoid(outputs) > 0.5
            correct_val_preds += (preds == labels).sum().item()
            total_val_samples += labels.size(0)

    val_loss = running_val_loss / total_val_samples
    val_accuracy = correct_val_preds / total_val_samples

    print(f"Epoch {epoch+1} Summary:")
    print(f"  Train Loss: {train_loss:.4f} | Train Accuracy: {train_accuracy:.4f}")
    print(f"  Valid Loss: {val_loss:.4f} | Valid Accuracy: {val_accuracy:.4f}")

    # Update learning rate scheduler
    # Note: This is now based on a "noisy" val_accuracy
    old_lr = optimizer.param_groups[0]['lr']
    scheduler.step(val_accuracy)
    new_lr = optimizer.param_groups[0]['lr']
    if new_lr < old_lr:
        print(f"Learning rate reduced from {old_lr} to {new_lr}")

    # Save the best model
    # Note: This is now saving the model that performed best
    # on its *specific* 400-image validation set.
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print(f"✅ New best model saved with (noisy) validation accuracy: {best_val_accuracy:.4f}")

print("\n--- Training Complete ---")
print(f"Highest validation accuracy achieved on a single epoch split: {best_val_accuracy:.4f}")
print(f"Best model saved to '{MODEL_SAVE_PATH}'")


Using device: cuda
Loaded master dataset with 2000 samples.

--- Epoch 1/30 ---
Creating new stratified 80/20 split for Epoch 1...


Training: 100%|██████████| 25/25 [00:03<00:00,  7.11it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 24.00it/s]


Epoch 1 Summary:
  Train Loss: 0.5439 | Train Accuracy: 0.7238
  Valid Loss: 1.1387 | Valid Accuracy: 0.6950
✅ New best model saved with (noisy) validation accuracy: 0.6950

--- Epoch 2/30 ---
Creating new stratified 80/20 split for Epoch 2...


Training: 100%|██████████| 25/25 [00:02<00:00,  8.73it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 24.84it/s]


Epoch 2 Summary:
  Train Loss: 0.4091 | Train Accuracy: 0.8256
  Valid Loss: 0.4036 | Valid Accuracy: 0.8400
✅ New best model saved with (noisy) validation accuracy: 0.8400

--- Epoch 3/30 ---
Creating new stratified 80/20 split for Epoch 3...


Training: 100%|██████████| 25/25 [00:02<00:00,  8.56it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 22.43it/s]


Epoch 3 Summary:
  Train Loss: 0.3750 | Train Accuracy: 0.8500
  Valid Loss: 0.3330 | Valid Accuracy: 0.8800
✅ New best model saved with (noisy) validation accuracy: 0.8800

--- Epoch 4/30 ---
Creating new stratified 80/20 split for Epoch 4...


Training: 100%|██████████| 25/25 [00:02<00:00,  8.67it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 16.92it/s]


Epoch 4 Summary:
  Train Loss: 0.3444 | Train Accuracy: 0.8519
  Valid Loss: 0.3760 | Valid Accuracy: 0.8575

--- Epoch 5/30 ---
Creating new stratified 80/20 split for Epoch 5...


Training: 100%|██████████| 25/25 [00:03<00:00,  7.18it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 23.49it/s]


Epoch 5 Summary:
  Train Loss: 0.2975 | Train Accuracy: 0.8894
  Valid Loss: 0.2618 | Valid Accuracy: 0.9050
✅ New best model saved with (noisy) validation accuracy: 0.9050

--- Epoch 6/30 ---
Creating new stratified 80/20 split for Epoch 6...


Training: 100%|██████████| 25/25 [00:02<00:00,  8.70it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 22.93it/s]


Epoch 6 Summary:
  Train Loss: 0.2768 | Train Accuracy: 0.8906
  Valid Loss: 0.2238 | Valid Accuracy: 0.8900

--- Epoch 7/30 ---
Creating new stratified 80/20 split for Epoch 7...


Training: 100%|██████████| 25/25 [00:02<00:00,  8.47it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 23.15it/s]


Epoch 7 Summary:
  Train Loss: 0.2426 | Train Accuracy: 0.9181
  Valid Loss: 0.4362 | Valid Accuracy: 0.8850

--- Epoch 8/30 ---
Creating new stratified 80/20 split for Epoch 8...


Training: 100%|██████████| 25/25 [00:02<00:00,  8.50it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 18.98it/s]


Epoch 8 Summary:
  Train Loss: 0.2844 | Train Accuracy: 0.8888
  Valid Loss: 0.2821 | Valid Accuracy: 0.8825

--- Epoch 9/30 ---
Creating new stratified 80/20 split for Epoch 9...


Training: 100%|██████████| 25/25 [00:03<00:00,  7.38it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 23.66it/s]


Epoch 9 Summary:
  Train Loss: 0.2371 | Train Accuracy: 0.9163
  Valid Loss: 0.2374 | Valid Accuracy: 0.9200
✅ New best model saved with (noisy) validation accuracy: 0.9200

--- Epoch 10/30 ---
Creating new stratified 80/20 split for Epoch 10...


Training: 100%|██████████| 25/25 [00:02<00:00,  8.52it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 22.38it/s]


Epoch 10 Summary:
  Train Loss: 0.2502 | Train Accuracy: 0.9038
  Valid Loss: 0.2498 | Valid Accuracy: 0.8900

--- Epoch 11/30 ---
Creating new stratified 80/20 split for Epoch 11...


Training: 100%|██████████| 25/25 [00:02<00:00,  8.77it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 22.76it/s]


Epoch 11 Summary:
  Train Loss: 0.2106 | Train Accuracy: 0.9244
  Valid Loss: 0.1829 | Valid Accuracy: 0.9425
✅ New best model saved with (noisy) validation accuracy: 0.9425

--- Epoch 12/30 ---
Creating new stratified 80/20 split for Epoch 12...


Training: 100%|██████████| 25/25 [00:03<00:00,  8.27it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 19.85it/s]


Epoch 12 Summary:
  Train Loss: 0.2077 | Train Accuracy: 0.9175
  Valid Loss: 0.1995 | Valid Accuracy: 0.9250

--- Epoch 13/30 ---
Creating new stratified 80/20 split for Epoch 13...


Training: 100%|██████████| 25/25 [00:03<00:00,  7.55it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 24.46it/s]


Epoch 13 Summary:
  Train Loss: 0.1908 | Train Accuracy: 0.9287
  Valid Loss: 0.1312 | Valid Accuracy: 0.9500
✅ New best model saved with (noisy) validation accuracy: 0.9500

--- Epoch 14/30 ---
Creating new stratified 80/20 split for Epoch 14...


Training: 100%|██████████| 25/25 [00:02<00:00,  8.85it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 23.76it/s]


Epoch 14 Summary:
  Train Loss: 0.1610 | Train Accuracy: 0.9431
  Valid Loss: 0.1942 | Valid Accuracy: 0.9275

--- Epoch 15/30 ---
Creating new stratified 80/20 split for Epoch 15...


Training: 100%|██████████| 25/25 [00:02<00:00,  8.87it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 23.34it/s]


Epoch 15 Summary:
  Train Loss: 0.1752 | Train Accuracy: 0.9331
  Valid Loss: 0.1514 | Valid Accuracy: 0.9425

--- Epoch 16/30 ---
Creating new stratified 80/20 split for Epoch 16...


Training: 100%|██████████| 25/25 [00:03<00:00,  8.33it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 18.36it/s]


Epoch 16 Summary:
  Train Loss: 0.2033 | Train Accuracy: 0.9281
  Valid Loss: 0.1782 | Valid Accuracy: 0.9350

--- Epoch 17/30 ---
Creating new stratified 80/20 split for Epoch 17...


Training: 100%|██████████| 25/25 [00:03<00:00,  7.45it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 24.21it/s]


Epoch 17 Summary:
  Train Loss: 0.1854 | Train Accuracy: 0.9356
  Valid Loss: 0.1434 | Valid Accuracy: 0.9425
Learning rate reduced from 0.001 to 0.0001

--- Epoch 18/30 ---
Creating new stratified 80/20 split for Epoch 18...


Training: 100%|██████████| 25/25 [00:02<00:00,  8.80it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 24.16it/s]


Epoch 18 Summary:
  Train Loss: 0.1228 | Train Accuracy: 0.9594
  Valid Loss: 0.0600 | Valid Accuracy: 0.9825
✅ New best model saved with (noisy) validation accuracy: 0.9825

--- Epoch 19/30 ---
Creating new stratified 80/20 split for Epoch 19...


Training: 100%|██████████| 25/25 [00:02<00:00,  8.85it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 24.41it/s]


Epoch 19 Summary:
  Train Loss: 0.1124 | Train Accuracy: 0.9575
  Valid Loss: 0.0669 | Valid Accuracy: 0.9750

--- Epoch 20/30 ---
Creating new stratified 80/20 split for Epoch 20...


Training: 100%|██████████| 25/25 [00:03<00:00,  7.94it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 16.91it/s]


Epoch 20 Summary:
  Train Loss: 0.0939 | Train Accuracy: 0.9706
  Valid Loss: 0.0510 | Valid Accuracy: 0.9850
✅ New best model saved with (noisy) validation accuracy: 0.9850

--- Epoch 21/30 ---
Creating new stratified 80/20 split for Epoch 21...


Training: 100%|██████████| 25/25 [00:03<00:00,  6.51it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 14.74it/s]


Epoch 21 Summary:
  Train Loss: 0.0904 | Train Accuracy: 0.9688
  Valid Loss: 0.0476 | Valid Accuracy: 0.9850

--- Epoch 22/30 ---
Creating new stratified 80/20 split for Epoch 22...


Training: 100%|██████████| 25/25 [00:02<00:00,  8.80it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 22.97it/s]


Epoch 22 Summary:
  Train Loss: 0.0828 | Train Accuracy: 0.9744
  Valid Loss: 0.0214 | Valid Accuracy: 0.9950
✅ New best model saved with (noisy) validation accuracy: 0.9950

--- Epoch 23/30 ---
Creating new stratified 80/20 split for Epoch 23...


Training: 100%|██████████| 25/25 [00:02<00:00,  8.92it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 23.89it/s]


Epoch 23 Summary:
  Train Loss: 0.0814 | Train Accuracy: 0.9712
  Valid Loss: 0.0400 | Valid Accuracy: 0.9875

--- Epoch 24/30 ---
Creating new stratified 80/20 split for Epoch 24...


Training: 100%|██████████| 25/25 [00:02<00:00,  8.91it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 23.66it/s]


Epoch 24 Summary:
  Train Loss: 0.0770 | Train Accuracy: 0.9731
  Valid Loss: 0.0297 | Valid Accuracy: 0.9950

--- Epoch 25/30 ---
Creating new stratified 80/20 split for Epoch 25...


Training: 100%|██████████| 25/25 [00:03<00:00,  7.56it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 18.05it/s]


Epoch 25 Summary:
  Train Loss: 0.0580 | Train Accuracy: 0.9831
  Valid Loss: 0.0199 | Valid Accuracy: 0.9925

--- Epoch 26/30 ---
Creating new stratified 80/20 split for Epoch 26...


Training: 100%|██████████| 25/25 [00:02<00:00,  8.75it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 24.88it/s]


Epoch 26 Summary:
  Train Loss: 0.0621 | Train Accuracy: 0.9750
  Valid Loss: 0.0162 | Valid Accuracy: 0.9950
Learning rate reduced from 0.0001 to 1e-05

--- Epoch 27/30 ---
Creating new stratified 80/20 split for Epoch 27...


Training: 100%|██████████| 25/25 [00:02<00:00,  8.88it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 24.10it/s]


Epoch 27 Summary:
  Train Loss: 0.0445 | Train Accuracy: 0.9844
  Valid Loss: 0.0091 | Valid Accuracy: 1.0000
✅ New best model saved with (noisy) validation accuracy: 1.0000

--- Epoch 28/30 ---
Creating new stratified 80/20 split for Epoch 28...


Training: 100%|██████████| 25/25 [00:03<00:00,  7.93it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 23.55it/s]


Epoch 28 Summary:
  Train Loss: 0.0415 | Train Accuracy: 0.9862
  Valid Loss: 0.0246 | Valid Accuracy: 0.9900

--- Epoch 29/30 ---
Creating new stratified 80/20 split for Epoch 29...


Training: 100%|██████████| 25/25 [00:03<00:00,  7.26it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 21.85it/s]


Epoch 29 Summary:
  Train Loss: 0.0493 | Train Accuracy: 0.9806
  Valid Loss: 0.0096 | Valid Accuracy: 0.9975

--- Epoch 30/30 ---
Creating new stratified 80/20 split for Epoch 30...


Training: 100%|██████████| 25/25 [00:02<00:00,  8.92it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 23.98it/s]

Epoch 30 Summary:
  Train Loss: 0.0435 | Train Accuracy: 0.9844
  Valid Loss: 0.0161 | Valid Accuracy: 0.9975

--- Training Complete ---
Highest validation accuracy achieved on a single epoch split: 1.0000
Best model saved to 'best_model_v4_epoch_split.pth'





VERIFICATION

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
import os
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# --- Configuration ---
MASTER_CSV_PATH = 'master_labels.csv'
# --- LOAD THE NEW CHAMPION MODEL ---
MODEL_PATH = 'best_model_v4_epoch_split.pth'

IMAGE_SIZE = 32
BATCH_SIZE = 64
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- 1. Load Model ---
print(f"Loading new champion model from '{MODEL_PATH}'...")
model = models.resnet18()
num_ftrs = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(num_ftrs, 256),
    nn.ReLU(),
    nn.Dropout(0.5), # Must match the saved model's architecture
    nn.Linear(256, 1)
)
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model = model.to(device)
model.eval() # Set to eval mode

# --- 2. Prepare STABLE Validation Data ---
# We will use the *exact* same 80/20 split as our original v2 test
try:
    df = pd.read_csv(MASTER_CSV_PATH)
except FileNotFoundError:
    print(f"❌ ERROR: '{MASTER_CSV_PATH}' not found.")
    exit()

# Re-create the *exact* same 80/20 split using random_state=42
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42, # This is our stable, benchmark split
    stratify=df['target_label']
)
print(f"Loaded our stable validation set of {len(val_df)} images.")

# Create the custom dataset
class ValidationDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform
    def __len__(self):
        return len(self.dataframe)
    def __getitem__(self, idx):
        img_path = self.dataframe.iloc[idx]['image_path']
        label = int(self.dataframe.iloc[idx]['target_label'])
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image, label

val_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

val_dataset = ValidationDataset(val_df, transform=val_transform)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# --- 3. Generate Predictions on the STABLE Validation Set ---
all_preds = []
all_labels = []

with torch.no_grad():
    for images, labels in tqdm(val_loader, desc="Validating Champion Model"):
        images = images.to(device)
        outputs = model(images)
        preds_numeric = (torch.sigmoid(outputs) > 0.5).squeeze().cpu().numpy().astype(int)
        labels_numeric = labels.cpu().numpy()

        if preds_numeric.ndim == 0:
            preds_numeric = [preds_numeric.item()]
            labels_numeric = [labels_numeric.item()]

        all_preds.extend(preds_numeric)
        all_labels.extend(labels_numeric)

# --- 4. Calculate Final "True" Accuracy ---
final_accuracy = accuracy_score(all_labels, all_preds)

print("\n--- FINAL VALIDATION COMPLETE ---")
print(f"Original v2 Model (Stable Accuracy): 0.9000")
print(f"New v4 Model (Stable Accuracy)   : {final_accuracy:.4f}")

if final_accuracy > 0.9000:
    print("\n✅✅✅ IT'S CONFIRMED! Your strategy worked.")
    print("The new model is officially better.")
else:
    print("\n⚠️ The original v2 model remains the champion.")


Using device: cuda
Loading new champion model from 'best_model_v4_epoch_split.pth'...
Loaded our stable validation set of 400 images.


Validating Champion Model: 100%|██████████| 7/7 [00:00<00:00, 21.13it/s]


--- FINAL VALIDATION COMPLETE ---
Original v2 Model (Stable Accuracy): 0.9000
New v4 Model (Stable Accuracy)   : 0.9925

✅✅✅ IT'S CONFIRMED! Your strategy worked.
The new model is officially better.





predict_final.py script

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
import os
import json
from tqdm import tqdm

# --- Configuration ---
TEST_IMG_DIR = '/content/hackathon_dataset/test'
# --- LOAD THE CHAMPION MODEL ---
MODEL_PATH = 'best_model_v4_epoch_split.pth'
OUTPUT_JSON_PATH = 'teamname_prediction_FINAL.json'

IMAGE_SIZE = 32
BATCH_SIZE = 64
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# --- 1. Custom Dataset for Test Images ---
class TestDataset(Dataset):
    """Dataset for loading test images."""
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.image_files = [f for f in os.listdir(root_dir) if os.path.isfile(os.path.join(root_dir, f))]

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        filename = self.image_files[idx]
        img_path = os.path.join(self.root_dir, filename)
        image = Image.open(img_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        index = int(os.path.splitext(filename)[0])
        return image, index

# --- 2. Load Model ---
print(f"Loading final champion model from '{MODEL_PATH}'...")
model = models.resnet18()
num_ftrs = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(num_ftrs, 256),
    nn.ReLU(),
    nn.Dropout(0.5), # Must match the saved model's architecture
    nn.Linear(256, 1)
)

try:
    model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
except FileNotFoundError:
    print(f"❌ ERROR: Model file not found at '{MODEL_PATH}'.")
    exit()

model = model.to(device)
model.eval() # CRITICAL: Set model to evaluation mode

# --- 3. Prepare Test Data ---
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

test_dataset = TestDataset(root_dir=TEST_IMG_DIR, transform=test_transform)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"Found {len(test_dataset)} images in the test directory.")

# --- 4. Generate Predictions ---
predictions = []
with torch.no_grad():
    for images, indices in tqdm(test_loader, desc="Generating Final Predictions"):
        images = images.to(device)
        outputs = model(images)
        preds = (torch.sigmoid(outputs) > 0.5).squeeze().cpu().numpy().astype(int)
        indices = indices.cpu().numpy()

        if preds.ndim == 0:
            preds = [preds.item()]
            indices = [indices.item()]

        for index, pred in zip(indices, preds):
            prediction_str = "fake" if pred == 1 else "real"
            predictions.append({"index": int(index), "prediction": prediction_str})

# --- 5. Save Output JSON ---
predictions.sort(key=lambda x: x['index'])

try:
    with open(OUTPUT_JSON_PATH, 'w') as f:
        json.dump(predictions, f, indent=4)
    print(f"\n✅ Success! Final submission saved to '{OUTPUT_JSON_PATH}'")
    print("\n--- Prediction Sample ---")
    print(json.dumps(predictions[:5], indent=4))
except Exception as e:
    print(f"❌ ERROR: Could not write JSON file. Details: {e}")


Using device: cuda
Loading final champion model from 'best_model_v4_epoch_split.pth'...
Found 500 images in the test directory.


Generating Final Predictions: 100%|██████████| 8/8 [00:00<00:00, 23.92it/s]


✅ Success! Final submission saved to 'teamname_prediction_FINAL.json'

--- Prediction Sample ---
[
    {
        "index": 1,
        "prediction": "fake"
    },
    {
        "index": 2,
        "prediction": "real"
    },
    {
        "index": 3,
        "prediction": "fake"
    },
    {
        "index": 4,
        "prediction": "real"
    },
    {
        "index": 5,
        "prediction": "fake"
    }
]



