unzipping

In [3]:
import zipfile
import os

# --- 1. Set your paths ---

# This is the name of the file you downloaded from the hackathon.
# (e.g., "Synergy25_dataset.zip")
zip_file_path = '/content/drive/MyDrive/test.zip'

# This is the name of the folder where you want all the files to go.
# (e.t., "dataset/")
destination_folder = 'hackathon_dataset'

# --- 2. Create the destination folder if it doesn't exist ---
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)
    print(f"Created directory: {destination_folder}")

# --- 3. Unzip the file ---
try:
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        print(f"Unzipping '{zip_file_path}'...")
        zip_ref.extractall(destination_folder)
        print(f"Successfully unzipped all files to '{destination_folder}'")

        # Optional: List the files you unzipped
        print("\nUnzipped contents:")
        print(zip_ref.namelist())

except zipfile.BadZipFile:
    print(f"Error: The file '{zip_file_path}' is not a valid zip file or is corrupted.")
except FileNotFoundError:
    print(f"Error: The file '{zip_file_path}' was not found.")
    print("Please make sure the file is in the same directory as this script, or provide the full path.")

Unzipping '/content/drive/MyDrive/test.zip'...
Successfully unzipped all files to 'hackathon_dataset'

Unzipped contents:
['test/493.png', 'test/482.png', 'test/487.png', 'test/479.png', 'test/453.png', 'test/489.png', 'test/483.png', 'test/459.png', 'test/486.png', 'test/496.png', 'test/488.png', 'test/468.png', 'test/474.png', 'test/471.png', 'test/454.png', 'test/498.png', 'test/480.png', 'test/481.png', 'test/450.png', 'test/466.png', 'test/460.png', 'test/500.png', 'test/473.png', 'test/467.png', 'test/497.png', 'test/476.png', 'test/484.png', 'test/463.png', 'test/449.png', 'test/465.png', 'test/485.png', 'test/461.png', 'test/478.png', 'test/472.png', 'test/469.png', 'test/464.png', 'test/499.png', 'test/462.png', 'test/470.png', 'test/451.png', 'test/456.png', 'test/448.png', 'test/452.png', 'test/457.png', 'test/458.png', 'test/455.png', 'test/491.png', 'test/475.png', 'test/495.png', 'test/492.png', 'test/442.png', 'test/494.png', 'test/490.png', 'test/391.png', 'test/441.png

EDA

In [4]:
import os
import json
from PIL import Image
import collections

# --- Configuration ---
# This should be the path to the folder where you unzipped everything.
# It should contain the 5 subfolders ('real images', 'fake images', etc.)
BASE_DATA_DIR = 'hackathon_dataset'

# Define the paths to your folders and files
REAL_IMG_DIR = os.path.join(BASE_DATA_DIR, '/content/hackathon_dataset/real_cifake_images')
FAKE_IMG_DIR = os.path.join(BASE_DATA_DIR, '/content/hackathon_dataset/fake_cifake_images')
TEST_IMG_DIR = os.path.join(BASE_DATA_DIR, '/content/hackathon_dataset/test')
REAL_JSON_PATH = os.path.join(BASE_DATA_DIR, '/content/drive/MyDrive/real_cifake_preds.json', '/content/drive/MyDrive/real_cifake_preds.json') # Assuming file is named this
FAKE_JSON_PATH = os.path.join(BASE_DATA_DIR, '/content/drive/MyDrive/fake_cifake_preds.json', '/content/drive/MyDrive/fake_cifake_preds.json') # Assuming file is named this

print("--- Starting Dataset Verification ---")

# ==============================================================================
# CHECK 1: File Count Sanity Check
# ==============================================================================
print("\n[CHECK 1: File Count Sanity Check]")
try:
    num_real_images = len(os.listdir(REAL_IMG_DIR))
    num_fake_images = len(os.listdir(FAKE_IMG_DIR))
    num_test_images = len(os.listdir(TEST_IMG_DIR))

    print(f"Found {num_real_images} images in 'real images' folder.")
    print(f"Found {num_fake_images} images in 'fake images' folder.")
    print(f"Found {num_test_images} images in 'test image' folder.")

    if num_real_images == 1000 and num_fake_images == 1000:
        print("✅ STATUS: Correct number of training images found (1000 real, 1000 fake).")
    else:
        print("⚠️ WARNING: Image counts do not match the expected 1000/1000 split.")

except FileNotFoundError as e:
    print(f"❌ ERROR: A folder was not found. Please check your paths. Details: {e}")
    exit() # Stop the script if basic folders are missing

# ==============================================================================
# CHECK 2: The "Imperfect Model" Check (JSON Analysis)
# ==============================================================================
print("\n[CHECK 2: JSON Prediction Analysis]")
try:
    with open(REAL_JSON_PATH, 'r') as f:
        real_json_data = json.load(f)
    with open(FAKE_JSON_PATH, 'r') as f:
        fake_json_data = json.load(f)

    # Count predictions in the JSON for REAL images
    real_json_counts = collections.Counter(item['prediction'] for item in real_json_data)
    print("Proprietary model's predictions on REAL images:")
    print(f"  - Predicted 'real': {real_json_counts.get('real', 0)}")
    print(f"  - Predicted 'fake': {real_json_counts.get('fake', 0)}")

    # Count predictions in the JSON for FAKE images
    fake_json_counts = collections.Counter(item['prediction'] for item in fake_json_data)
    print("Proprietary model's predictions on FAKE images:")
    print(f"  - Predicted 'fake': {fake_json_counts.get('fake', 0)}")
    print(f"  - Predicted 'real': {fake_json_counts.get('real', 0)}")

    # --- The CRITICAL VERDICT ---
    if real_json_counts.get('fake', 0) == 0 and fake_json_counts.get('real', 0) == 0:
        print("✅ STATUS: The proprietary model is 'perfect' on the training set.")
        print("   Our task is a standard, balanced binary classification.")
    else:
        print("⚠️ STATUS: The proprietary model is 'imperfect'. It makes mistakes.")
        print("   This is an imbalanced/noisy-label problem. Our goal is to MIMIC THESE MISTAKES.")

except FileNotFoundError as e:
    print(f"❌ ERROR: A JSON file was not found. Please check your JSON file names and paths. Details: {e}")
    exit()
except json.JSONDecodeError:
    print("❌ ERROR: Could not parse a JSON file. It might be corrupted.")
    exit()


# ==============================================================================
# CHECK 3: Image Format & Integrity Check
# ==============================================================================
print("\n[CHECK 3: Image Integrity Check (testing a sample of 10 from each folder)]")
image_sizes = set()
image_modes = set()
corrupted_files = []

def check_images(directory, num_to_check=10):
    files = os.listdir(directory)
    for i, filename in enumerate(files):
        if i >= num_to_check:
            break
        try:
            with Image.open(os.path.join(directory, filename)) as img:
                image_sizes.add(img.size)
                image_modes.add(img.mode)
        except Exception as e:
            corrupted_files.append(os.path.join(directory, filename))

try:
    check_images(REAL_IMG_DIR)
    check_images(FAKE_IMG_DIR)

    print(f"Found image sizes: {image_sizes}")
    print(f"Found image modes (e.g., RGB, L): {image_modes}")

    if len(image_sizes) == 1:
        print("✅ STATUS: All tested images have a consistent size.")
    else:
        print("⚠️ WARNING: Images have varying sizes. We will need to resize them all.")

    if len(image_modes) == 1 and 'RGB' in image_modes:
        print("✅ STATUS: All tested images are in consistent 'RGB' mode.")
    else:
        print("⚠️ WARNING: Images have varying modes (e.g., Grayscale 'L') or are not RGB.")

    if not corrupted_files:
        print("✅ STATUS: No corrupted images found in the sample.")
    else:
        print(f"⚠️ WARNING: Found {len(corrupted_files)} corrupted images: {corrupted_files}")

except Exception as e:
    print(f"❌ ERROR: An unexpected error occurred during image check. Details: {e}")

print("\n--- Verification Complete ---")


--- Starting Dataset Verification ---

[CHECK 1: File Count Sanity Check]
Found 1000 images in 'real images' folder.
Found 1000 images in 'fake images' folder.
Found 500 images in 'test image' folder.
✅ STATUS: Correct number of training images found (1000 real, 1000 fake).

[CHECK 2: JSON Prediction Analysis]
Proprietary model's predictions on REAL images:
  - Predicted 'real': 976
  - Predicted 'fake': 24
Proprietary model's predictions on FAKE images:
  - Predicted 'fake': 988
  - Predicted 'real': 12
⚠️ STATUS: The proprietary model is 'imperfect'. It makes mistakes.
   This is an imbalanced/noisy-label problem. Our goal is to MIMIC THESE MISTAKES.

[CHECK 3: Image Integrity Check (testing a sample of 10 from each folder)]
Found image sizes: {(32, 32)}
Found image modes (e.g., RGB, L): {'RGB'}
✅ STATUS: All tested images have a consistent size.
✅ STATUS: All tested images are in consistent 'RGB' mode.
✅ STATUS: No corrupted images found in the sample.

--- Verification Complete ---

PIPELINE

In [6]:
import os
import json
import pandas as pd

# --- Configuration ---
# This should be the path to the folder where you unzipped everything.
# BASE_DATA_DIR = 'hackathon_dataset' # No longer needed with absolute paths

# Define the paths to your folders and files
REAL_IMG_DIR = '/content/hackathon_dataset/real_cifake_images'
FAKE_IMG_DIR = '/content/hackathon_dataset/fake_cifake_images'
REAL_JSON_PATH = '/content/drive/MyDrive/real_cifake_preds.json'
FAKE_JSON_PATH = '/content/drive/MyDrive/fake_cifake_preds.json'

# Output file name
OUTPUT_CSV_PATH = 'master_labels.csv'

def process_data(image_dir, json_path, data_list):
    """
    Reads a JSON file and an image directory, and populates a list with
    image paths and their corresponding target labels.
    """
    print(f"Processing data from: {os.path.basename(json_path)}")

    # --- Load the JSON prediction data ---
    try:
        with open(json_path, 'r') as f:
            predictions = json.load(f)
    except FileNotFoundError:
        print(f"❌ ERROR: JSON file not found at {json_path}. Please check the path and filename.")
        return False
    except json.JSONDecodeError:
        print(f"❌ ERROR: Could not decode JSON from {json_path}. The file might be corrupted.")
        return False

    # --- Create a dictionary for quick lookup: {index: prediction} ---
    prediction_map = {item['index']: item['prediction'] for item in predictions}

    # --- Iterate through images and create the master list ---
    image_files = os.listdir(image_dir)
    for filename in image_files:
        # Assumes image filenames are like "1.jpg", "2.png", etc.
        # We extract the number to use as the index.
        try:
            # Get the base name without extension (e.g., "1") and convert to integer
            file_index = int(os.path.splitext(filename)[0])
        except ValueError:
            print(f"⚠️ Warning: Could not parse index from filename '{filename}'. Skipping.")
            continue

        if file_index in prediction_map:
            prediction_str = prediction_map[file_index]

            # Encode labels: "real" -> 0, "fake" -> 1
            target_label = 1 if prediction_str == 'fake' else 0

            # Get the full path to the image
            image_path = os.path.join(image_dir, filename)

            data_list.append({
                'image_path': image_path,
                'target_label': target_label
            })
        else:
            print(f"⚠️ Warning: No prediction found in JSON for image index {file_index} ('{filename}').")

    return True


def main():
    """Main function to run the data preparation process."""
    print("--- Starting Step 1: Data Preparation ---")

    master_data_list = []

    # Process the "real" images and their corresponding JSON predictions
    if not process_data(REAL_IMG_DIR, REAL_JSON_PATH, master_data_list):
        return # Stop if there was an error

    # Process the "fake" images and their corresponding JSON predictions
    if not process_data(FAKE_IMG_DIR, FAKE_JSON_PATH, master_data_list):
        return # Stop if there was an error

    # --- Convert the list to a pandas DataFrame ---
    if not master_data_list:
        print("❌ ERROR: No data was processed. The master list is empty. Halting.")
        return

    df = pd.DataFrame(master_data_list)

    # --- Shuffle the DataFrame to mix real and fake samples ---
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # --- Save the final DataFrame to a CSV file ---
    try:
        df.to_csv(OUTPUT_CSV_PATH, index=False)
        print(f"\n✅ Success! Created master dataset with {len(df)} entries.")
        print(f"   Saved to '{OUTPUT_CSV_PATH}'.")

        # Display the first few rows and the class distribution
        print("\n--- Dataset Preview ---")
        print(df.head())
        print("\n--- Final Label Distribution ---")
        print(df['target_label'].value_counts())

    except Exception as e:
        print(f"❌ ERROR: Could not save the CSV file. Details: {e}")


if __name__ == '__main__':
    main()




--- Starting Step 1: Data Preparation ---
Processing data from: real_cifake_preds.json
Processing data from: fake_cifake_preds.json

✅ Success! Created master dataset with 2000 entries.
   Saved to 'master_labels.csv'.

--- Dataset Preview ---
                                          image_path  target_label
0  /content/hackathon_dataset/fake_cifake_images/...             1
1  /content/hackathon_dataset/real_cifake_images/...             0
2  /content/hackathon_dataset/fake_cifake_images/...             1
3  /content/hackathon_dataset/real_cifake_images/...             0
4  /content/hackathon_dataset/fake_cifake_images/...             1

--- Final Label Distribution ---
target_label
1    1012
0     988
Name: count, dtype: int64


In [8]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os

# --- Configuration ---
MASTER_CSV_PATH = 'master_labels.csv'
MODEL_SAVE_PATH = 'best_model.pth'
NUM_EPOCHS = 30
BATCH_SIZE = 64
LEARNING_RATE = 0.001
IMAGE_SIZE = 32 # Based on our verification step

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# --- 1. Custom Dataset Definition ---
class DeepfakeDataset(Dataset):
    """Custom Dataset for loading images from the master CSV file."""
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = self.dataframe.iloc[idx]['image_path']
        label = int(self.dataframe.iloc[idx]['target_label'])

        # Load image
        try:
            image = Image.open(img_path).convert('RGB')
        except FileNotFoundError:
            print(f"Error: Image not found at {img_path}")
            # Return a dummy image and label if file is missing
            return torch.zeros(3, IMAGE_SIZE, IMAGE_SIZE), -1

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label, dtype=torch.float32)

# --- 2. Data Transforms and Splitting ---
# Define augmentations for the training set
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
    transforms.RandomRotation(5),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Define transforms for the validation set (no augmentation)
val_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load the master CSV
try:
    df = pd.read_csv(MASTER_CSV_PATH)
except FileNotFoundError:
    print(f"❌ ERROR: '{MASTER_CSV_PATH}' not found. Please run the data preparation script first.")
    exit()

# Stratified split into training and validation sets
train_df, val_df = train_test_split(
    df,
    test_size=0.2,       # 80% training, 20% validation
    random_state=42,
    stratify=df['target_label'] # CRITICAL for maintaining label distribution
)

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

# Create Datasets and DataLoaders
train_dataset = DeepfakeDataset(train_df, transform=train_transform)
val_dataset = DeepfakeDataset(val_df, transform=val_transform)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


# --- 3. Model Definition (ResNet18) ---
model = models.resnet18(weights='IMAGENET1K_V1')

# Modify the final layer for our binary classification task
num_ftrs = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(num_ftrs, 256),
    nn.ReLU(),
    nn.Dropout(0.4),
    nn.Linear(256, 1) # Output is a single value
)

model = model.to(device)


# --- 4. Loss Function, Optimizer, Scheduler ---
criterion = nn.BCEWithLogitsLoss() # Handles the sigmoid activation internally
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=3, factor=0.1)


# --- 5. Training Loop ---
best_val_accuracy = 0.0

for epoch in range(NUM_EPOCHS):
    print(f"\n--- Epoch {epoch+1}/{NUM_EPOCHS} ---")

    # --- Training Phase ---
    model.train()
    running_loss = 0.0
    correct_train_preds = 0
    total_train_samples = 0

    for images, labels in tqdm(train_loader, desc="Training"):
        images, labels = images.to(device), labels.to(device).unsqueeze(1)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * images.size(0)

        # Calculate accuracy
        preds = torch.sigmoid(outputs) > 0.5
        correct_train_preds += (preds == labels).sum().item()
        total_train_samples += labels.size(0)

    train_loss = running_loss / total_train_samples
    train_accuracy = correct_train_preds / total_train_samples

    # --- Validation Phase ---
    model.eval()
    running_val_loss = 0.0
    correct_val_preds = 0
    total_val_samples = 0

    with torch.no_grad():
        for images, labels in tqdm(val_loader, desc="Validation"):
            images, labels = images.to(device), labels.to(device).unsqueeze(1)

            outputs = model(images)
            loss = criterion(outputs, labels)
            running_val_loss += loss.item() * images.size(0)

            preds = torch.sigmoid(outputs) > 0.5
            correct_val_preds += (preds == labels).sum().item()
            total_val_samples += labels.size(0)

    val_loss = running_val_loss / total_val_samples
    val_accuracy = correct_val_preds / total_val_samples

    print(f"Epoch {epoch+1} Summary:")
    print(f"  Train Loss: {train_loss:.4f} | Train Accuracy: {train_accuracy:.4f}")
    print(f"  Valid Loss: {val_loss:.4f} | Valid Accuracy: {val_accuracy:.4f}")

    # Announce LR change manually if it happens
    old_lr = optimizer.param_groups[0]['lr']
    scheduler.step(val_accuracy)
    new_lr = optimizer.param_groups[0]['lr']
    if new_lr < old_lr:
        print(f"Learning rate reduced from {old_lr} to {new_lr}")

    # Save the best model based on validation accuracy
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print(f"✅ New best model saved with validation accuracy: {best_val_accuracy:.4f}")

print("\n--- Training Complete ---")
print(f"Best validation accuracy achieved: {best_val_accuracy:.4f}")
print(f"Best model saved to '{MODEL_SAVE_PATH}'")



Using device: cpu
Training set size: 1600
Validation set size: 400

--- Epoch 1/30 ---


Training: 100%|██████████| 25/25 [00:22<00:00,  1.12it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.58it/s]


Epoch 1 Summary:
  Train Loss: 0.5283 | Train Accuracy: 0.7500
  Valid Loss: 0.8017 | Valid Accuracy: 0.7275
✅ New best model saved with validation accuracy: 0.7275

--- Epoch 2/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.29it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  5.46it/s]


Epoch 2 Summary:
  Train Loss: 0.3482 | Train Accuracy: 0.8600
  Valid Loss: 0.4447 | Valid Accuracy: 0.8375
✅ New best model saved with validation accuracy: 0.8375

--- Epoch 3/30 ---


Training: 100%|██████████| 25/25 [00:21<00:00,  1.17it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.46it/s]


Epoch 3 Summary:
  Train Loss: 0.3010 | Train Accuracy: 0.8900
  Valid Loss: 0.4851 | Valid Accuracy: 0.8175

--- Epoch 4/30 ---


Training: 100%|██████████| 25/25 [00:20<00:00,  1.24it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.62it/s]


Epoch 4 Summary:
  Train Loss: 0.2550 | Train Accuracy: 0.9062
  Valid Loss: 0.5031 | Valid Accuracy: 0.8625
✅ New best model saved with validation accuracy: 0.8625

--- Epoch 5/30 ---


Training: 100%|██████████| 25/25 [00:21<00:00,  1.16it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.63it/s]


Epoch 5 Summary:
  Train Loss: 0.2207 | Train Accuracy: 0.9094
  Valid Loss: 0.4531 | Valid Accuracy: 0.8500

--- Epoch 6/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.31it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.65it/s]


Epoch 6 Summary:
  Train Loss: 0.2245 | Train Accuracy: 0.9237
  Valid Loss: 0.4137 | Valid Accuracy: 0.8375

--- Epoch 7/30 ---


Training: 100%|██████████| 25/25 [00:20<00:00,  1.20it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  3.84it/s]


Epoch 7 Summary:
  Train Loss: 0.2178 | Train Accuracy: 0.9175
  Valid Loss: 0.4627 | Valid Accuracy: 0.8275

--- Epoch 8/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.28it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.48it/s]


Epoch 8 Summary:
  Train Loss: 0.1714 | Train Accuracy: 0.9375
  Valid Loss: 0.4884 | Valid Accuracy: 0.8550
Learning rate reduced from 0.001 to 0.0001

--- Epoch 9/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.29it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  3.64it/s]


Epoch 9 Summary:
  Train Loss: 0.1414 | Train Accuracy: 0.9519
  Valid Loss: 0.4771 | Valid Accuracy: 0.8450

--- Epoch 10/30 ---


Training: 100%|██████████| 25/25 [00:20<00:00,  1.21it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.63it/s]


Epoch 10 Summary:
  Train Loss: 0.0963 | Train Accuracy: 0.9681
  Valid Loss: 0.4608 | Valid Accuracy: 0.8550

--- Epoch 11/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.29it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.52it/s]


Epoch 11 Summary:
  Train Loss: 0.0762 | Train Accuracy: 0.9738
  Valid Loss: 0.4601 | Valid Accuracy: 0.8650
✅ New best model saved with validation accuracy: 0.8650

--- Epoch 12/30 ---


Training: 100%|██████████| 25/25 [00:21<00:00,  1.17it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.59it/s]


Epoch 12 Summary:
  Train Loss: 0.0747 | Train Accuracy: 0.9731
  Valid Loss: 0.4915 | Valid Accuracy: 0.8475

--- Epoch 13/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.31it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.68it/s]


Epoch 13 Summary:
  Train Loss: 0.0562 | Train Accuracy: 0.9831
  Valid Loss: 0.4965 | Valid Accuracy: 0.8550

--- Epoch 14/30 ---


Training: 100%|██████████| 25/25 [00:20<00:00,  1.22it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  3.55it/s]


Epoch 14 Summary:
  Train Loss: 0.0559 | Train Accuracy: 0.9806
  Valid Loss: 0.5094 | Valid Accuracy: 0.8625

--- Epoch 15/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.28it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.58it/s]


Epoch 15 Summary:
  Train Loss: 0.0392 | Train Accuracy: 0.9856
  Valid Loss: 0.5127 | Valid Accuracy: 0.8700
✅ New best model saved with validation accuracy: 0.8700

--- Epoch 16/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.31it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  4.29it/s]


Epoch 16 Summary:
  Train Loss: 0.0456 | Train Accuracy: 0.9844
  Valid Loss: 0.5680 | Valid Accuracy: 0.8575

--- Epoch 17/30 ---


Training: 100%|██████████| 25/25 [00:20<00:00,  1.20it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.62it/s]


Epoch 17 Summary:
  Train Loss: 0.0242 | Train Accuracy: 0.9925
  Valid Loss: 0.5692 | Valid Accuracy: 0.8500

--- Epoch 18/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.30it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.57it/s]


Epoch 18 Summary:
  Train Loss: 0.0435 | Train Accuracy: 0.9862
  Valid Loss: 0.6004 | Valid Accuracy: 0.8575

--- Epoch 19/30 ---


Training: 100%|██████████| 25/25 [00:21<00:00,  1.17it/s]
Validation:  57%|█████▋    | 4/7 [00:00<00:00,  5.82it/s]


KeyboardInterrupt: 

In [9]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
import os
import json
from tqdm import tqdm

# --- Configuration ---
# Update these paths if they are different in your environment
TEST_IMG_DIR = '/content/hackathon_dataset/test'
MODEL_PATH = 'best_model.pth'
OUTPUT_JSON_PATH = 'teamname_prediction.json' # IMPORTANT: Rename this with your team name

# Model and data settings (must match the training script)
IMAGE_SIZE = 32
BATCH_SIZE = 64 # Can be larger for inference

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# --- 1. Custom Dataset for Test Images ---
class TestDataset(Dataset):
    """Dataset for loading test images."""
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.image_files = [f for f in os.listdir(root_dir) if os.path.isfile(os.path.join(root_dir, f))]

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        filename = self.image_files[idx]
        img_path = os.path.join(self.root_dir, filename)
        image = Image.open(img_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        # Extract index from filename (e.g., "501.jpg" -> 501)
        index = int(os.path.splitext(filename)[0])
        return image, index

# --- 2. Load Model ---
print(f"Loading model from '{MODEL_PATH}'...")
# Re-create the model architecture
model = models.resnet18()
num_ftrs = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(num_ftrs, 256),
    nn.ReLU(),
    nn.Dropout(0.4),
    nn.Linear(256, 1)
)

try:
    # Load the saved weights
    model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
except FileNotFoundError:
    print(f"❌ ERROR: Model file not found at '{MODEL_PATH}'.")
    exit()

model = model.to(device)
model.eval() # CRITICAL: Set model to evaluation mode

# --- 3. Prepare Test Data ---
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

test_dataset = TestDataset(root_dir=TEST_IMG_DIR, transform=test_transform)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"Found {len(test_dataset)} images in the test directory.")

# --- 4. Generate Predictions ---
predictions = []
with torch.no_grad(): # Disable gradient calculation for speed
    for images, indices in tqdm(test_loader, desc="Predicting"):
        images = images.to(device)

        outputs = model(images)

        # Apply sigmoid and threshold at 0.5 to get final predictions
        preds = (torch.sigmoid(outputs) > 0.5).squeeze().cpu().numpy().astype(int)
        indices = indices.cpu().numpy()

        for index, pred in zip(indices, preds):
            # Decode label: 1 -> "fake", 0 -> "real"
            prediction_str = "fake" if pred == 1 else "real"
            predictions.append({"index": int(index), "prediction": prediction_str})

# --- 5. Save Output JSON ---
# Sort predictions by index for a clean, ordered output file
predictions.sort(key=lambda x: x['index'])

try:
    with open(OUTPUT_JSON_PATH, 'w') as f:
        json.dump(predictions, f, indent=4)
    print(f"\n✅ Success! Predictions saved to '{OUTPUT_JSON_PATH}'")
    # Print a sample of the output
    print("\n--- Prediction Sample ---")
    print(json.dumps(predictions[:5], indent=4))
except Exception as e:
    print(f"❌ ERROR: Could not write JSON file. Details: {e}")


Using device: cpu
Loading model from 'best_model.pth'...
Found 500 images in the test directory.


Predicting: 100%|██████████| 8/8 [00:01<00:00,  6.51it/s]


✅ Success! Predictions saved to 'teamname_prediction.json'

--- Prediction Sample ---
[
    {
        "index": 1,
        "prediction": "fake"
    },
    {
        "index": 2,
        "prediction": "real"
    },
    {
        "index": 3,
        "prediction": "fake"
    },
    {
        "index": 4,
        "prediction": "fake"
    },
    {
        "index": 5,
        "prediction": "fake"
    }
]





BETTER TRAINING

In [10]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os

# --- Configuration ---
MASTER_CSV_PATH = 'master_labels.csv'
MODEL_SAVE_PATH = 'best_model_v2.pth' # Saving to a new file to avoid overwriting the original
NUM_EPOCHS = 30
BATCH_SIZE = 64
LEARNING_RATE = 0.001
IMAGE_SIZE = 32

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# --- 1. Custom Dataset Definition ---
class DeepfakeDataset(Dataset):
    """Custom Dataset for loading images from the master CSV file."""
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = self.dataframe.iloc[idx]['image_path']
        label = int(self.dataframe.iloc[idx]['target_label'])

        try:
            image = Image.open(img_path).convert('RGB')
        except FileNotFoundError:
            print(f"Error: Image not found at {img_path}")
            return torch.zeros(3, IMAGE_SIZE, IMAGE_SIZE), -1

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label, dtype=torch.float32)

# --- 2. Data Transforms and Splitting ---
# Define augmentations for the training set with RandomErasing
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
    transforms.RandomRotation(5),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    transforms.RandomErasing(p=0.5, scale=(0.02, 0.2)) # TECHNIQUE 3: Stronger Augmentation
])

# Define transforms for the validation set (no augmentation)
val_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load the master CSV
df = pd.read_csv(MASTER_CSV_PATH)

# Stratified split into training and validation sets
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['target_label']
)

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

# Create Datasets and DataLoaders
train_dataset = DeepfakeDataset(train_df, transform=train_transform)
val_dataset = DeepfakeDataset(val_df, transform=val_transform)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


# --- 3. Model Definition (ResNet18) ---
model = models.resnet18(weights='IMAGENET1K_V1')

# Modify the final layer for our binary classification task
num_ftrs = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(num_ftrs, 256),
    nn.ReLU(),
    nn.Dropout(0.5), # TECHNIQUE 1: Increased Dropout
    nn.Linear(256, 1)
)

model = model.to(device)


# --- 4. Loss Function, Optimizer, Scheduler ---
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4) # TECHNIQUE 2: Added Weight Decay
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=3, factor=0.1)


# --- 5. Training Loop ---
best_val_accuracy = 0.0

for epoch in range(NUM_EPOCHS):
    print(f"\n--- Epoch {epoch+1}/{NUM_EPOCHS} ---")

    # --- Training Phase ---
    model.train()
    running_loss = 0.0
    correct_train_preds = 0
    total_train_samples = 0

    for images, labels in tqdm(train_loader, desc="Training"):
        images, labels = images.to(device), labels.to(device).unsqueeze(1)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * images.size(0)

        preds = torch.sigmoid(outputs) > 0.5
        correct_train_preds += (preds == labels).sum().item()
        total_train_samples += labels.size(0)

    train_loss = running_loss / total_train_samples
    train_accuracy = correct_train_preds / total_train_samples

    # --- Validation Phase ---
    model.eval()
    running_val_loss = 0.0
    correct_val_preds = 0
    total_val_samples = 0

    with torch.no_grad():
        for images, labels in tqdm(val_loader, desc="Validation"):
            images, labels = images.to(device), labels.to(device).unsqueeze(1)

            outputs = model(images)
            loss = criterion(outputs, labels)
            running_val_loss += loss.item() * images.size(0)

            preds = torch.sigmoid(outputs) > 0.5
            correct_val_preds += (preds == labels).sum().item()
            total_val_samples += labels.size(0)

    val_loss = running_val_loss / total_val_samples
    val_accuracy = correct_val_preds / total_val_samples

    print(f"Epoch {epoch+1} Summary:")
    print(f"  Train Loss: {train_loss:.4f} | Train Accuracy: {train_accuracy:.4f}")
    print(f"  Valid Loss: {val_loss:.4f} | Valid Accuracy: {val_accuracy:.4f}")

    old_lr = optimizer.param_groups[0]['lr']
    scheduler.step(val_accuracy)
    new_lr = optimizer.param_groups[0]['lr']
    if new_lr < old_lr:
        print(f"Learning rate reduced from {old_lr} to {new_lr}")

    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print(f"✅ New best model saved with validation accuracy: {best_val_accuracy:.4f}")

print("\n--- Training Complete ---")
print(f"Best validation accuracy achieved: {best_val_accuracy:.4f}")
print(f"Best model saved to '{MODEL_SAVE_PATH}'")


Using device: cpu
Training set size: 1600
Validation set size: 400

--- Epoch 1/30 ---


Training: 100%|██████████| 25/25 [00:28<00:00,  1.14s/it]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.26it/s]


Epoch 1 Summary:
  Train Loss: 0.5804 | Train Accuracy: 0.7150
  Valid Loss: 0.9034 | Valid Accuracy: 0.6750
✅ New best model saved with validation accuracy: 0.6750

--- Epoch 2/30 ---


Training: 100%|██████████| 25/25 [00:21<00:00,  1.16it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  4.84it/s]


Epoch 2 Summary:
  Train Loss: 0.4136 | Train Accuracy: 0.8313
  Valid Loss: 0.4148 | Valid Accuracy: 0.8525
✅ New best model saved with validation accuracy: 0.8525

--- Epoch 3/30 ---


Training: 100%|██████████| 25/25 [00:20<00:00,  1.20it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.41it/s]


Epoch 3 Summary:
  Train Loss: 0.3411 | Train Accuracy: 0.8694
  Valid Loss: 0.4791 | Valid Accuracy: 0.8100

--- Epoch 4/30 ---


Training: 100%|██████████| 25/25 [00:21<00:00,  1.18it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  4.51it/s]


Epoch 4 Summary:
  Train Loss: 0.2832 | Train Accuracy: 0.8906
  Valid Loss: 0.4779 | Valid Accuracy: 0.8325

--- Epoch 5/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.28it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.22it/s]


Epoch 5 Summary:
  Train Loss: 0.3115 | Train Accuracy: 0.8675
  Valid Loss: 0.5542 | Valid Accuracy: 0.8025

--- Epoch 6/30 ---


Training: 100%|██████████| 25/25 [00:20<00:00,  1.25it/s]
Validation: 100%|██████████| 7/7 [00:02<00:00,  3.29it/s]


Epoch 6 Summary:
  Train Loss: 0.2539 | Train Accuracy: 0.8962
  Valid Loss: 0.5614 | Valid Accuracy: 0.7925
Learning rate reduced from 0.001 to 0.0001

--- Epoch 7/30 ---


Training: 100%|██████████| 25/25 [00:20<00:00,  1.24it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.21it/s]


Epoch 7 Summary:
  Train Loss: 0.2314 | Train Accuracy: 0.9175
  Valid Loss: 0.4048 | Valid Accuracy: 0.8625
✅ New best model saved with validation accuracy: 0.8625

--- Epoch 8/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.27it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  4.77it/s]


Epoch 8 Summary:
  Train Loss: 0.1745 | Train Accuracy: 0.9387
  Valid Loss: 0.4040 | Valid Accuracy: 0.8700
✅ New best model saved with validation accuracy: 0.8700

--- Epoch 9/30 ---


Training: 100%|██████████| 25/25 [00:21<00:00,  1.16it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.22it/s]


Epoch 9 Summary:
  Train Loss: 0.1593 | Train Accuracy: 0.9513
  Valid Loss: 0.4073 | Valid Accuracy: 0.8850
✅ New best model saved with validation accuracy: 0.8850

--- Epoch 10/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.29it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.25it/s]


Epoch 10 Summary:
  Train Loss: 0.1430 | Train Accuracy: 0.9487
  Valid Loss: 0.4002 | Valid Accuracy: 0.8825

--- Epoch 11/30 ---


Training: 100%|██████████| 25/25 [00:21<00:00,  1.16it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.12it/s]


Epoch 11 Summary:
  Train Loss: 0.1322 | Train Accuracy: 0.9525
  Valid Loss: 0.4063 | Valid Accuracy: 0.8775

--- Epoch 12/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.29it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.18it/s]


Epoch 12 Summary:
  Train Loss: 0.1388 | Train Accuracy: 0.9481
  Valid Loss: 0.4103 | Valid Accuracy: 0.8925
✅ New best model saved with validation accuracy: 0.8925

--- Epoch 13/30 ---


Training: 100%|██████████| 25/25 [00:21<00:00,  1.18it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  4.34it/s]


Epoch 13 Summary:
  Train Loss: 0.1155 | Train Accuracy: 0.9600
  Valid Loss: 0.4096 | Valid Accuracy: 0.8850

--- Epoch 14/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.29it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.25it/s]


Epoch 14 Summary:
  Train Loss: 0.1171 | Train Accuracy: 0.9650
  Valid Loss: 0.4021 | Valid Accuracy: 0.8975
✅ New best model saved with validation accuracy: 0.8975

--- Epoch 15/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.26it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  3.61it/s]


Epoch 15 Summary:
  Train Loss: 0.1235 | Train Accuracy: 0.9563
  Valid Loss: 0.4355 | Valid Accuracy: 0.8775

--- Epoch 16/30 ---


Training: 100%|██████████| 25/25 [00:20<00:00,  1.20it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.26it/s]


Epoch 16 Summary:
  Train Loss: 0.1118 | Train Accuracy: 0.9600
  Valid Loss: 0.4416 | Valid Accuracy: 0.8850

--- Epoch 17/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.28it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  5.25it/s]


Epoch 17 Summary:
  Train Loss: 0.1026 | Train Accuracy: 0.9675
  Valid Loss: 0.4494 | Valid Accuracy: 0.8825

--- Epoch 18/30 ---


Training: 100%|██████████| 25/25 [00:21<00:00,  1.17it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.39it/s]


Epoch 18 Summary:
  Train Loss: 0.0925 | Train Accuracy: 0.9694
  Valid Loss: 0.4626 | Valid Accuracy: 0.8750
Learning rate reduced from 0.0001 to 1e-05

--- Epoch 19/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.29it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.19it/s]


Epoch 19 Summary:
  Train Loss: 0.0907 | Train Accuracy: 0.9644
  Valid Loss: 0.4395 | Valid Accuracy: 0.8775

--- Epoch 20/30 ---


Training: 100%|██████████| 25/25 [00:21<00:00,  1.14it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.37it/s]


Epoch 20 Summary:
  Train Loss: 0.0921 | Train Accuracy: 0.9688
  Valid Loss: 0.4508 | Valid Accuracy: 0.8775

--- Epoch 21/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.27it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.31it/s]


Epoch 21 Summary:
  Train Loss: 0.0731 | Train Accuracy: 0.9756
  Valid Loss: 0.4440 | Valid Accuracy: 0.8800

--- Epoch 22/30 ---


Training: 100%|██████████| 25/25 [00:21<00:00,  1.17it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  4.47it/s]


Epoch 22 Summary:
  Train Loss: 0.0798 | Train Accuracy: 0.9694
  Valid Loss: 0.4484 | Valid Accuracy: 0.8825
Learning rate reduced from 1e-05 to 1.0000000000000002e-06

--- Epoch 23/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.28it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.30it/s]


Epoch 23 Summary:
  Train Loss: 0.0864 | Train Accuracy: 0.9706
  Valid Loss: 0.4544 | Valid Accuracy: 0.8850

--- Epoch 24/30 ---


Training: 100%|██████████| 25/25 [00:20<00:00,  1.23it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  3.61it/s]


Epoch 24 Summary:
  Train Loss: 0.0924 | Train Accuracy: 0.9637
  Valid Loss: 0.4461 | Valid Accuracy: 0.8850

--- Epoch 25/30 ---


Training: 100%|██████████| 25/25 [00:20<00:00,  1.23it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.24it/s]


Epoch 25 Summary:
  Train Loss: 0.0841 | Train Accuracy: 0.9731
  Valid Loss: 0.4449 | Valid Accuracy: 0.8850

--- Epoch 26/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.28it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  4.56it/s]


Epoch 26 Summary:
  Train Loss: 0.0921 | Train Accuracy: 0.9681
  Valid Loss: 0.4548 | Valid Accuracy: 0.8850
Learning rate reduced from 1.0000000000000002e-06 to 1.0000000000000002e-07

--- Epoch 27/30 ---


Training: 100%|██████████| 25/25 [00:21<00:00,  1.17it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.24it/s]


Epoch 27 Summary:
  Train Loss: 0.0971 | Train Accuracy: 0.9613
  Valid Loss: 0.4511 | Valid Accuracy: 0.8825

--- Epoch 28/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.26it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.27it/s]


Epoch 28 Summary:
  Train Loss: 0.0736 | Train Accuracy: 0.9762
  Valid Loss: 0.4528 | Valid Accuracy: 0.8825

--- Epoch 29/30 ---


Training: 100%|██████████| 25/25 [00:22<00:00,  1.13it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.21it/s]


Epoch 29 Summary:
  Train Loss: 0.0821 | Train Accuracy: 0.9712
  Valid Loss: 0.4503 | Valid Accuracy: 0.8850

--- Epoch 30/30 ---


Training: 100%|██████████| 25/25 [00:19<00:00,  1.27it/s]
Validation: 100%|██████████| 7/7 [00:01<00:00,  6.41it/s]

Epoch 30 Summary:
  Train Loss: 0.0786 | Train Accuracy: 0.9744
  Valid Loss: 0.4484 | Valid Accuracy: 0.8875
Learning rate reduced from 1.0000000000000002e-07 to 1.0000000000000004e-08

--- Training Complete ---
Best validation accuracy achieved: 0.8975
Best model saved to 'best_model_v2.pth'





In [11]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
import os
import json
from tqdm import tqdm

# --- Configuration ---
# Update these paths if they are different in your environment
TEST_IMG_DIR = '/content/hackathon_dataset/test'
MODEL_PATH = 'best_model_v2.pth' # <-- THE ONLY CHANGE NEEDED
OUTPUT_JSON_PATH = 'teamname_prediction_v2.json' # Saving to a new file

# Model and data settings (must match the training script)
IMAGE_SIZE = 32
BATCH_SIZE = 64

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# --- 1. Custom Dataset for Test Images ---
class TestDataset(Dataset):
    """Dataset for loading test images."""
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.image_files = [f for f in os.listdir(root_dir) if os.path.isfile(os.path.join(root_dir, f))]

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        filename = self.image_files[idx]
        img_path = os.path.join(self.root_dir, filename)
        image = Image.open(img_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        index = int(os.path.splitext(filename)[0])
        return image, index

# --- 2. Load Model ---
print(f"Loading model from '{MODEL_PATH}'...")
# Re-create the model architecture to match the one we trained
model = models.resnet18()
num_ftrs = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(num_ftrs, 256),
    nn.ReLU(),
    nn.Dropout(0.5), # IMPORTANT: Must match the saved model's architecture
    nn.Linear(256, 1)
)

try:
    model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
except FileNotFoundError:
    print(f"❌ ERROR: Model file not found at '{MODEL_PATH}'.")
    exit()

model = model.to(device)
model.eval() # CRITICAL: Set model to evaluation mode

# --- 3. Prepare Test Data ---
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

test_dataset = TestDataset(root_dir=TEST_IMG_DIR, transform=test_transform)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"Found {len(test_dataset)} images in the test directory.")

# --- 4. Generate Predictions ---
predictions = []
with torch.no_grad():
    for images, indices in tqdm(test_loader, desc="Predicting"):
        images = images.to(device)
        outputs = model(images)
        preds = (torch.sigmoid(outputs) > 0.5).squeeze().cpu().numpy().astype(int)
        indices = indices.cpu().numpy()

        for index, pred in zip(indices, preds):
            prediction_str = "fake" if pred == 1 else "real"
            predictions.append({"index": int(index), "prediction": prediction_str})

# --- 5. Save Output JSON ---
predictions.sort(key=lambda x: x['index'])

try:
    with open(OUTPUT_JSON_PATH, 'w') as f:
        json.dump(predictions, f, indent=4)
    print(f"\n✅ Success! Predictions saved to '{OUTPUT_JSON_PATH}'")
    print("\n--- Prediction Sample ---")
    print(json.dumps(predictions[:5], indent=4))
except Exception as e:
    print(f"❌ ERROR: Could not write JSON file. Details: {e}")


Using device: cpu
Loading model from 'best_model_v2.pth'...
Found 500 images in the test directory.


Predicting: 100%|██████████| 8/8 [00:02<00:00,  3.61it/s]


✅ Success! Predictions saved to 'teamname_prediction_v2.json'

--- Prediction Sample ---
[
    {
        "index": 1,
        "prediction": "fake"
    },
    {
        "index": 2,
        "prediction": "real"
    },
    {
        "index": 3,
        "prediction": "fake"
    },
    {
        "index": 4,
        "prediction": "real"
    },
    {
        "index": 5,
        "prediction": "fake"
    }
]



