In [1]:
import os
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torchvision.models import resnet18
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm import tqdm
import time


In [2]:
# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
import kagglehub
import os

# Re-download the dataset if necessary
path = kagglehub.dataset_download("wiameelhafid/aiguard-split-data")
print("Dataset downloaded to:", path)
print("Contents of dataset folder:", os.listdir(path))


Dataset downloaded to: /kaggle/input/aiguard-split-data
Contents of dataset folder: ['split_data']


In [4]:
dataset_path = "/kaggle/input/aiguard-split-data"
print("Contents of dataset folder:", os.listdir(dataset_path))

Contents of dataset folder: ['split_data']


In [5]:
import os
from torchvision import datasets

# Correct dataset path
dataset_path = "/kaggle/input/aiguard-split-data/split_data"

# Check contents to confirm
print("Contents of dataset folder:", os.listdir(dataset_path))


Contents of dataset folder: ['val', 'test', 'train']


In [6]:
# Data transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize images to 224x224
    transforms.ToTensor(),  # Convert images to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize
])

In [7]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Define a custom Dataset class for .npy files
class NumpyDataset(Dataset):
    def __init__(self, data_dir, transform=None):
        self.data_dir = data_dir
        self.transform = transform
        self.data_files = sorted(os.listdir(data_dir))  # Assuming sorted order corresponds to labels

    def __len__(self):
        return len(self.data_files)

    def __getitem__(self, idx):
        file_path = os.path.join(self.data_dir, self.data_files[idx])
        data = np.load(file_path, allow_pickle=True).item()  # Assuming saved as dict with 'image' and 'label'
        image = data['image']
        label = data['label']
        
        if self.transform:
            image = self.transform(image)
        
        return image, label

# Dataset paths
train_path = "/kaggle/input/aiguard-split-data/split_data/train"
val_path = "/kaggle/input/aiguard-split-data/split_data/val"

# Define transformations
transform = transforms.Compose([
    transforms.ToTensor(),  # Convert numpy array to PyTorch tensor
    transforms.Normalize((0.5,), (0.5,))  # Normalize to range [-1, 1]
])

# Load datasets
train_dataset = NumpyDataset(train_path, transform=transform)
val_dataset = NumpyDataset(val_path, transform=transform)

# Set batch size
batch_size = 32

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

# Print dataset details
print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of validation samples: {len(val_dataset)}")


Number of training samples: 626
Number of validation samples: 626


In [8]:
import torch
from tqdm import tqdm
import time
import matplotlib.pyplot as plt

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Model, loss function, and optimizer
model = torch.hub.load("pytorch/vision:v0.13.1", "resnet18", weights="IMAGENET1K_V1")
model.fc = torch.nn.Linear(model.fc.in_features, 10)  # Update for 10 classes (adjust based on your dataset)
model = model.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Initialize lists to store metrics
train_losses, train_accuracies = [], []
val_losses, val_accuracies = [], []

Using device: cuda


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.13.1


In [9]:
import os

train_dir = "/kaggle/input/aiguard-split-data/split_data/train"
print("Train directory contents:", os.listdir(train_dir))


Train directory contents: ['train_batch_118.npy', 'train_batch_273.npy', 'train_batch_119.npy', 'train_batch_35.npy', 'train_batch_282.npy', 'train_batch_154.npy', 'train_batch_523.npy', 'train_batch_120.npy', 'train_batch_493.npy', 'train_batch_415.npy', 'train_batch_449.npy', 'train_batch_346.npy', 'train_batch_469.npy', 'train_batch_564.npy', 'train_batch_495.npy', 'train_batch_552.npy', 'train_batch_19.npy', 'train_batch_543.npy', 'train_batch_529.npy', 'train_batch_606.npy', 'train_batch_539.npy', 'train_batch_201.npy', 'train_batch_497.npy', 'train_batch_522.npy', 'train_batch_554.npy', 'train_batch_116.npy', 'train_batch_9.npy', 'train_batch_57.npy', 'train_batch_618.npy', 'train_batch_182.npy', 'train_batch_350.npy', 'train_batch_365.npy', 'train_batch_341.npy', 'train_batch_134.npy', 'train_batch_388.npy', 'train_batch_541.npy', 'train_batch_598.npy', 'train_batch_555.npy', 'train_batch_357.npy', 'train_batch_610.npy', 'train_batch_488.npy', 'train_batch_337.npy', 'train_batch

In [10]:
import numpy as np

# File paths
images_path = '/kaggle/input/aiguard-split-data/split_data/test/test_batch_0.npy/test_images_batch_0.npy'
labels_path = '/kaggle/input/aiguard-split-data/split_data/test/test_batch_0.npy/test_labels_batch_0.npy'

# Load the data
test_images = np.load(images_path)
test_labels = np.load(labels_path)

# Check the shapes and a sample of data
print(f"Test images shape: {test_images.shape}")
print(f"Test labels shape: {test_labels.shape}")

# Optional: Print out a small sample of the data
print(f"Sample images data:\n{test_images[:5]}")
print(f"Sample labels data:\n{test_labels[:5]}")


Test images shape: (20, 224, 224, 3)
Test labels shape: (20,)
Sample images data:
[[[[0.         0.         0.        ]
   [0.         0.         0.        ]
   [0.         0.         0.        ]
   ...
   [0.         0.         0.        ]
   [0.         0.         0.        ]
   [0.         0.         0.        ]]

  [[0.         0.         0.        ]
   [0.         0.         0.        ]
   [0.         0.         0.        ]
   ...
   [0.         0.         0.        ]
   [0.         0.         0.        ]
   [0.         0.         0.        ]]

  [[0.         0.         0.        ]
   [0.         0.         0.        ]
   [0.         0.         0.        ]
   ...
   [0.         0.         0.        ]
   [0.         0.         0.        ]
   [0.         0.         0.        ]]

  ...

  [[0.00392157 0.00392157 0.        ]
   [0.00392157 0.00392157 0.        ]
   [0.00392157 0.00392157 0.00392157]
   ...
   [0.01960784 0.01960784 0.01960784]
   [0.00392157 0.00392157 0.00392157]
   

In [11]:
import os
import numpy as np

# Function to load data in smaller batches
def load_data_in_batches(folder_path, batch_size=10):
    all_images = []
    all_labels = []
    
    # List all subfolders (batches)
    subfolders = [f for f in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, f))]
    
    for subfolder in subfolders:
        batch_path = os.path.join(folder_path, subfolder)
        
        # List all image and label files in the subfolder
        image_files = [f for f in os.listdir(batch_path) if 'images_batch' in f]
        label_files = [f for f in os.listdir(batch_path) if 'labels_batch' in f]
        
        # Load the batches in chunks
        for image_file, label_file in zip(image_files, label_files):
            image_path = os.path.join(batch_path, image_file)
            label_path = os.path.join(batch_path, label_file)
            
            # Load image and label batch
            batch_images = np.load(image_path)
            batch_labels = np.load(label_path)
            
            all_images.append(batch_images)
            all_labels.append(batch_labels)
            
            # If we have loaded enough data, yield the batch and clear memory
            if len(all_images) >= batch_size:
                yield np.concatenate(all_images, axis=0), np.concatenate(all_labels, axis=0)
                all_images = []
                all_labels = []
    
    # If there are remaining images, yield them too
    if len(all_images) > 0:
        yield np.concatenate(all_images, axis=0), np.concatenate(all_labels, axis=0)

# Function to load data from train, val, and test folders
def load_all_data(train_folder, val_folder, test_folder, batch_size=10):
    train_data = load_data_in_batches(train_folder, batch_size)
    val_data = load_data_in_batches(val_folder, batch_size)
    test_data = load_data_in_batches(test_folder, batch_size)
    
    return train_data, val_data, test_data

# Define paths
train_folder = '/kaggle/input/aiguard-split-data/split_data/train'
val_folder = '/kaggle/input/aiguard-split-data/split_data/val'
test_folder = '/kaggle/input/aiguard-split-data/split_data/test'

# Load data in batches
train_data, val_data, test_data = load_all_data(train_folder, val_folder, test_folder, batch_size=10)

# Example of how to access the data in batches
for batch_images, batch_labels in train_data:
    print(f"Train batch images shape: {batch_images.shape}")
    print(f"Train batch labels shape: {batch_labels.shape}")
    break  # Only print the first batch for inspection


Train batch images shape: (640, 224, 224, 3)
Train batch labels shape: (640,)


In [12]:
num_epochs = 5  # Adjust as needed
for epoch in range(num_epochs):
    start_time = time.time()
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    
    # Training phase
    model.train()
    running_loss, correct_preds, total_preds = 0.0, 0, 0
    for images, labels in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}", ncols=100):
        images, labels = images.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct_preds += (predicted == labels).sum().item()
        total_preds += labels.size(0)
    
    train_loss = running_loss / len(train_loader)
    train_accuracy = correct_preds / total_preds * 100
    train_losses.append(train_loss)
    train_accuracies.append(train_accuracy)
    print(f"Training - Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.2f}%")
    
    # Validation phase
    model.eval()
    val_loss, correct_preds, total_preds = 0.0, 0, 0
    with torch.no_grad():
        for images, labels in tqdm(val_loader, desc="Validating", ncols=100):
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct_preds += (predicted == labels).sum().item()
            total_preds += labels.size(0)
    
    val_loss /= len(val_loader)
    val_accuracy = correct_preds / total_preds * 100
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)
    print(f"Validation - Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.2f}%")
    
    epoch_time = time.time() - start_time
    print(f"Epoch completed in {epoch_time:.2f} seconds.")




Epoch 1/5


Training Epoch 1:   0%|                                                      | 0/20 [00:00<?, ?it/s]


IsADirectoryError: Caught IsADirectoryError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/worker.py", line 349, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
            ~~~~~~~~~~~~^^^^^
  File "/tmp/ipykernel_69/4001607439.py", line 19, in __getitem__
    data = np.load(file_path, allow_pickle=True).item()  # Assuming saved as dict with 'image' and 'label'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/numpy/lib/npyio.py", line 427, in load
    fid = stack.enter_context(open(os_fspath(file), "rb"))
                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^
IsADirectoryError: [Errno 21] Is a directory: '/kaggle/input/aiguard-split-data/split_data/train/train_batch_28.npy'


In [None]:

# Plotting Loss and Accuracy
plt.figure(figsize=(12, 6))

# Loss plot
plt.subplot(1, 2, 1)
plt.plot(range(1, num_epochs + 1), train_losses, label="Train Loss", color="blue")
plt.plot(range(1, num_epochs + 1), val_losses, label="Validation Loss", color="red")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Loss vs Epochs")
plt.legend()

# Accuracy plot
plt.subplot(1, 2, 2)
plt.plot(range(1, num_epochs + 1), train_accuracies, label="Train Accuracy", color="blue")
plt.plot(range(1, num_epochs + 1), val_accuracies, label="Validation Accuracy", color="red")
plt.xlabel("Epochs")
plt.ylabel("Accuracy (%)")
plt.title("Accuracy vs Epochs")
plt.legend()

plt.tight_layout()
plt.show()