In [1]:
import boto3
from smart_open import open
import zipfile
import io
import os
import glob
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import models, transforms, datasets
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
from PIL import Image
from torch.utils.data import Dataset
from collections import defaultdict
from torchvision import transforms
from torchvision.models import mobilenet_v2, MobileNet_V2_Weights

In [4]:
# Create a boto3 S3 client.
# NOTE(review): `s3` is not referenced anywhere else in the visible notebook; the
# archive is opened through smart_open's `open` with an s3:// URL instead. Confirm
# whether this client is still needed.
s3 = boto3.client('s3')

# Location of the StairNet archive: s3://<bucket_name>/<object_key>
bucket_name = 'ieee-dataport'
object_key = 'data/7540/StairNet.zip'

# Local directory ("~" expanded to the current user's home) that receives the
# extracted image files.
local_image_dir = os.path.expanduser('~/SageMaker/data/images')

def data_exists(local_dir):
    """Return True when `local_dir` contains at least one non-hidden entry.

    The check is a plain `*` glob, so any file or sub-directory counts (not only
    images) and dot-files are ignored. A missing directory yields False.
    """
    pattern = os.path.join(local_dir, '*')
    return len(glob.glob(pattern)) > 0

def stream_and_extract_images(bucket, key, local_dir):
    """Stream a zip archive from S3 and extract its image members into `local_dir`.

    Args:
        bucket: Name of the S3 bucket holding the archive.
        key: Object key of the zip archive inside the bucket.
        local_dir: Existing directory that receives the images. Archive
            sub-directories are flattened: only each member's base name is kept.

    Members whose name ends in .png/.jpg/.jpeg (case-insensitive) are extracted.
    Files that already exist locally are skipped. A failure on one member is
    reported and extraction continues with the next one (best effort).
    """
    buffer_size = 10 * 1024 * 1024  # 10 MB buffer size for reading chunks
    image_suffixes = ('.png', '.jpg', '.jpeg')

    # `open` is whatever is in scope in the notebook (smart_open's `open`), which
    # understands s3:// URLs as well as local paths.
    with open(f's3://{bucket}/{key}', 'rb') as s3_file:
        # BufferedReader with a buffer size
        with zipfile.ZipFile(io.BufferedReader(s3_file, buffer_size=buffer_size)) as z:
            for file_info in z.infolist():
                if not file_info.filename.lower().endswith(image_suffixes):
                    continue

                img_save_path = os.path.join(local_dir, os.path.basename(file_info.filename))
                # Decide before touching the archive member, so already-extracted
                # files cost nothing.
                if os.path.exists(img_save_path):
                    continue

                # Write to a temporary name and rename on success: an interrupted
                # write must not leave a truncated image that later runs would
                # treat as "already extracted".
                partial_path = img_save_path + '.part'
                try:
                    print(f"Extracting {file_info.filename}...")
                    with z.open(file_info) as image_file, open(partial_path, 'wb') as f:
                        f.write(image_file.read())
                    os.replace(partial_path, img_save_path)
                except Exception as e:
                    print(f"Failed to extract {file_info.filename}: {e}")
                    if os.path.exists(partial_path):
                        os.remove(partial_path)

# Make sure the target directory is there before probing it
os.makedirs(local_image_dir, exist_ok=True)

# Download and extract only when the directory is still empty
if not data_exists(local_image_dir):
    print(f"Data does not exist in {local_image_dir}. Downloading and extracting...")
    stream_and_extract_images(bucket_name, object_key, local_image_dir)
else:
    print(f"Data already exists in {local_image_dir}. Skipping download and extraction.")

print(f"Data check complete. Images are available in {local_image_dir}")


Data already exists in /home/ec2-user/SageMaker/data/images. Skipping download and extraction.
Data check complete. Images are available in /home/ec2-user/SageMaker/data/images


In [5]:
# Directory holding the extracted images
image_directory = '/home/ec2-user/SageMaker/data/images'

# One tally per filename suffix; the suffix (with its leading space) encodes the class
suffix_counts = {' LG.jpg': 0, ' LGIS.jpg': 0, ' IS.jpg': 0, ' ISLG.jpg': 0}

# Walk the directory once and credit the first suffix that matches each file
for filename in os.listdir(image_directory):
    for suffix in suffix_counts:
        if filename.endswith(suffix):
            suffix_counts[suffix] += 1
            break

# Expose the tallies under the individual names used by this cell
lg_count = suffix_counts[' LG.jpg']
lg_is_count = suffix_counts[' LGIS.jpg']
is_count = suffix_counts[' IS.jpg']
is_lg_count = suffix_counts[' ISLG.jpg']

# Report
print(f"Number of images ending with 'LG': {lg_count}")
print(f"Number of images ending with 'LG-IS': {lg_is_count}")
print(f"Number of images ending with 'IS': {is_count}")
print(f"Number of images ending with 'IS-LG': {is_lg_count}")


Number of images ending with 'LG': 442360
Number of images ending with 'LG-IS': 15888
Number of images ending with 'IS': 48179
Number of images ending with 'IS-LG': 9025


In [2]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Seed every RNG the notebook draws from. The train/val/test splits are shuffled
# with the stdlib `random` module, so seeding only torch and numpy would leave
# the splits different on every run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

Using device: cpu


## Step 2:

- First, let's define the dataset structure 

dataset_structure = {
    'IS': 48179,
    'ISLG': 9025,
    'LG': 442360,
    'LGIS': 15888
}

total_samples = sum(dataset_structure.values())
print(f"Total samples: {total_samples}")

- Now, let's create functions to split the data according to the given percentages:

In [None]:
# def split_dataset(dataset_structure, train_pct=0.895, val_pct=0.035, test_pct=0.07):
#     train_set, val_set, test_set = {}, {}, {}
    
#     for class_name, total_samples in dataset_structure.items():
#         train_samples = int(total_samples * train_pct)
#         val_samples = int(total_samples * val_pct)
#         test_samples = total_samples - train_samples - val_samples
        
#         train_set[class_name] = train_samples
#         val_set[class_name] = val_samples
#         test_set[class_name] = test_samples
    
#     return train_set, val_set, test_set

# train_set, val_set, test_set = split_dataset(dataset_structure)

# print("Training set:", train_set)
# print("Validation set:", val_set)
# print("Test set:", test_set)

In [3]:
def split_data(file_list, train_pct=0.895, val_pct=0.035, test_pct=0.07):
    """Randomly split `file_list` into train, validation and test lists.

    Args:
        file_list: Sequence of items to split. It is left untouched; the
            shuffle is done on a copy.
        train_pct: Fraction of the items assigned to the training split.
        val_pct: Fraction of the items assigned to the validation split.
        test_pct: Kept for interface compatibility only. The test split always
            receives whatever remains after the train and validation splits.

    Returns:
        A `(train, val, test)` tuple of lists. Split sizes are truncated
        towards zero, so rounding leftovers end up in the test split.
    """
    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(file_list)
    random.shuffle(shuffled)

    total = len(shuffled)
    train_end = int(total * train_pct)
    val_end = train_end + int(total * val_pct)

    return shuffled[:train_end], shuffled[train_end:val_end], shuffled[val_end:]

- For loading the data, we'll need to create a custom dataset class

In [4]:
class StairNetDataset(Dataset):
    """Image dataset whose class label is encoded at the end of each filename.

    The last whitespace-separated token of the filename, minus its extension,
    is the label (`IS`, `ISLG`, `LG` or `LGIS`). A label split over the last
    two tokens (e.g. `... LG IS.jpg`) is joined without a space into `LGIS`.
    """

    def __init__(self, root_dir, file_list, transform=None):
        """
        Args:
            root_dir: Directory that contains the image files.
            file_list: Filenames (relative to `root_dir`) that make up this split.
            transform: Optional callable applied to the loaded RGB PIL image.
        """
        self.root_dir = root_dir
        self.file_list = file_list
        self.transform = transform
        self.class_to_idx = {'IS': 0, 'ISLG': 1, 'LG': 2, 'LGIS': 3}

    def __len__(self):
        """Number of files in this split."""
        return len(self.file_list)

    @staticmethod
    def _label_from_filename(img_name):
        """Return the class label encoded at the end of `img_name`."""
        parts = img_name.split()
        label = parts[-1].split('.')[0]  # Last token without the file extension

        # A label written as two tokens ('LG IS') must be joined WITHOUT a space:
        # the keys of class_to_idx are 'LGIS' / 'ISLG', so a space-joined label
        # could never be looked up.
        if len(parts) > 2 and parts[-2] in ['LG', 'IS']:
            label = f"{parts[-2]}{label}"
        return label

    def __getitem__(self, idx):
        """Load item `idx` and return `(image, label_idx)`.

        Raises:
            KeyError: If the label parsed from the filename is not a known class.
        """
        img_name = self.file_list[idx]
        img_path = os.path.join(self.root_dir, img_name)

        # convert() returns a fully loaded copy, so the file can be closed right away
        # instead of staying open until garbage collection.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')

        label_idx = self.class_to_idx[self._label_from_filename(img_name)]

        if self.transform:
            image = self.transform(image)

        return image, label_idx

- Create a stratified train/validation/test split (each class is split with the same proportions)
- Set up the data loaders

In [13]:
def split_data_stratified(file_list, train_pct=0.895, val_pct=0.035, test_pct=0.07):
    """Split filenames into train/val/test lists with the same ratios inside every class.

    The class of a file is the last whitespace-separated token of its name,
    minus the extension. A label split over the last two tokens
    (`... LG IS.jpg`) is joined without a space, so it lands in the same
    stratum as `... LGIS.jpg`.

    Args:
        file_list: Iterable of filenames. It is not modified.
        train_pct: Per-class fraction assigned to the training split.
        val_pct: Per-class fraction assigned to the validation split.
        test_pct: Kept for interface compatibility only. Each class's test
            share is whatever remains after its train and validation shares.

    Returns:
        A `(train_files, val_files, test_files)` tuple of shuffled lists.
    """
    # Group files by class
    class_files = defaultdict(list)
    for file in file_list:
        parts = file.split()
        label = parts[-1].split('.')[0]
        if len(parts) > 2 and parts[-2] in ['LG', 'IS']:
            # No space: must produce 'LGIS' / 'ISLG', the spelling used for class names.
            label = f"{parts[-2]}{label}"
        class_files[label].append(file)

    train_files, val_files, test_files = [], [], []

    for files in class_files.values():
        random.shuffle(files)
        n_files = len(files)
        n_train = int(n_files * train_pct)
        n_val = int(n_files * val_pct)

        train_files.extend(files[:n_train])
        val_files.extend(files[n_train:n_train + n_val])
        test_files.extend(files[n_train + n_val:])

    # The lists were built class by class; shuffle so classes are interleaved.
    random.shuffle(train_files)
    random.shuffle(val_files)
    random.shuffle(test_files)

    return train_files, val_files, test_files

# Define transforms
# The backbone is initialised from ImageNet weights and its first feature layers are
# frozen, so inputs are normalised with the ImageNet channel statistics those weights
# were trained with rather than a generic 0.5/0.5.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Training: random crop + horizontal flip as augmentation
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
])

# Validation / test: deterministic resize + centre crop
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
])

# Set up data
# Resolve the image directory relative to the current user's home instead of
# hard-coding /home/ec2-user.
root_dir = os.path.expanduser('~/SageMaker/data/images')
all_files = os.listdir(root_dir)
# os.listdir returns entries in arbitrary order; sort so that a seeded `random`
# produces the same split on every run and on every machine.
image_files = sorted(f for f in all_files if f.endswith('.jpg'))

# Split data
train_files, val_files, test_files = split_data_stratified(image_files)

# Create datasets (augmentation only on the training split)
train_dataset = StairNetDataset(root_dir, train_files, transform=train_transform)
val_dataset = StairNetDataset(root_dir, val_files, transform=val_transform)
test_dataset = StairNetDataset(root_dir, test_files, transform=val_transform)

# Create data loaders
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of validation samples: {len(val_dataset)}")
print(f"Number of test samples: {len(test_dataset)}")

Number of training samples: 461328
Number of validation samples: 18039
Number of test samples: 36085


- Check the class distribution of each split (caution: this iterates over every sample and loads every image, so it takes a very long time; run it only if you have plenty of time available)

In [None]:
## Print class distribution in each split

# def print_class_distribution(dataset):
#     class_counts = defaultdict(int)
#     for _, label in dataset:
#         class_counts[label] += 1
#     total = sum(class_counts.values())
#     for label, count in class_counts.items():
#         print(f"Class {label}: {count} ({count/total:.2%})")

# print("\nTraining set class distribution:")
# print_class_distribution(train_dataset)
# print("\nValidation set class distribution:")
# print_class_distribution(val_dataset)
# print("\nTest set class distribution:")
# print_class_distribution(test_dataset)

In [14]:
class CustomMobileNetV2(nn.Module):
    """ImageNet-pretrained MobileNetV2 with a replaced classification head.

    The parameters of the first five entries of `features` are frozen, and the
    stock classifier is swapped for Dropout followed by a Linear layer with
    `num_classes` outputs.
    """

    def __init__(self, num_classes=4, dropout_rate=0.2):
        super().__init__()

        # Start from the pretrained network
        backbone = mobilenet_v2(weights=MobileNet_V2_Weights.IMAGENET1K_V1)

        # Exclude the earliest feature layers from gradient updates
        for frozen_param in backbone.features[:5].parameters():
            frozen_param.requires_grad = False

        # New head: keep the original classifier's input width, emit `num_classes` logits
        head_inputs = backbone.classifier[1].in_features
        backbone.classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(head_inputs, num_classes),
        )

        self.model = backbone

    def forward(self, x):
        """Run `x` through the wrapped network and return its output."""
        return self.model(x)

In [None]:
# !pip install torchsummary

In [15]:
# Instantiate the 4-class network
model = CustomMobileNetV2(num_classes=4, dropout_rate=0.2)

# Layer-by-layer summary for a single 3x224x224 input (requires torchsummary)
from torchsummary import summary
summary(model, (3, 224, 224), device='cpu')

# Count only the parameters that will actually receive gradient updates
trainable_params = sum(
    param.numel() for param in filter(lambda p: p.requires_grad, model.parameters())
)
print(f"Number of trainable parameters: {trainable_params}")

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 32, 112, 112]             864
       BatchNorm2d-2         [-1, 32, 112, 112]              64
             ReLU6-3         [-1, 32, 112, 112]               0
            Conv2d-4         [-1, 32, 112, 112]             288
       BatchNorm2d-5         [-1, 32, 112, 112]              64
             ReLU6-6         [-1, 32, 112, 112]               0
            Conv2d-7         [-1, 16, 112, 112]             512
       BatchNorm2d-8         [-1, 16, 112, 112]              32
  InvertedResidual-9         [-1, 16, 112, 112]               0
           Conv2d-10         [-1, 96, 112, 112]           1,536
      BatchNorm2d-11         [-1, 96, 112, 112]             192
            ReLU6-12         [-1, 96, 112, 112]               0
           Conv2d-13           [-1, 96, 56, 56]             864
      BatchNorm2d-14           [-1, 96,

In [18]:
# Multi-class classification loss and Adam over the model's parameters
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Optional learning-rate schedule (disabled): multiply the LR by 0.1 every 5 epochs.
# Note: StepLR.step() takes no metric argument; a scheduler that is stepped with the
# validation loss has to be ReduceLROnPlateau instead.
# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

In [None]:
def _run_epoch(model, batches, criterion, device, optimizer=None):
    """Run one pass over `batches`; update weights only when `optimizer` is given.

    Returns `(mean_loss, accuracy)` averaged over the samples actually seen, or
    `(0.0, 0.0)` when `batches` yields nothing.
    """
    loss_sum = 0.0
    correct = 0
    total = 0

    for inputs, labels in batches:
        inputs, labels = inputs.to(device), labels.to(device)

        if optimizer is not None:
            optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        if optimizer is not None:
            loss.backward()
            optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # epoch figure is a true per-sample mean even if the last batch is smaller.
        loss_sum += loss.item() * inputs.size(0)
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    if total == 0:
        return 0.0, 0.0
    return loss_sum / total, correct / total


def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, device='cpu',
                save_path='best_model.pth', scheduler=None):
    """Train `model`, validate after every epoch and checkpoint the best weights.

    Args:
        model: Network to train; moved to `device` in place.
        train_loader: Iterable of `(inputs, labels)` training batches.
        val_loader: Iterable of `(inputs, labels)` validation batches.
        criterion: Loss callable `criterion(outputs, labels)` returning a batch-mean loss.
        optimizer: Optimizer already bound to the model's parameters.
        num_epochs: Number of passes over `train_loader`.
        device: Device the model and batches are moved to.
        save_path: Where the `state_dict` of the best model (highest validation
            accuracy so far) is written.
        scheduler: Optional learning-rate scheduler, stepped once per epoch after
            validation. `ReduceLROnPlateau` is stepped with the validation loss,
            any other scheduler without arguments.

    Returns:
        None. Progress is printed and the best checkpoint is written to `save_path`.
    """
    model.to(device)
    best_val_accuracy = 0.0

    for epoch in range(num_epochs):
        model.train()
        train_loss, train_accuracy = _run_epoch(
            model,
            tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'),
            criterion,
            device,
            optimizer,
        )

        # Validation: eval mode and no autograd bookkeeping
        model.eval()
        with torch.no_grad():
            val_loss, val_accuracy = _run_epoch(model, val_loader, criterion, device)

        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')
        print(f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')

        if scheduler is not None:
            # Only ReduceLROnPlateau consumes a metric; the others just count epochs.
            if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                scheduler.step(val_loss)
            else:
                scheduler.step()

        # Save the best model
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), save_path)

    print(f'Best validation accuracy: {best_val_accuracy:.4f}')

# Train on the device detected earlier; without `device=` the function falls back
# to its 'cpu' default even when a GPU is available.
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, device=device)

Epoch 1/10:   4%|▍         | 151/3605 [23:12<8:46:30,  9.15s/it]