In [7]:
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker.session import Session
import boto3

# Pin the region explicitly so the training job does not depend on whatever
# default region the local AWS configuration happens to have.
boto_sess = boto3.Session(region_name="us-east-1")
sm_sess = Session(boto_session=boto_sess)

# Keep the `sess` name available, but point it at the same region-pinned
# session instead of building a second, default-configured sagemaker.Session()
# (which requires a default region in the local AWS config and could silently
# target a different region than the estimator below).
sess = sm_sess
bucket_name = "kulit-ai-dataset-real-fiki-2025"
role = "arn:aws:iam::564415061686:role/service-role/AmazonSageMakerAdminIAMExecutionRole"

print(f"Target Bucket: {bucket_name}")

# Hyperparameter keys are forwarded to the entry point as command-line flags,
# so they must match the argparse option names in sagemaker_train.py
# (e.g. 'batch-size' -> --batch-size).
estimator = PyTorch(
    entry_point='sagemaker_train.py',
    source_dir='../src',
    role=role,
    framework_version='1.13.1',
    py_version='py39',
    instance_count=1,
    instance_type='ml.g4dn.xlarge',
    hyperparameters={
        'epochs': 100,
        'batch-size': 8,
        'patience': 5,
    },
    sagemaker_session=sm_sess,
)

# Channel names become SM_CHANNEL_<NAME> inside the training container; the
# 'test' channel is fed with the validation split.
estimator.fit({
    'train': f's3://{bucket_name}/data/train',
    'test': f's3://{bucket_name}/data/valid',
})

INFO:botocore.credentials:Found credentials in environment variables.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


Target Bucket: kulit-ai-dataset-real-fiki-2025


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2025-12-30-18-09-16-410


2025-12-30 18:09:24 Starting - Starting the training job
2025-12-30 18:09:24 Pending - Training job waiting for capacity...
2025-12-30 18:09:59 Pending - Preparing the instances for training...
2025-12-30 18:10:24 Downloading - Downloading input data...
2025-12-30 18:10:59 Downloading - Downloading the training image..............bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
  "cipher": algorithms.TripleDES,
  "class": algorithms.TripleDES,
2025-12-30 18:13:49,624 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2025-12-30 18:13:49,647 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2025-12-30 18:13:49,661 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2025-12-30 18:13:49,665 sagemaker_pytorch_container.training INFO     Invoking user training script.
2025-12-30 18:13:50,956 sagemaker-training-tool

In [8]:
import os
import tarfile
import boto3
from urllib.parse import urlparse

# 1. Get the model artifact's S3 URI from the estimator (requires that the
#    training cell has been run and the job has finished).
model_s3_uri = estimator.model_data
print(f"üì¶ Model di S3: {model_s3_uri}")

# 2. Parse the URI into bucket and object key, failing early with a clear
#    message instead of an obscure boto3 error further down.
if not isinstance(model_s3_uri, str):
    raise ValueError(
        f"Expected an s3:// URI string in estimator.model_data, got {model_s3_uri!r}"
    )
parsed = urlparse(model_s3_uri)
bucket = parsed.netloc
key = parsed.path.lstrip('/')
if parsed.scheme != "s3" or not bucket or not key:
    raise ValueError(f"Not a valid S3 object URI: {model_s3_uri!r}")

# 3. Download the archive next to the notebook
local_tar = "model.tar.gz"
s3 = boto3.client("s3")

print("‚¨áÔ∏è Mendownload model...")
s3.download_file(bucket, key, local_tar)

# 4. Extract
extract_dir = "model_hasil_training"
os.makedirs(extract_dir, exist_ok=True)

print("üìÇ Mengekstrak model...")
with tarfile.open(local_tar, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        # Refuse absolute paths, path traversal and links escaping extract_dir.
        tar.extractall(path=extract_dir, filter="data")
    else:
        # Older Python without extraction filters: falls back to the
        # unfiltered behaviour, so only use with archives you trust.
        tar.extractall(path=extract_dir)

print("‚úÖ Selesai!")
print(f"‚û°Ô∏è Gunakan model: {extract_dir}/model.pth")


üì¶ Model di S3: s3://sagemaker-us-east-1-564415061686/pytorch-training-2025-12-30-18-09-16-410/output/model.tar.gz
‚¨áÔ∏è Mendownload model...
üìÇ Mengekstrak model...
‚úÖ Selesai!
‚û°Ô∏è Gunakan model: model_hasil_training/model.pth


In [6]:
import argparse
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
import copy
from sklearn.metrics import classification_report, accuracy_score

def _cpu_state_dict(model):
    """Return a detached CPU copy of every tensor in ``model.state_dict()``."""
    return {
        name: tensor.detach().cpu().clone()
        for name, tensor in model.state_dict().items()
    }


def train():
    """Fine-tune the classification head of a pretrained ViT-B/16.

    Command-line options (unknown options are ignored):
        --epochs      maximum number of epochs (default 30)
        --lr          AdamW learning rate for the head (default 3e-4)
        --batch-size  batch size for both loaders (default 16)
        --patience    epochs without validation-loss improvement before
                      stopping early (default 5)
        --train       ImageFolder root for training data
                      (default: $SM_CHANNEL_TRAIN or ./data/train)
        --test        ImageFolder root for validation data
                      (default: $SM_CHANNEL_TEST or ./data/valid)
        --model-dir   output directory (default: $SM_MODEL_DIR or ./saved_model)

    The weights of the epoch with the lowest validation loss are written to
    ``<model-dir>/model.pth`` together with the class names, image size and
    model name.

    Raises:
        RuntimeError: if no epoch produced a usable validation loss (for
            example ``--epochs 0`` or a validation loss that is always NaN).
    """
    print("üöÄ Starting Training ViT-B/16 (SageMaker)")

    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=30)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--batch-size', type=int, default=16)
    parser.add_argument('--patience', type=int, default=5)

    parser.add_argument(
        '--train',
        type=str,
        default=os.environ.get('SM_CHANNEL_TRAIN', './data/train')
    )

    parser.add_argument(
        '--test',
        type=str,
        default=os.environ.get('SM_CHANNEL_TEST', './data/valid')
    )

    parser.add_argument(
        '--model-dir',
        type=str,
        default=os.environ.get('SM_MODEL_DIR', './saved_model')
    )

    # parse_known_args tolerates extra arguments injected by the launcher
    # (or by the Jupyter kernel when this runs inside a notebook).
    args, _ = parser.parse_known_args()

    os.makedirs(args.model_dir, exist_ok=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Mixed precision only applies on CUDA; disabling it explicitly on CPU
    # avoids the "CUDA is not available" warnings from GradScaler/autocast.
    use_amp = device.type == "cuda"
    print(f"‚öôÔ∏è Device: {device}")

    # ---------------- DATA ----------------
    IMG_SIZE = 224

    train_transform = transforms.Compose([
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(0.1, 0.1, 0.1, 0.05),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
    ])

    val_transform = transforms.Compose([
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
    ])

    train_dataset = datasets.ImageFolder(args.train, transform=train_transform)
    val_dataset = datasets.ImageFolder(args.test, transform=val_transform)
    class_names = train_dataset.classes

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size,
        shuffle=True, num_workers=4
    )

    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=args.batch_size,
        shuffle=False, num_workers=4
    )

    print(f"‚úÖ Classes: {class_names}")

    # ---------------- MODEL ----------------
    weights = models.ViT_B_16_Weights.IMAGENET1K_V1
    model = models.vit_b_16(weights=weights)

    # Freeze backbone
    for param in model.parameters():
        param.requires_grad = False

    # Replace head with one sized for this dataset's classes
    model.heads.head = nn.Linear(
        model.heads.head.in_features,
        len(class_names)
    )

    for param in model.heads.parameters():
        param.requires_grad = True

    model = model.to(device)

    # ---------------- TRAIN SETUP ----------------
    criterion = nn.CrossEntropyLoss()
    # Only the head is optimised; the backbone stays frozen.
    optimizer = optim.AdamW(
        model.heads.parameters(),
        lr=args.lr,
        weight_decay=1e-4
    )

    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    best_val_loss = float("inf")
    patience_counter = 0
    best_epoch = 0
    # Snapshots are kept on the CPU so they do not occupy GPU memory and the
    # saved checkpoint loads on CPU-only machines without map_location.
    best_weights = _cpu_state_dict(model)
    best_preds, best_labels = None, None

    # ---------------- TRAIN LOOP ----------------
    for epoch in range(args.epochs):
        model.train()
        train_loss = 0.0
        train_correct = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()

            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs = model(inputs)
                loss = criterion(outputs, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # Weight the batch-mean loss by batch size so the epoch average
            # is exact even when the last batch is smaller.
            train_loss += loss.item() * inputs.size(0)
            train_correct += (outputs.argmax(1) == labels).sum().item()

        train_loss /= len(train_dataset)
        train_acc = train_correct / len(train_dataset)

        # -------- VALIDATION --------
        model.eval()
        val_loss = 0.0
        all_preds, all_labels = [], []

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)

                with torch.cuda.amp.autocast(enabled=use_amp):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                val_loss += loss.item() * inputs.size(0)
                preds = torch.argmax(outputs, dim=1)

                all_preds.extend(preds.cpu().tolist())
                all_labels.extend(labels.cpu().tolist())

        val_loss /= len(val_dataset)
        val_acc = accuracy_score(all_labels, all_preds)

        # -------- PRINT --------
        print(
            f"Epoch [{epoch+1}/{args.epochs}] | "
            f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | "
            f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}"
        )

        # -------- EARLY STOP --------
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_epoch = epoch + 1
            best_weights = _cpu_state_dict(model)
            best_preds = all_preds
            best_labels = all_labels
            patience_counter = 0
            print("‚úÖ Best model updated")
        else:
            patience_counter += 1
            print(f"‚ö†Ô∏è No improvement ({patience_counter}/{args.patience})")

            if patience_counter >= args.patience:
                print("üõë Early stopping triggered")
                break

    # A NaN validation loss never compares below the running best, and
    # --epochs 0 never enters the loop; in both cases there is nothing
    # meaningful to report or save.
    if best_preds is None:
        raise RuntimeError(
            "No epoch produced a usable validation loss "
            f"(epochs={args.epochs}); refusing to save an untrained model."
        )

    # ---------------- REPORT ----------------
    print("\nüìä CLASSIFICATION REPORT (BEST EPOCH)")
    print(f"üèÜ Best Epoch: {best_epoch}")
    # Passing `labels` keeps the report aligned with `target_names` even when
    # some class never appears in the validation labels or predictions.
    print(classification_report(
        best_labels,
        best_preds,
        labels=list(range(len(class_names))),
        target_names=class_names
    ))

    # ---------------- SAVE (correct location for SageMaker) ----------------
    save_path = os.path.join(args.model_dir, "model.pth")
    torch.save(
        {
            "model_state_dict": best_weights,
            "class_names": class_names,
            "img_size": IMG_SIZE,
            "model_name": "vit_b_16"
        },
        save_path
    )

    print(f"üíæ Model saved to {save_path}")
    print("‚úÖ Training completed")

# NOTE: inside a Jupyter kernel __name__ is also "__main__", so executing this
# cell starts a local training run against the default data paths.
if __name__ == "__main__":
    train()


üöÄ Starting Training ViT-B/16 (SageMaker)
‚öôÔ∏è Device: cpu


FileNotFoundError: [WinError 3] The system cannot find the path specified: './data/train'