## Import Library:

In [None]:
import os # untuk berinteraksi dengan sistem operasi, seperti untuk mengakses file dan direktori
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split # untuk membagi dataset menjadi subset training dan testing
import torch # Library utama PyTorch untuk komputasi tensor dan deep learning
from torch.utils.data import Dataset, DataLoader # Kelas untuk memanipulasi dataset dan membuat batch data untuk training
from torchvision import transforms # Modul untuk melakukan transformasi pada gambar seperti augmentasi data
import torch.nn as nn # Modul yang berisi berbagai komponen neural network seperti lapisan (layers), fungsi aktivasi, dll
import torch.optim as optim # Modul untuk algoritma optimasi, seperti SGD, Adam, dll
import torchvision.models as models
import time # Library untuk mengukur waktu eksekusi kode
from torch.autograd import Variable # Kelas yang membungkus tensor untuk melacak sejarah operasi dan menghitung gradient
from PIL import Image # Library untuk membuka, memanipulasi, dan menyimpan gambar
from sklearn.metrics import classification_report, confusion_matrix # Fungsi untuk menghitung dan menampilkan laporan klasifikasi dan confusion matrix
import cv2 # Library OpenCV untuk manipulasi gambar dan video
from torchvision.io import read_image

## Data Preparation:

In [None]:
data_path = '/kaggle/input/celeba/CelebA' # Path to the CelebA dataset directory

In [None]:
os.listdir(data_path) # List every file and folder found directly under data_path

In [None]:
images_list = os.listdir(data_path +'/Images') # List every file in the Images subdirectory of data_path
images_list = [i for i in images_list if len(i) < 11] # Keep names shorter than 11 characters; this drops duplicate images named like 'XXXXXX(1).jpg'
images_list.sort() # Sort the remaining file names in ascending order

**Keterangan Kode:**

- images_list = [i for i in images_list if len(i) < 11]: Membuat daftar baru images_list yang hanya berisi nama file gambar dengan panjang kurang dari 11 karakter. Hal ini dilakukan untuk menghapus gambar duplikat yang memiliki format nama 'XXXXXX(1).jpg'.
- images_list.sort(): Mengurutkan daftar gambar images_list secara alfabetis

In [None]:
# Read the whitespace-delimited attribute table. The separator is a raw string so
# that the regex \s+ is not parsed as an (invalid) string escape sequence.
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
data = pd.read_csv(data_path+'/list_attribute.txt', sep = r'\s+', header = 0)
data = data[['file_name', 'Male']]  # only the file name and the gender label are used below
new_data = data[data['file_name'].isin(images_list)]  # keep rows whose image file is available (5000 images per the original note)
new_data = new_data.replace(-1, 0)  # recode -1 labels to 0 so the label takes the values 0/1

In [None]:
# Column dtypes and non-null counts for the attribute table before filtering to available images
data.info()

In [None]:
# Preview the first rows of the filtered table (labels already recoded by replace(-1, 0))
new_data.head()

In [None]:
# Column dtypes and non-null counts after filtering to the available images
new_data.info()

In [None]:
# Class balance of the 'Male' label over the filtered table; the trailing ';' suppresses the text repr of the Axes
sns.countplot(x = new_data['Male']);

In [None]:
# Summary statistics of the unfiltered table (labels not yet recoded)
data.describe()

In [None]:
# Summary statistics of the filtered table after recoding -1 to 0
new_data.describe()

In [None]:
# Separate the two classes and shuffle each one with a fixed seed (sample(frac = 1)
# returns all rows in random order), so the split in the next cell is reproducible.
new_data_male = new_data.loc[new_data['Male'] == 1].sample(frac = 1, random_state = 42)
new_data_female = new_data.loc[new_data['Male'] == 0].sample(frac = 1, random_state = 42)

In [None]:
# The first 1500 shuffled rows of each class form the training set; the remaining
# rows of each class form the test set. Both are put back into index order.
train_data = pd.concat([new_data_male.iloc[:1500], new_data_female.iloc[:1500]], axis = 0).sort_index()
test_data = pd.concat([new_data_male.iloc[1500:], new_data_female.iloc[1500:]], axis = 0).sort_index()
# The per-class frames are not needed once the split exists.
del new_data_male, new_data_female

In [None]:
# Class balance of the training split; the trailing ';' suppresses the text repr of the Axes
sns.countplot(x = train_data['Male']);

## Data Preprocessing:

In [None]:
class GenderDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs from a table of file names.

    Args:
        data: Table indexed positionally with ``.iloc``; column 0 holds the image
            file name and column 1 holds the integer label.
        image_folder_path: Directory that the file names are joined onto.
        transform: Optional callable applied to the RGB PIL image before it is returned.
    """

    def __init__(self, data, image_folder_path, transform=None):
        self.data = data
        self.image_folder_path = image_folder_path
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load the image of row ``idx`` and return it with its label as a ``torch.long`` tensor."""
        file_name = self.data.iloc[idx, 0]
        picture = Image.open(os.path.join(self.image_folder_path, file_name)).convert('RGB')

        # Without a (truthy) transform the PIL image is handed back unchanged.
        if self.transform:
            picture = self.transform(picture)

        label = torch.tensor(self.data.iloc[idx, 1], dtype = torch.long)
        return picture, label

In [None]:
# Define the Transformations:
transforms = {
    'train': transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.Resize(256),
        transforms.RandomRotation(45),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'test': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

In [None]:
train_set = GenderDataset(train_data, image_folder_path = os.path.join(data_path, "Images"), transform = transforms['train'])
train_loader = DataLoader(train_set, batch_size = 32, shuffle = True, num_workers = 2)

In [None]:
test_set = GenderDataset(test_data, os.path.join(data_path, "Images"), transform = transforms['test'])
test_loader = DataLoader(test_set, batch_size = 32, shuffle = False, num_workers = 2)

In [None]:
# Lookup tables keyed by phase name; train_model indexes both with 'train' and 'test'.
dataloaders = {'train': train_loader, 'test': test_loader}
dataset_sizes = {'train': len(train_set), 'test': len(test_set)}  # sample counts used to average loss and accuracy per epoch

## Architecture:

In [None]:
# ResNet
# Build a ResNet-101 (no weights argument is passed to the constructor) and replace
# its final fully connected layer with a single-output layer: one logit per image.
model = models.resnet101()
num_ftrs = model.fc.in_features  # input width of the original classification head
model.fc = nn.Linear(num_ftrs, 1)

In [None]:
# Define the Optimizer:
# Adam over every model parameter with a learning rate of 1e-4.
optimizer = optim.Adam(model.parameters(), lr = 0.0001)

In [None]:
# Define the Loss Function:
# The loss takes the raw single-logit output of the model; train_model applies
# torch.sigmoid separately only to threshold predictions.
criterion = nn.BCEWithLogitsLoss()  # More stable than BCELoss for binary classification

In [None]:
# Move the Model to GPU if Available:
use_gpu = torch.cuda.is_available()  # also passed to train_model later to decide where batches go
if use_gpu:
    model = model.to('cuda')

## Modeling:

In [None]:
def train_model(model, dataloaders, dataset_sizes, criterion, optimizer, use_gpu = torch.cuda.is_available(), num_epochs = 50, save_dir = '/kaggle/working'):
    """Train a single-logit binary classifier and return it with its best weights loaded.

    Every epoch runs a 'train' phase followed by a 'test' phase. After each test
    phase the current weights are written to ``model_last.pth``; whenever the test
    accuracy improves they are also written to ``model_best.pth`` and remembered
    in memory. When training ends the remembered best weights are loaded back.

    Args:
        model: Network producing one logit per sample.
        dataloaders: Mapping with keys 'train' and 'test'; each value is an
            iterable of ``(inputs, labels)`` batches.
        dataset_sizes: Mapping with keys 'train' and 'test' giving the number of
            samples per phase, used to average loss and accuracy.
        criterion: Loss called as ``criterion(logits, labels.float())``.
        optimizer: Optimizer bound to the parameters of ``model``.
        use_gpu: When true, batches are moved to the CUDA device.
        num_epochs: Number of epochs to run.
        save_dir: Directory for the two checkpoint files; ``None`` disables saving.

    Returns:
        The same ``model`` object, holding the weights of the epoch with the
        highest test accuracy.
    """
    since = time.time()
    device = torch.device('cuda' if use_gpu else 'cpu')

    def snapshot_weights():
        # state_dict() hands back tensors that share storage with the live
        # parameters, so they must be cloned or the "best" weights would keep
        # changing as training continues.
        return {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

    def save_checkpoint(file_name):
        # Persist model and optimizer state; skipped when saving is disabled.
        if save_dir is None:
            return
        state = {'model': model.state_dict(), 'optim': optimizer.state_dict()}
        torch.save(state, os.path.join(save_dir, file_name))

    best_model_wts = snapshot_weights()
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a Training and Validation Phase:
        for phase in ['train', 'test']:
            is_training = phase == 'train'
            model.train(is_training)  # training mode for 'train', evaluation mode otherwise

            running_loss = 0.0
            running_corrects = 0

            # Iterate Over Data:
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Zero the Parameter Gradients:
                optimizer.zero_grad()

                # Forward; gradients are tracked only while training.
                with torch.set_grad_enabled(is_training):
                    # reshape(-1) flattens the logits to one per sample and, unlike
                    # squeeze(), keeps a 1-D shape for a batch of a single sample.
                    outputs = model(inputs).reshape(-1)
                    loss = criterion(outputs, labels.float())

                    # Backward + Optimize Only if in Training Phase:
                    if is_training:
                        loss.backward()
                        optimizer.step()

                # Statistics (the loss is weighted by batch size because it is divided by the sample count below):
                running_loss += loss.item() * inputs.size(0)
                preds = (torch.sigmoid(outputs) > 0.5).long()
                running_corrects += (preds == labels).sum().item()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            if phase == 'test':
                # Keep a real copy of the best weights seen so far.
                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_model_wts = snapshot_weights()
                    save_checkpoint('model_best.pth')

                # Always record the most recent state as well.
                save_checkpoint('model_last.pth')

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best Test Acc: {:.4f}'.format(best_acc))

    # Load Best Model Weights:
    model.load_state_dict(best_model_wts)
    return model

In [None]:
# Repeats the GPU check from the Architecture section so this cell can run on its own before training.
use_gpu = torch.cuda.is_available()

if use_gpu:
  model = model.to("cuda")

In [None]:
# Train for 25 epochs (instead of the function default of 50) and rebind `model` to the returned network.
model = train_model(model, dataloaders, dataset_sizes, criterion, optimizer, use_gpu, 25)

## Evaluation:

In [None]:
# # For if loading weights onto a model, otherwise keep commented
# model.load_state_dict(torch.load('/kaggle/working/model_best.pth')['model'])
# if torch.cuda.is_available():
#   model = model.to("cuda")

In [None]:
# Collect ground-truth labels and thresholded predictions over the whole test set.
model.eval()  # put layers such as BatchNorm into inference mode
eval_device = next(model.parameters()).device  # run wherever the model currently lives (CPU or GPU)
y_true = torch.empty(0, device = eval_device)
y_pred = torch.empty(0, device = eval_device)
with torch.no_grad():  # evaluation needs no gradients
    # Unpack straight into inputs/labels so the `data` DataFrame is not overwritten by a batch.
    for inputs, labels in dataloaders['test']:
        inputs = inputs.to(eval_device)
        labels = labels.to(eval_device)
        outputs = model(inputs)
        # reshape(-1) keeps a 1-D shape even when a batch holds a single sample,
        # which torch.cat requires.
        preds = (torch.sigmoid(outputs).reshape(-1) > 0.5).float()
        y_true = torch.cat((y_true, labels.float()), -1)
        y_pred = torch.cat((y_pred, preds), -1)


In [None]:
# Move the collected tensors to host memory as NumPy arrays for scikit-learn.
y_true_np = y_true.detach().cpu().numpy()
y_pred_np = y_pred.detach().cpu().numpy()
# Per-class metrics printed with four decimal places.
print(classification_report(y_true_np, y_pred_np, digits = 4))

## Validation

In [None]:
# Rebuild the same architecture as in the Architecture section (ResNet-101 with a
# single-output head) so that saved weights can be loaded into it.
model = models.resnet101()
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 1)

In [None]:
# model.load_state_dict(torch.load('/kaggle/input/resnet18_genderclassification/pytorch/best/1/BestModel_ResNet18_NoPretrain_BCEwLog_epoch25.pth')['model'])
# map_location='cpu' lets a checkpoint that was saved from a GPU load on a CPU-only
# machine as well; the model is moved to the GPU afterwards when one is available.
checkpoint = torch.load('/kaggle/working/model_best.pth', map_location = 'cpu')
model.load_state_dict(checkpoint['model'])
if torch.cuda.is_available():
  model = model.to("cuda")
model.eval()  # a freshly constructed model starts in training mode; switch to inference mode

In [None]:
# Import here so that `transforms` is guaranteed to refer to the torchvision module in this cell.
from torchvision import transforms
# Deterministic preprocessing for validation images: resize, centre crop to 224x224,
# convert to a tensor and normalise each channel.
val_transforms = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

In [None]:
# File names of the validation images, in ascending order.
val_images_list = sorted(os.listdir('/kaggle/input/celeba/validation_images'))

In [None]:
# Display the sorted validation file names
val_images_list

In [None]:
# Predict a label for every validation image, one image per forward pass, and
# report the mean wall-clock time per image.
val_dir = '/kaggle/input/celeba/validation_images'
val_device = next(model.parameters()).device  # run wherever the model currently lives (CPU or GPU)
model.eval()  # inference mode: BatchNorm must not use the statistics of a one-image batch
inf_time_start = time.time()
val_pred = torch.empty(0, device = val_device)
with torch.no_grad():  # prediction needs no gradients
    for file_name in val_images_list:
        image = Image.open(os.path.join(val_dir, file_name)).convert('RGB')
        # unsqueeze(0) adds the batch dimension the model expects.
        inputs = val_transforms(image).unsqueeze(0).to(val_device)
        outputs = model(inputs)
        preds = (torch.sigmoid(outputs).reshape(-1) > 0.5).float()
        val_pred = torch.cat((val_pred, preds), -1)
# max(..., 1) avoids a division by zero when the folder is empty.
print('prediction time for each image is {} s'.format((time.time()-inf_time_start)/max(len(val_images_list), 1)))

In [None]:
# Display the predicted labels, one entry per validation image in val_images_list order
val_pred