<a href="https://colab.research.google.com/github/SangminAhn21/DL_Study/blob/main/Kaggle/Facial_Keypoint_Detection/FKD_hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive (Colab-only API); the project folder
# used by the following cells lives inside this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Print the current working directory as a sanity check before changing
# into the project folder.
!pwd

/content


In [3]:
%cd drive/MyDrive/Colab Notebooks/Facial_Keypoint_Detection

/content/drive/MyDrive/Colab Notebooks/Facial_Keypoint_Detection


In [4]:
pip install kaggle



In [5]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 66 bytes


In [6]:
# Download the competition files with the Kaggle CLI into the current
# directory; relies on the API token installed at ~/.kaggle/kaggle.json.
!kaggle competitions download -c facial-keypoints-detection

IdLookupTable.csv: Skipping, found more recently modified local copy (use --force to force download)
SampleSubmission.csv: Skipping, found more recently modified local copy (use --force to force download)
training.zip: Skipping, found more recently modified local copy (use --force to force download)
test.zip: Skipping, found more recently modified local copy (use --force to force download)


In [7]:
!unzip training.zip
!unzip test.zip

Archive:  training.zip
replace training.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
Archive:  test.zip
replace test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [8]:
import models, utils

In [9]:
import numpy as np
import pandas as pd



def _parse_images(pixel_strings, side=96):
    """Turn space-separated pixel strings into an (n, side, side) integer array.

    Args:
        pixel_strings: Iterable of strings, each holding side*side integer
            pixel values separated by whitespace.
        side: Edge length of the square images (96 is what this notebook uses).

    Returns:
        A NumPy array with one (side, side) image per input string.
    """
    return np.array([np.array([int(pixel) for pixel in image.split()])
                     .reshape(side, side) for image in pixel_strings])


training_pd = pd.read_csv('training.csv')
test_pd = pd.read_csv('test.csv')

# Fill missing values with the value from the previous row.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
training_pd = training_pd.ffill()

training = training_pd.to_numpy()
test = test_pd.to_numpy()

# Training rows: the last column is the pixel string, every column before it
# is cast to float64 and used as the keypoint targets.
train_image = training[:, -1]
train_key = training[:, :-1].astype('float64')
# Test rows: the pixel string is read from column index 1.
test_image = test[:, 1]

train_image = _parse_images(train_image)
test_image = _parse_images(test_image)

In [10]:
pip install ray



In [12]:
pip install -U tensorboardx



In [17]:
from utils import FaceDataset, RMSELoss
from functools import partial
from models import CNN
import numpy as np
import torch
from torch.utils.data import DataLoader
import os
from filelock import FileLock
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler


def cnn_train(config, data, checkpoint_dir=None, data_dir=None):
    """Train a CNN for one Ray Tune trial, reporting validation loss each epoch.

    Args:
        config: Hyperparameter dict. Required keys: 'l1' and 'l2' (passed to
            ``CNN``), 'lr' (Adam learning rate) and 'batch_size'. An optional
            'num_epochs' key overrides the default of 10 epochs.
        data: Pair ``(images, keypoints)`` handed to ``FaceDataset``.
        checkpoint_dir: Directory holding a "checkpoint" file with
            ``(model_state, optimizer_state)`` to restore, or None to start
            from freshly initialised weights.
        data_dir: Unused; kept so existing callers that pass it keep working.

    Side effects:
        After every epoch, saves ``(model.state_dict(), optimizer.state_dict())``
        through ``tune.checkpoint_dir`` and reports the validation loss to Tune
        under the metric name ``loss``.
    """
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Using PyTorch version:', torch.__version__, ' Device: ', DEVICE)

    model = CNN(config['l1'], config['l2']).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])
    criterion = RMSELoss()
    print(model)

    if checkpoint_dir:
        # map_location lets a checkpoint written on a GPU be restored on a
        # CPU-only worker as well.
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"), map_location=DEVICE)
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    dataset = FaceDataset(data[0], data[1])

    # 80/20 train/validation split with a fixed seed: every trial (and every
    # resumed trial) is validated on the same samples, so the losses compared
    # by the scheduler are measured on identical data.
    n_train = int(len(dataset) * 0.8)
    train_data, val_data = torch.utils.data.random_split(
        dataset, [n_train, len(dataset) - n_train],
        generator=torch.Generator().manual_seed(0))

    train_loader = DataLoader(dataset=train_data,
                              batch_size=config['batch_size'],
                              shuffle=True,
                              num_workers=2)
    # Order does not matter for evaluation, so the validation set is not shuffled.
    val_loader = DataLoader(dataset=val_data,
                            batch_size=config['batch_size'],
                            shuffle=False,
                            num_workers=2)

    num_epochs = config.get('num_epochs', 10)
    # NOTE(review): the epoch counter restarts at 0 after restoring a
    # checkpoint, because the checkpoint stores no epoch index.
    for epoch in range(num_epochs):
        # Training mode; has no effect unless the model contains layers that
        # behave differently in train/eval (e.g. dropout, batch norm).
        model.train()
        for batch_idx, (image, key) in enumerate(train_loader):
            image = image.to(DEVICE)
            key = key.to(DEVICE)
            optimizer.zero_grad()
            output = model(image)
            loss = criterion(output, key)
            loss.backward()
            optimizer.step()

            if batch_idx % 100 == 0:
                print("Train Epoch: {} [{}/{}({:.0f}%)]\tTrain Loss: {:.6f}".format(
                    epoch, batch_idx * len(image),
                    len(train_loader.dataset), 100. * batch_idx / len(train_loader),
                    loss.item()))

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for image, key in val_loader:
                image = image.to(DEVICE)
                key = key.to(DEVICE)
                output = model(image)
                # Weight each batch loss by its real size so a smaller final
                # batch is not counted as a full one.
                val_loss += criterion(output, key).item() * len(image)

        # Sample-weighted average of the per-batch losses.
        val_loss /= len(val_loader.dataset)
        print('\n[EPOCH: {}], \tVal Loss: {:.4f}\n'.
              format(epoch, val_loss))

        # Separate name so the `checkpoint_dir` argument is not shadowed.
        with tune.checkpoint_dir(epoch) as ckpt_dir:
            path = os.path.join(ckpt_dir, "checkpoint")
            torch.save((model.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=val_loss)
    print("Finished Training")

In [18]:
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=1):
    """Run the Ray Tune hyperparameter search and reload the best model.

    Reads the notebook-level arrays ``train_image`` and ``train_key`` and
    hands them to ``cnn_train`` through ``tune.with_parameters``.

    Args:
        num_samples: Number of hyperparameter configurations to sample.
        max_num_epochs: Passed to the ASHA scheduler as ``max_t``.
        gpus_per_trial: Number of GPUs requested for each trial.

    Returns:
        A ``CNN`` built from the best trial's 'l1'/'l2' values, moved to the
        available device, with that trial's checkpointed weights loaded.
    """
    data_dir = os.path.abspath("./data")  # absolute path of the ./data directory
    config = {
        # Layer sizes are powers of two between 2**3 and 2**7.
        'l1': tune.sample_from(lambda _: 2**np.random.randint(3, 8)),
        'l2': tune.sample_from(lambda _: 2**np.random.randint(3, 8)),
        'lr': tune.loguniform(1e-3, 1e-1),
        'batch_size': tune.choice([8, 16, 32, 64])
    }
    scheduler = ASHAScheduler(
        metric='loss',
        mode='min',
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)

    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "training_iteration"])

    result = tune.run(
        tune.with_parameters(partial(cnn_train, data_dir=data_dir), data=(train_image, train_key)),
        resources_per_trial={'cpu': 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))

    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    best_trained_model = CNN(best_trial.config["l1"], best_trial.config["l2"])
    best_trained_model.to(DEVICE)

    best_checkpoint_dir = best_trial.checkpoint.value
    # map_location lets a checkpoint written on a GPU be loaded on a CPU-only
    # runtime; the optimizer state is not needed here.
    model_state, _optimizer_state = torch.load(
        os.path.join(best_checkpoint_dir, "checkpoint"), map_location=DEVICE)
    best_trained_model.load_state_dict(model_state)

    # Return the restored model so the caller can use it.
    return best_trained_model


# Entry point: launch the hyperparameter search with 5 sampled configurations.
if __name__ == "__main__":
    # You can change the number of GPUs per trial here:
    # NOTE(review): gpus_per_trial=1 requests one GPU per trial - presumably
    # this should be 0 on a CPU-only runtime; confirm before running there.
    main(num_samples=5, max_num_epochs=10, gpus_per_trial=1)

== Status ==
Current time: 2021-11-18 10:44:47 (running for 00:00:00.28)
Memory usage on this node: 4.1/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/6.14 GiB heap, 0.0/3.07 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/tune_with_parameters_2021-11-18_10-44-47
Number of trials: 5/5 (5 PENDING)
+----------------------------------+----------+-------+--------------+------+------+------------+
| Trial name                       | status   | loc   |   batch_size |   l1 |   l2 |         lr |
|----------------------------------+----------+-------+--------------+------+------+------------|
| tune_with_parameters_8c63e_00000 | PENDING  |       |           16 |   64 |   32 | 0.0517826  |
| tune_with_parameters_8c63e_00001 | PENDING  |       |            8 |  128 |   32 | 0.00157046 |
| tune_with_parameters_8c63e_00002 | PENDING  |       |   



[2m[36m(ImplicitFunc pid=8583)[0m Using PyTorch version: 1.10.0+cu111  Device:  cuda
== Status ==
Current time: 2021-11-18 10:44:52 (running for 00:00:05.36)
Memory usage on this node: 5.3/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.14 GiB heap, 0.0/3.07 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/tune_with_parameters_2021-11-18_10-44-47
Number of trials: 5/5 (4 PENDING, 1 RUNNING)
+----------------------------------+----------+-----------------+--------------+------+------+------------+
| Trial name                       | status   | loc             |   batch_size |   l1 |   l2 |         lr |
|----------------------------------+----------+-----------------+--------------+------+------+------------|
| tune_with_parameters_8c63e_00000 | RUNNING  | 172.28.0.2:8583 |           16 |   64 |   32 | 0.0517826  |
| tune_with_p

[2m[36m(pid=8583)[0m   self.x_data = (torch.from_numpy(x)/255.).type('torch.FloatTensor')


Result for tune_with_parameters_8c63e_00000:
  date: 2021-11-18_10-44-56
  done: false
  experiment_id: 651f6b983d7c426192cc6e551635445a
  hostname: f3af5fe45eb5
  iterations_since_restore: 1
  loss: 3.5893895494176986
  node_ip: 172.28.0.2
  pid: 8583
  should_checkpoint: true
  time_since_restore: 6.173115968704224
  time_this_iter_s: 6.173115968704224
  time_total_s: 6.173115968704224
  timestamp: 1637232296
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 8c63e_00000
  
[2m[36m(ImplicitFunc pid=8583)[0m 
[2m[36m(ImplicitFunc pid=8583)[0m [EPOCH: 0], 	Val Loss: 3.5894
[2m[36m(ImplicitFunc pid=8583)[0m 
== Status ==
Current time: 2021-11-18 10:44:58 (running for 00:00:11.31)
Memory usage on this node: 5.7/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: -3.5893895494176986
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.14 GiB heap, 0.0/3.07 GiB objects (0.0/1.0 accelerator_type:K80)



[2m[36m(ImplicitFunc pid=8822)[0m Using PyTorch version: 1.10.0+cu111  Device:  cuda
== Status ==
Current time: 2021-11-18 10:45:28 (running for 00:00:40.92)
Memory usage on this node: 5.4/12.7 GiB
Using AsyncHyperBand: num_stopped=1
Bracket: Iter 8.000: -3.1274511107316254 | Iter 4.000: -3.2250398730555325 | Iter 2.000: -3.2247190624264115 | Iter 1.000: -3.5893895494176986
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.14 GiB heap, 0.0/3.07 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/tune_with_parameters_2021-11-18_10-44-47
Number of trials: 5/5 (3 PENDING, 1 RUNNING, 1 TERMINATED)
+----------------------------------+------------+-----------------+--------------+------+------+------------+--------+----------------------+
| Trial name                       | status     | loc             |   batch_size |   l1 |   l2 |         lr |   loss |   training_iteration |
|----------------------------------+------------+-----------------+--------------+-----

[2m[36m(pid=8822)[0m   self.x_data = (torch.from_numpy(x)/255.).type('torch.FloatTensor')


== Status ==
Current time: 2021-11-18 10:45:33 (running for 00:00:45.96)
Memory usage on this node: 5.6/12.7 GiB
Using AsyncHyperBand: num_stopped=1
Bracket: Iter 8.000: -3.1274511107316254 | Iter 4.000: -3.2250398730555325 | Iter 2.000: -3.2247190624264115 | Iter 1.000: -3.5893895494176986
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.14 GiB heap, 0.0/3.07 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/tune_with_parameters_2021-11-18_10-44-47
Number of trials: 5/5 (3 PENDING, 1 RUNNING, 1 TERMINATED)
+----------------------------------+------------+-----------------+--------------+------+------+------------+--------+----------------------+
| Trial name                       | status     | loc             |   batch_size |   l1 |   l2 |         lr |   loss |   training_iteration |
|----------------------------------+------------+-----------------+--------------+------+------+------------+--------+----------------------|
| tune_with_parameters_8c63e_0000



[2m[1m[36m(scheduler +9m6s)[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.
[2m[36m(ImplicitFunc pid=9066)[0m Using PyTorch version: 1.10.0+cu111  Device:  cuda
== Status ==
Current time: 2021-11-18 10:46:26 (running for 00:01:39.34)
Memory usage on this node: 5.4/12.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 8.000: -2.824658516620068 | Iter 4.000: -2.934492623890545 | Iter 2.000: -3.0742991034866227 | Iter 1.000: -3.312238621880822
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.14 GiB heap, 0.0/3.07 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/tune_with_parameters_2021-11-18_10-44-47
Number of trials: 5/5 (2 PENDING, 1 RUNNING, 2 TERMINATED)
+----------------------------------+------------+-----------------+--------------+------+------+------------+---------+----------------------+
| Trial name                       | status     | loc             |   batch_size |   l

[2m[36m(pid=9066)[0m   self.x_data = (torch.from_numpy(x)/255.).type('torch.FloatTensor')


Result for tune_with_parameters_8c63e_00002:
  date: 2021-11-18_10-46-28
  done: true
  experiment_id: 6590e89c0c924260a2d3e1c8c2b65687
  hostname: f3af5fe45eb5
  iterations_since_restore: 1
  loss: 4.38415901779283
  node_ip: 172.28.0.2
  pid: 9066
  should_checkpoint: true
  time_since_restore: 4.256778240203857
  time_this_iter_s: 4.256778240203857
  time_total_s: 4.256778240203857
  timestamp: 1637232388
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 8c63e_00002
  
[2m[36m(ImplicitFunc pid=9066)[0m 
[2m[36m(ImplicitFunc pid=9066)[0m [EPOCH: 0], 	Val Loss: 4.3842
[2m[36m(ImplicitFunc pid=9066)[0m 




[2m[36m(ImplicitFunc pid=9118)[0m Using PyTorch version: 1.10.0+cu111  Device:  cuda
== Status ==
Current time: 2021-11-18 10:46:32 (running for 00:01:45.11)
Memory usage on this node: 5.1/12.7 GiB
Using AsyncHyperBand: num_stopped=3
Bracket: Iter 8.000: -2.824658516620068 | Iter 4.000: -2.934492623890545 | Iter 2.000: -3.0742991034866227 | Iter 1.000: -3.5893895494176986
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.14 GiB heap, 0.0/3.07 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/tune_with_parameters_2021-11-18_10-44-47
Number of trials: 5/5 (1 PENDING, 1 RUNNING, 3 TERMINATED)
+----------------------------------+------------+-----------------+--------------+------+------+------------+---------+----------------------+
| Trial name                       | status     | loc             |   batch_size |   l1 |   l2 |         lr |    loss |   training_iteration |
|----------------------------------+------------+-----------------+--------------+-----

[2m[36m(pid=9118)[0m   self.x_data = (torch.from_numpy(x)/255.).type('torch.FloatTensor')


Result for tune_with_parameters_8c63e_00003:
  date: 2021-11-18_10-46-36
  done: true
  experiment_id: f21a0934f73742b1af626fb9d91a2fca
  hostname: f3af5fe45eb5
  iterations_since_restore: 1
  loss: 5.121710481034948
  node_ip: 172.28.0.2
  pid: 9118
  should_checkpoint: true
  time_since_restore: 5.925897836685181
  time_this_iter_s: 5.925897836685181
  time_total_s: 5.925897836685181
  timestamp: 1637232396
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 8c63e_00003
  
[2m[36m(ImplicitFunc pid=9118)[0m 
[2m[36m(ImplicitFunc pid=9118)[0m [EPOCH: 0], 	Val Loss: 5.1217
[2m[36m(ImplicitFunc pid=9118)[0m 




== Status ==
Current time: 2021-11-18 10:46:37 (running for 00:01:50.53)
Memory usage on this node: 4.2/12.7 GiB
Using AsyncHyperBand: num_stopped=4
Bracket: Iter 8.000: -2.824658516620068 | Iter 4.000: -2.934492623890545 | Iter 2.000: -3.0742991034866227 | Iter 1.000: -3.986774283605264
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.14 GiB heap, 0.0/3.07 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/tune_with_parameters_2021-11-18_10-44-47
Number of trials: 5/5 (1 RUNNING, 4 TERMINATED)
+----------------------------------+------------+-----------------+--------------+------+------+------------+---------+----------------------+
| Trial name                       | status     | loc             |   batch_size |   l1 |   l2 |         lr |    loss |   training_iteration |
|----------------------------------+------------+-----------------+--------------+------+------+------------+---------+----------------------|
| tune_with_parameters_8c63e_00004 | RUNNING

[2m[36m(pid=9168)[0m   self.x_data = (torch.from_numpy(x)/255.).type('torch.FloatTensor')


== Status ==
Current time: 2021-11-18 10:46:43 (running for 00:01:56.02)
Memory usage on this node: 5.7/12.7 GiB
Using AsyncHyperBand: num_stopped=4
Bracket: Iter 8.000: -2.824658516620068 | Iter 4.000: -2.934492623890545 | Iter 2.000: -3.0742991034866227 | Iter 1.000: -3.986774283605264
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.14 GiB heap, 0.0/3.07 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/tune_with_parameters_2021-11-18_10-44-47
Number of trials: 5/5 (1 RUNNING, 4 TERMINATED)
+----------------------------------+------------+-----------------+--------------+------+------+------------+---------+----------------------+
| Trial name                       | status     | loc             |   batch_size |   l1 |   l2 |         lr |    loss |   training_iteration |
|----------------------------------+------------+-----------------+--------------+------+------+------------+---------+----------------------|
| tune_with_parameters_8c63e_00004 | RUNNING

2021-11-18 10:47:03,606	INFO tune.py:630 -- Total run time: 136.50 seconds (136.34 seconds for the tuning loop).


Result for tune_with_parameters_8c63e_00004:
  date: 2021-11-18_10-47-03
  done: true
  experiment_id: 27de2ccae2924eb7a4482cddefa6a75d
  hostname: f3af5fe45eb5
  iterations_since_restore: 4
  loss: 3.0232863054207875
  node_ip: 172.28.0.2
  pid: 9168
  should_checkpoint: true
  time_since_restore: 24.310259103775024
  time_this_iter_s: 5.371323347091675
  time_total_s: 24.310259103775024
  timestamp: 1637232423
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: 8c63e_00004
  
[2m[36m(ImplicitFunc pid=9168)[0m 
[2m[36m(ImplicitFunc pid=9168)[0m [EPOCH: 3], 	Val Loss: 3.0233
[2m[36m(ImplicitFunc pid=9168)[0m 
== Status ==
Current time: 2021-11-18 10:47:03 (running for 00:02:16.36)
Memory usage on this node: 5.5/12.7 GiB
Using AsyncHyperBand: num_stopped=5
Bracket: Iter 8.000: -2.824658516620068 | Iter 4.000: -3.0232863054207875 | Iter 2.000: -3.0665208978855865 | Iter 1.000: -3.5893895494176986
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/6.14 GiB heap, 0.0/3.07 G