In [1]:
import torch

In [2]:
torch.cuda.empty_cache()

In [3]:
torch.__version__

'2.0.1+cu118'

In [4]:
print("쿠다 가능 :{}".format(torch.cuda.is_available()))
print("현재 디바이스 :{}".format(torch.cuda.current_device()))
print("디바이스 갯수 :{}".format(torch.cuda.device_count()))
 
for idx in range(0, torch.cuda.device_count()):
    print("디바이스 :{}".format(torch.cuda.device(idx)))
    print("디바이스 이름 :{}".format(torch.cuda.get_device_name(idx)))

쿠다 가능 :True
현재 디바이스 :0
디바이스 갯수 :1
디바이스 :<torch.cuda.device object at 0x7f5993724310>
디바이스 이름 :NVIDIA GeForce RTX 4060


In [5]:
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda', index=0)

In [6]:
import pandas as pd
import datasets
from PIL import Image

In [7]:
from transformers import AutoImageProcessor

checkpoint = "google/vit-base-patch16-224-in21k"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

In [8]:
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor

normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
size = (
    image_processor.size["shortest_edge"]
    if "shortest_edge" in image_processor.size
    else (image_processor.size["height"], image_processor.size["width"])
)
_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])

In [9]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [10]:
import evaluate

accuracy = evaluate.load("accuracy")

In [11]:
from datasets import load_metric

metric = load_metric('f1', average='micro')

import numpy as np

def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    # print("EVAL PRED.predictions: ",eval_pred.predictions)
    # print(type(eval_pred.predictions))
    # print("EVAL PRED.label_ids: ",eval_pred.label_ids)
    # print(type(eval_pred.label_ids))
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids, average='micro')

  metric = load_metric('f1', average='micro')


In [12]:
# import numpy as np

# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     predictions = np.argmax(predictions, axis=1)
#     return accuracy.compute(predictions=predictions, references=labels)

In [13]:
def transforms(examples):
    img_paths = examples['dir']

    transformed_images = []
    for img_path in img_paths:
        img_file = Image.open(img_path)
        transformed_image = _transforms(img_file.convert("RGB"))
        transformed_images.append(transformed_image)
        
    examples["pixel_values"] = transformed_images
    del examples["dir"]
    return examples

# tag list

In [14]:
import os

def list_folders_in_directory(path):
    folder_list = []
    
    # 경로 내의 모든 항목을 가져옵니다.
    items = os.listdir(path)
    
    # 각 항목에 대해 폴더인지 확인하고 리스트에 추가합니다.
    for item in items:
        item_path = os.path.join(path, item)
        if os.path.isdir(item_path):
            folder_list.append(item)
    
    return folder_list

# 경로 지정
directory_path = '/mnt/c/Users/user/DATA/image_to_tag/train_imgs_result'

# 폴더 리스트 얻기
folders = list_folders_in_directory(directory_path)

folders


['advertising',
 'ambiental',
 'calm',
 'christmas',
 'cool',
 'dark',
 'dream',
 'energetic',
 'game',
 'groovy',
 'happy',
 'horror',
 'love',
 'movie',
 'retro',
 'sad',
 'space',
 'sport']

In [15]:
# 리스트를 DataFrame으로 변환
tag_list = pd.DataFrame(folders, columns=['label_name'])

tag_list

Unnamed: 0,label_name
0,advertising
1,ambiental
2,calm
3,christmas
4,cool
5,dark
6,dream
7,energetic
8,game
9,groovy


In [16]:
tag_list['label'] = range(0, len(tag_list) )
tag_list

Unnamed: 0,label_name,label
0,advertising,0
1,ambiental,1
2,calm,2
3,christmas,3
4,cool,4
5,dark,5
6,dream,6
7,energetic,7
8,game,8
9,groovy,9


In [17]:
# tag_list = pd.read_csv('/mnt/c/Users/user/image_to_tag/image_to_tag/data/tag_list_df.csv')
# tag_list.rename(columns = {'0' : 'label_name'}, inplace = True)
# tag_list

# train

In [18]:
train = datasets.load_from_disk("/mnt/c/Users/user/DATA/image_to_tag/train_imgs_datasets/train")
test = datasets.load_from_disk("/mnt/c/Users/user/DATA/image_to_tag/train_imgs_datasets/test")

In [19]:
from datasets import DatasetDict

data_image = DatasetDict({
    "train": train,
    "test": test
})

In [20]:
data_image

DatasetDict({
    train: Dataset({
        features: ['dir', 'label_name', 'label'],
        num_rows: 6022
    })
    test: Dataset({
        features: ['dir', 'label_name', 'label'],
        num_rows: 1506
    })
})

In [21]:
data_image = data_image.with_transform(transforms)
data_image

DatasetDict({
    train: Dataset({
        features: ['dir', 'label_name', 'label'],
        num_rows: 6022
    })
    test: Dataset({
        features: ['dir', 'label_name', 'label'],
        num_rows: 1506
    })
})

In [22]:
data_image['train']

Dataset({
    features: ['dir', 'label_name', 'label'],
    num_rows: 6022
})

In [23]:
labels = list(tag_list['label_name'].values)

In [24]:
label2id, id2label = dict(), dict()

In [25]:
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

In [26]:
id2label[str(3)]

'christmas'

In [27]:
data_image = data_image.remove_columns("label_name")
data_image

DatasetDict({
    train: Dataset({
        features: ['dir', 'label'],
        num_rows: 6022
    })
    test: Dataset({
        features: ['dir', 'label'],
        num_rows: 1506
    })
})

In [28]:
data_image['train'][1]['pixel_values']

tensor([[[-1.0000, -1.0000, -0.9922,  ..., -0.9922, -0.9922, -0.9922],
         [-0.9922, -0.9922, -1.0000,  ..., -0.9922, -0.9922, -0.9922],
         [-0.9922, -0.9922, -1.0000,  ..., -0.9922, -0.9922, -0.9843],
         ...,
         [-0.9922, -0.9922, -0.9922,  ..., -0.9373, -0.9922, -0.9922],
         [-0.9922, -0.9922, -0.9922,  ..., -1.0000, -0.9765, -0.9765],
         [-0.9922, -0.9922, -0.9922,  ..., -0.9922, -0.9922, -0.9843]],

        [[-1.0000, -1.0000, -0.9922,  ..., -0.9922, -0.9922, -0.9922],
         [-0.9922, -0.9922, -1.0000,  ..., -0.9922, -0.9922, -0.9922],
         [-0.9922, -0.9922, -1.0000,  ..., -0.9922, -0.9922, -0.9843],
         ...,
         [-0.9922, -0.9922, -0.9922,  ..., -0.9373, -0.9922, -0.9922],
         [-0.9922, -0.9922, -0.9922,  ..., -1.0000, -0.9765, -0.9765],
         [-0.9922, -0.9922, -0.9922,  ..., -0.9922, -0.9922, -0.9843]],

        [[-1.0000, -1.0000, -0.9922,  ..., -0.9922, -0.9922, -0.9922],
         [-0.9922, -0.9922, -1.0000,  ..., -0

In [29]:
import torch
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True


In [30]:
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
training_args = TrainingArguments(
    output_dir="./model/label_20_model",
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=0.001,
    gradient_checkpointing=True,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=4,
    num_train_epochs=300,
    warmup_ratio=0.1,
    # fp16=True,
    # tf32=True,
    logging_steps=10,
    # load_best_model_at_end=True,
    # metric_for_best_model="accuracy",
    # no_cuda=True,
)

In [32]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=data_image["train"],
    eval_dataset=data_image["test"],
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
)

trainer.train()

  return F.conv2d(input, weight, bias, self.stride,


Epoch,Training Loss,Validation Loss,F1
1,1.5729,1.547606,0.62085
2,0.9622,1.03762,0.708499
3,0.6933,1.03243,0.698539
4,1.1018,1.18725,0.662683
5,0.7567,1.20246,0.65073
6,1.1314,1.377203,0.639442
7,1.036,1.415142,0.618858
8,0.8146,1.380981,0.602922
9,1.3422,1.51215,0.552457
10,1.2486,1.441022,0.573705




TrainOutput(global_step=451800, training_loss=1.936249941462909, metrics={'train_runtime': 68859.2904, 'train_samples_per_second': 26.236, 'train_steps_per_second': 6.561, 'total_flos': 1.4001710442307453e+20, 'train_loss': 1.936249941462909, 'epoch': 300.0})

In [1]:
import accelerate
import transformers

transformers.__version__, accelerate.__version__

('4.33.0', '0.22.0')

In [2]:
import accelerate
import transformers

transformers.__version__, accelerate.__version__

('4.33.0', '0.22.0')