In [3]:
import pandas as pd
from typing import Dict
import json
import urllib
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
) 
from torch.utils.data import Dataset, DataLoader

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers.optimization import AdamW, get_cosine_schedule_with_warmup

import random
import os
import numpy as np

In [6]:
def seed(seed = 1234):
    """Seed every random number generator this notebook uses.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    then configures cuDNN for repeatable results.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU and is a safe no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN pick algorithms by timing them, which can differ
    # from run to run; it has to be off for deterministic behavior.
    torch.backends.cudnn.benchmark = False
# Use the first CUDA device when one is available, otherwise fall back to CPU,
# and report which one was chosen.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print('현재 가상환경 GPU 사용 가능상태' if device == 'cuda:0' else 'GPU 사용 불가능 상태')

현재 가상환경 GPU 사용 가능상태


In [5]:
# Preprocessing / model configuration shared by the cells below.
side_size = 256  # size passed to ShortSideScale in the transform pipeline
mean = [0.45, 0.45, 0.45]  # per-channel mean passed to NormalizeVideo
std = [0.225, 0.225, 0.225]  # per-channel std passed to NormalizeVideo
crop_size = 256  # size passed to CenterCropVideo
num_frames = 32 # number of frames passed to UniformTemporalSubsample; original note: apparently should be a multiple of 2 -- TODO confirm
sampling_rate = 1  # not referenced by the visible cells
frames_per_second = 30 # fps of the input videos; not referenced by the visible cells
slowfast_alpha = 4 # divisor used by PackPathway: the slow pathway keeps frames.shape[1] // slowfast_alpha frames
num_clips = 10  # not referenced by the visible cells
num_crops = 3  # not referenced by the visible cells

In [7]:
# Load the pretrained SlowFast-R50 network from the PyTorchVideo hub.
model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True)
# Replace the final projection with a freshly initialized 5-class layer.
# Assigning to `proj.out_features` alone only changes an attribute: the weight
# matrix, and therefore the number of output logits, stays as pretrained.
model.blocks[6].proj = nn.Linear(model.blocks[6].proj.in_features, 5)
model.to(device)

Downloading: "https://github.com/facebookresearch/pytorchvideo/archive/main.zip" to C:\Users\nyan/.cache\torch\hub\main.zip
Downloading: "https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/SLOWFAST_8x8_R50.pyth" to C:\Users\nyan/.cache\torch\hub\checkpoints\SLOWFAST_8x8_R50.pyth


  0%|          | 0.00/264M [00:00<?, ?B/s]

Net(
  (blocks): ModuleList(
    (0): MultiPathWayWithFuse(
      (multipathway_blocks): ModuleList(
        (0): ResNetBasicStem(
          (conv): Conv3d(3, 64, kernel_size=(1, 7, 7), stride=(1, 2, 2), padding=(0, 3, 3), bias=False)
          (norm): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (activation): ReLU()
          (pool): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=[0, 1, 1], dilation=1, ceil_mode=False)
        )
        (1): ResNetBasicStem(
          (conv): Conv3d(3, 8, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False)
          (norm): BatchNorm3d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (activation): ReLU()
          (pool): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=[0, 1, 1], dilation=1, ceil_mode=False)
        )
      )
      (multipathway_fusion): FuseFastToSlow(
        (conv_fast_to_slow): Conv3d(8, 16, kernel_size=(7, 1, 1), st

In [8]:
# SlowFast takes two inputs: the full clip (fast pathway) and a temporally
# subsampled copy of it (slow pathway). This transform builds both from one clip.
class PackPathway(torch.nn.Module):
    """
    Transform for converting video frames as a list of tensors.

    Args:
        alpha: Subsampling factor for the slow pathway, which keeps
            ``frames.shape[1] // alpha`` frames. When ``None`` (the default)
            the module-level ``slowfast_alpha`` is read at call time.
    """
    def __init__(self, alpha=None):
        super().__init__()
        self.alpha = alpha

    def forward(self, frames: torch.Tensor):
        """Return ``[slow_pathway, fast_pathway]`` built from ``frames``.

        Frames are selected along dimension 1 (presumably the time axis of a
        (C, T, H, W) clip -- confirm against the upstream transforms).
        """
        alpha = slowfast_alpha if self.alpha is None else self.alpha
        total_frames = frames.shape[1]
        fast_pathway = frames
        # Evenly spaced frame indices from the first to the last frame. They are
        # computed on CPU (same values as before) and then moved to the frames'
        # device, because index_select requires both tensors on one device.
        slow_indices = torch.linspace(
            0, total_frames - 1, total_frames // alpha
        ).long().to(frames.device)
        slow_pathway = torch.index_select(frames, 1, slow_indices)
        return [slow_pathway, fast_pathway]

# Custom dataset (written by Seokhyun).
class CustomDataset(Dataset):
    """Dataset that decodes the first second of each video and transforms it.

    Args:
        file: Indexable collection of samples. With ``train=True`` each item is
            indexed as ``item[0]`` (video path) and ``item[1]`` (label);
            otherwise each item is the video path itself.
        device: Device every returned tensor is moved to.
        transform: Callable applied to the clip dict returned by
            ``EncodedVideo.get_clip``. ``None`` leaves the clip untouched.
        train: Whether samples carry labels.
    """
    def __init__(self, file,device,transform=None, train=True):
        super().__init__()
        self.file = file
        self.len = len(self.file)
        self.device = device
        self.transform = transform
        self.train = train
        # Not used inside this class; kept so existing attribute access keeps working.
        self.datalayer = PackPathway()

    def _load_inputs(self, path):
        """Decode seconds 0-1 of ``path``, transform it, and move it to ``self.device``."""
        video = EncodedVideo.from_path(path)
        try:
            video_data = video.get_clip(start_sec=0, end_sec=1)
        finally:
            # Release the underlying container so file handles do not pile up
            # while iterating over the dataset.
            video.close()
        if self.transform is not None:
            video_data = self.transform(video_data)
        inputs = video_data["video"]
        # A single tensor (e.g. when no pathway-packing transform ran) is moved
        # as a whole instead of being split along its first dimension.
        if isinstance(inputs, torch.Tensor):
            return inputs.to(self.device)
        # Use the device given to this dataset rather than a notebook-level global.
        # NOTE: moving tensors to a CUDA device here presumably requires the
        # DataLoader to run with num_workers=0 -- confirm before adding workers.
        return [pathway.to(self.device) for pathway in inputs]

    def __getitem__(self, idx):
        """Return ``(inputs, label)`` in train mode, otherwise just ``inputs``."""
        if self.train:
            sample = self.file[idx]
            return self._load_inputs(sample[0]), sample[1]
        return self._load_inputs(self.file[idx])

    def __len__(self):
        """Number of samples, captured when the dataset was constructed."""
        return self.len

In [None]:
# Preprocessing applied to the "video" entry of each decoded clip dict.
# The steps run in the order listed.
transform = ApplyTransformToKey(
    "video",
    Compose([
        # Temporal subsampling to num_frames frames.
        UniformTemporalSubsample(num_frames),
        # Divide pixel values by 255 (presumably uint8-range input -- confirm).
        Lambda(lambda clip: clip / 255.0),
        # Normalize with the configured mean and std.
        NormalizeVideo(mean, std),
        # Rescale using side_size, then center-crop using crop_size.
        ShortSideScale(side_size),
        CenterCropVideo(crop_size),
        # Split into the [slow, fast] pathway list the SlowFast model takes.
        PackPathway(),
    ]),
)

In [9]:
# NOTE: the cell defining `transform` must be executed before this one;
# otherwise this cell fails with a NameError.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Rows are unpacked by position: three columns for train (the first is unused,
# then path and label) and two for test (the first is unused, then path).
train_data = [(path, label) for _, path, label in train.values]
test_data = [path for _, path in test.values]

train_dataset = CustomDataset(train_data, device, transform, train=True)
# Shuffle the training samples each epoch so batches are not always drawn in
# CSV order.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

test_dataset = CustomDataset(test_data, device, transform, train=False)
# Keep the test order fixed so predictions line up with the rows of test.csv.
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

NameError: name 'transform' is not defined

In [None]:
# Build two AdamW parameter groups: one with weight decay, one without.
param_optimizer = list(model.named_parameters())  # (name, parameter) pairs of the model
# Name fragments that exempt a parameter from weight decay. The LayerNorm entries
# come from the snippet this was copied from; the printed model structure shows
# BatchNorm3d layers named `norm`, so those fragments match nothing here.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

decay_params, no_decay_params = [], []
for n, p in param_optimizer:
    # 1-D parameters are biases and normalization scales; exempting them by
    # shape covers this model's norm layers regardless of how they are named.
    if p.ndim <= 1 or any(nd in n for nd in no_decay):
        no_decay_params.append(p)
    else:
        decay_params.append(p)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.01},
    {'params': no_decay_params, 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters,
                    lr=5e-5, correct_bias=False)
criterion = nn.CrossEntropyLoss()

In [None]:
# Train for 100 epochs, printing the mean batch loss after each one.
for epoch in range(1, 101):
    model.train()
    total_loss = 0
    print("------------TRAIN------------")
    for data, label in train_loader:
        # Labels are moved to the model's device here; inputs are passed as-is.
        label = label.to(device)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        # Accumulate the scalar loss value for the epoch average.
        total_loss += loss.item()

    print("EPOCH:", epoch)
    print("train_loss:{:.6f}".format(total_loss/len(train_loader)))