# Video Summarizer 모델 학습/평가 코드

In [1]:
import math
import os

import numpy as np

## 경로 설정

In [4]:
# Dataset root. The default is the original author's local directory; set the
# YASUONET_DATASET_DIR environment variable to run the notebook on another machine
# without editing this cell.
dataset_dir = os.environ.get(
    'YASUONET_DATASET_DIR',
    r'E:\Work\YasuoNet\data\dataset14_sl5_vsr3_vw64_vh64_asr22050_mfcc',
)
# Directory handed to the Trainer for checkpoints (relative to the working directory).
ckpt_dir = 'checkpoints'

## 데이터 로더 생성

In [5]:
from data_loader import DataLoader

# Loader over the dataset directory, configured to yield both the video and audio inputs.
data_loader = DataLoader(dataset_dir, x_includes=['video', 'audio'])

# Pull the dataset configuration and the per-input shapes out of the metadata.
data_config, input_shape_dict = (
    data_loader.get_metadata()[key] for key in ('config', 'data_shape')
)

# Number of segments per label value, taken over every segment known to the loader.
segment_labels = data_loader.all_segment_df['label']
class_counts = segment_labels.value_counts(sort=False)

## 하이퍼파라미터 설정

In [10]:
# Optimizer step size.
learning_rate = 0.001

# Training schedule: number of epochs and samples per batch.
epochs, batch_size = 50, 500

# Relative weight of each class (class 1 is weighted 9x class 0); normalised in the next cell.
class_weights = (1, 9)

In [11]:
# Array form so the relative weights can be multiplied element-wise with the counts.
class_weights = np.array(class_weights)

# ndarray * Series multiplies by position, and value_counts(sort=False) does not promise
# that rows come back in label order. Order the counts by label first so that position c
# really holds the count of class c (assumes labels sort in class-id order -- TODO confirm).
class_counts_by_id = class_counts.sort_index().to_numpy()
if len(class_counts_by_id) != data_loader.CLASS_COUNT:
    raise ValueError(
        'expected counts for %d classes, found %d distinct labels'
        % (data_loader.CLASS_COUNT, len(class_counts_by_id))
    )

# Rescale the weights so their count-weighted mean is 1 while keeping their ratio.
total_count = class_counts_by_id.sum()
weighted_count = (class_weights * class_counts_by_id).sum()
class_weight_dict = {
    c: class_weights[c] * total_count / weighted_count
    for c in range(data_loader.CLASS_COUNT)
}
class_weight_dict

{0: 0.580425111220959, 1: 5.22382600098863}

## 모델 생성

In [14]:
from tensorflow.keras.layers import Dense, Dropout, Conv3D, Conv2D, Input, MaxPool3D, MaxPool2D, Flatten, Activation, concatenate
from tensorflow.keras.backend import expand_dims
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall

def build_model(input_shape_dict, weight_decay=0.005):
    """Build the two-branch (video + audio) binary classifier.

    Args:
        input_shape_dict: Mapping with 'video' and 'audio' entries giving the
            per-sample input shapes (without the batch axis) of the two branches.
        weight_decay: L2 penalty factor applied to the kernel of every Conv and
            Dense layer. Defaults to 0.005, the value previously hard-coded here.

    Returns:
        An uncompiled Keras ``Model`` that takes ``[video, audio]`` inputs and
        emits one sigmoid activation per sample.
    """
    video_input_shape = input_shape_dict['video']
    audio_input_shape = input_shape_dict['audio']

    # Video branch: one 3D convolution + max-pooling, flattened to a feature vector.
    video_input = Input(video_input_shape)
    x = Conv3D(
        8, (3, 3, 3), strides=(1, 1, 1), padding='same', activation='relu',
        kernel_initializer='he_uniform', kernel_regularizer=l2(weight_decay),
    )(video_input)
    x = MaxPool3D((2, 2, 2), strides=(2, 2, 2), padding='same')(x)
    video_output = Flatten()(x)

    # Audio branch: the input has no channel axis, so append one before the 2D convolution.
    audio_input = Input(audio_input_shape)
    x = expand_dims(audio_input)
    x = Conv2D(
        4, (3, 3), strides=(1, 1), padding='same', activation='relu',
        kernel_initializer='he_uniform', kernel_regularizer=l2(weight_decay),
    )(x)
    x = MaxPool2D((2, 2), strides=(2, 2), padding='same')(x)
    audio_output = Flatten()(x)

    # Fusion head: concatenate both feature vectors, one hidden layer, sigmoid output.
    fc_input = concatenate([video_output, audio_output])
    x = Dense(
        16, activation='relu',
        kernel_initializer='he_uniform', kernel_regularizer=l2(weight_decay),
    )(fc_input)
    fc_output = Dense(
        1, activation='sigmoid',
        kernel_initializer='he_uniform', kernel_regularizer=l2(weight_decay),
    )(x)

    model = Model(inputs=[video_input, audio_input], outputs=fc_output)

    return model

In [15]:
# Instantiate the two-input network, then attach the optimizer, loss and reported metrics.
model = build_model(input_shape_dict)
model.compile(
    optimizer=Adam(learning_rate),
    loss='binary_crossentropy',
    metrics=['accuracy', Precision(name='precision'), Recall(name='recall')],
)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 40, 216)]    0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 15, 64, 64,  0                                            
__________________________________________________________________________________________________
tf_op_layer_ExpandDims_1 (Tenso [(None, 40, 216, 1)] 0           input_4[0][0]                    
__________________________________________________________________________________________________
conv3d_1 (Conv3D)               (None, 15, 64, 64, 8 656         input_3[0][0]                    
____________________________________________________________________________________________

## 모델 학습

In [16]:
from trainer import Trainer

# Number of batches per epoch. Training uses floor division (a trailing partial batch is
# not counted); validation and test round up so every sample is covered.
train_steps = data_loader.get_train_data_count() // batch_size
valid_steps = math.ceil(data_loader.get_valid_data_count() / batch_size)
test_steps = math.ceil(data_loader.get_test_data_count() / batch_size)
# A bare expression is only echoed when it is the last statement of a cell, so print explicitly.
print('steps per epoch (train, valid, test):', (train_steps, valid_steps, test_steps))

# Start training.
# NOTE(review): the recorded run stopped with KeyError: 'val_loss' during epoch 5.
# Presumably something inside Trainer reads 'val_loss' from the epoch logs and the
# validation results were missing for that epoch -- confirm that both iterators keep
# yielding batches for all `epochs` epochs.
trainer = Trainer(model, ckpt_dir, learning_rate, epochs, class_weight_dict)
trainer.train(
    data_loader.iter_train_batch_data(batch_size), train_steps,
    data_loader.iter_valid_batch_data(batch_size), valid_steps
)

  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train for 14 steps, validate for 5 steps
Epoch 1/50
Epoch 00001: val_loss improved from inf to 0.97695, saving model to checkpoints\ckpt-20200813-110239-0001-0.9770.hdf5
Epoch 2/50
Epoch 00002: val_loss improved from 0.97695 to 0.95407, saving model to checkpoints\ckpt-20200813-110239-0002-0.9541.hdf5
Epoch 3/50
Epoch 00003: val_loss improved from 0.95407 to 0.92470, saving model to checkpoints\ckpt-20200813-110239-0003-0.9247.hdf5
Epoch 4/50
Epoch 00004: val_loss improved from 0.92470 to 0.90672, saving model to checkpoints\ckpt-20200813-110239-0004-0.9067.hdf5
Epoch 5/50

KeyError: 'val_loss'

In [7]:
# Evaluate on the test split, consuming test_steps batches from a fresh test iterator.
test_batches = data_loader.iter_test_batch_data(batch_size)
trainer.test(test_batches, test_steps)

  ...
    to  
  ['...']


In [8]:
test_batch_iter = data_loader.iter_test_batch_data(batch_size, repeat=False)

# Collect one two-column block per batch:
# column 0 = ground-truth label, column 1 = prediction thresholded at 0.5.
result = []
for x, y in test_batch_iter:
    pred = model.predict(x)
    y_hat = (pred > 0.5).astype(int)
    result.append(np.column_stack((y.squeeze(), y_hat.squeeze())))

# Stack the per-batch blocks into a single (num_samples, 2) array.
result = np.concatenate(result, axis=0)

In [9]:
# Number of rows collected, i.e. one per test sample that was predicted.
result.shape[0]

2227

In [10]:
# Column totals: [sum of ground-truth labels, sum of thresholded predictions].
np.sum(result, axis=0)

array([244,   0])