In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

dataset_path = os.listdir('dataset/train')

label_types = os.listdir('dataset/train')
print(label_types)

['가볍다', '가져오다', '가짜', '가치', '보관', '보내다', '보다', '안경', '알다', '월요일']


### 학습데이터 준비

In [2]:
rooms = []
for item in dataset_path:
    # 모든 파일 이름 가져오기
    all_rooms = os.listdir('dataset/train'+'/'+item)
    
    # 리스트에 더하기
    for room in all_rooms:
        rooms.append((item, str('dataset/train'+'/'+item)+'/'+room))

# Build a dataframe
train_df = pd.DataFrame(data=rooms, columns=['tag','video_name'])
train_df

Unnamed: 0,tag,video_name
0,가볍다,dataset/train/가볍다/1.mp4
1,가져오다,dataset/train/가져오다/2.mp4
2,가짜,dataset/train/가짜/3.mp4
3,가치,dataset/train/가치/4.mp4
4,보관,dataset/train/보관/5.mp4
5,보내다,dataset/train/보내다/6.mp4
6,보다,dataset/train/보다/7.mp4
7,안경,dataset/train/안경/8.mp4
8,알다,dataset/train/알다/9.mp4
9,월요일,dataset/train/월요일/10.mp4


In [3]:
df = train_df.loc[:,['video_name','tag']]
df.to_csv('train.csv', encoding='utf-8-sig')

### 테스트 데이터 준비

In [4]:
dataset_path = os.listdir('dataset/test')

room_types = os.listdir('dataset/test')

rooms = []
for item in dataset_path:
    # 모든 파일 이름 가져오기
    all_rooms = os.listdir('dataset/test'+'/'+item)
    
    # 리스트에 더하기
    for room in all_rooms:
        rooms.append((item, str('dataset/test'+'/'+item)+'/'+room))

# Build a dataframe
train_df = pd.DataFrame(data=rooms, columns=['tag','video_name'])

df = train_df.loc[:,['video_name','tag']]
df.to_csv('test.csv', encoding='utf-8-sig')

In [5]:
# !pip install git+https://github.com/tensorflow/docs

In [6]:
from tensorflow_docs.vis import embed
from tensorflow import keras
from imutils import paths

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import cv2
import os

In [7]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0], 
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)]
        )
    except RuntimeError as e:
        print(e)


### Data preparation

In [8]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print(f"Total video for training: {len(train_df)}")
print(f"Total video for testing: {len(test_df)}")

train_df.sample(5)

Total video for training: 10
Total video for testing: 10


Unnamed: 0.1,Unnamed: 0,video_name,tag
1,1,dataset/train/가져오다/2.mp4,가져오다
5,5,dataset/train/보내다/6.mp4,보내다
7,7,dataset/train/안경/8.mp4,안경
6,6,dataset/train/보다/7.mp4,보다
8,8,dataset/train/알다/9.mp4,알다


### Feed the video to a network:

In [9]:
IMG_SIZE = 224

def crop_center_square(frame):
    y,x = frame.shape[0:2]
    min_dim = min(y,x)
    start_x = (x//2)-(min_dim//2)
    start_y = (y//2)-(min_dim//2)
    return frame[start_y : start_y +min_dim, start_x:start_x+min_dim]

def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center_square(frame)  
            frame = cv2.resize(frame, resize)
            frame = frame[:,:,[2,1,0]]
            frames.append(frame)
            
            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)

### 특징 추출

In [10]:
def build_feature_extractor():
    feature_extractor = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),)
    preprocess_input = keras.applications.inception_v3.preprocess_input
    
    inputs = keras.Input((IMG_SIZE,IMG_SIZE,3))
    preprocessed = preprocess_input(inputs)
    
    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")

feature_extractor = build_feature_extractor()

### Label Encoding
- StringLookup layer encode the class labels as integers

In [11]:
label_processor = keras.layers.StringLookup(num_oov_indices=0, vocabulary=np.unique(train_df["tag"]))
print(label_processor.get_vocabulary())

labels = train_df["tag"].values
labels = label_processor(labels[...,None]).numpy()
labels

['가볍다', '가져오다', '가짜', '가치', '보관', '보내다', '보다', '안경', '알다', '월요일']


array([[0],
       [1],
       [2],
       [3],
       [4],
       [5],
       [6],
       [7],
       [8],
       [9]], dtype=int64)

In [12]:
IMG_SIZE = 224
BATCH_SIZE = 64
EPOCHS = 100

MAX_SEQ_LENGTH = 20
NUM_FEATURES = 2048

In [13]:
def prepare_all_video(df, root_dir):
    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()
    
    labels = df["tag"].values
    
    labels = label_processor(labels[...,None]).numpy()
    
    frame_masks = np.zeros(shape=(num_samples,MAX_SEQ_LENGTH), dtype="bool")
    frame_fetures = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES),dtype="float32")
    
    for idx, path in enumerate(video_paths):
        frames = load_video(os.path.join(root_dir,path))
        frames = frames[None, ...]
        
        temp_frame_mask = np.zeros(shape=(1,MAX_SEQ_LENGTH,), dtype="bool")
        temp_frame_fetures = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES),dtype="float32")
        
        for i, batch in enumerate(frames):
            video_length = batch.shape[0]
            length = min(MAX_SEQ_LENGTH, video_length)
            for j in range(length):
                temp_frame_fetures[i,j,:] = feature_extractor.predict(
                    batch[None,j,:]
                )
            temp_frame_mask[i,:length] = 1
            
        frame_fetures[idx,] = temp_frame_fetures.squeeze()
        frame_masks[idx,] = temp_frame_mask.squeeze()
    
    return (frame_fetures, frame_masks), labels

train_data, train_labels = prepare_all_video(train_df, "train")
test_data, test_labels = prepare_all_video(test_df,"test")

print(f"Frame feature in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")

print(f"train_labels in train set:{train_labels.shape}")
print(f"test_labels in train set:{test_labels.shape}")

Frame feature in train set: (10, 20, 2048)
Frame masks in train set: (10, 20)
train_labels in train set:(10, 1)
test_labels in train set:(10, 1)


### The sequence model

In [14]:
def get_sequence_model():
    class_vocab = label_processor.get_vocabulary()
    
    frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")
    
    x = keras.layers.GRU(16, return_sequences=True)(frame_features_input, mask=mask_input)
    x = keras.layers.GRU(8)(x)
    x = keras.layers.Dropout(0.4)(x)
    x = keras.layers.Dense(8, activation="relu")(x)
    output = keras.layers.Dense(len(class_vocab),activation="softmax")(x)
    
    rnn_model = keras.Model([frame_features_input, mask_input], output)
    
    rnn_model.compile(loss="sparse_categorical_crossentropy",optimizer="adam",metrics=["accuracy"])
    
    return rnn_model

EPOCHS = 30

def run_experiment():

    filepath = "./tmp/video_classifier.h5"
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, save_best_only=True, verbose=1)
    
    seq_model = get_sequence_model()
    history = seq_model.fit(
        [train_data[0], train_data[1]],
        train_labels,
        validation_split=0.3,
        epochs=EPOCHS,
        callbacks=[checkpoint],
    )
    
    seq_model.load_weights(filepath)
    _, accuracy = seq_model.evaluate([test_data[0], test_data[1]], test_labels)
    print(f"Test accuracy:{round(accuracy*100,2)}%")
    
    return history, seq_model

_, sequence_model = run_experiment()

Epoch 1/30
Epoch 1: val_loss improved from inf to 2.30399, saving model to ./tmp\video_classifier.h5
Epoch 2/30
Epoch 2: val_loss did not improve from 2.30399
Epoch 3/30
Epoch 3: val_loss did not improve from 2.30399
Epoch 4/30
Epoch 4: val_loss did not improve from 2.30399
Epoch 5/30
Epoch 5: val_loss did not improve from 2.30399
Epoch 6/30
Epoch 6: val_loss did not improve from 2.30399
Epoch 7/30
Epoch 7: val_loss did not improve from 2.30399
Epoch 8/30
Epoch 8: val_loss did not improve from 2.30399
Epoch 9/30
Epoch 9: val_loss did not improve from 2.30399
Epoch 10/30
Epoch 10: val_loss did not improve from 2.30399
Epoch 11/30
Epoch 11: val_loss did not improve from 2.30399
Epoch 12/30
Epoch 12: val_loss did not improve from 2.30399
Epoch 13/30
Epoch 13: val_loss did not improve from 2.30399
Epoch 14/30
Epoch 14: val_loss did not improve from 2.30399
Epoch 15/30
Epoch 15: val_loss did not improve from 2.30399
Epoch 16/30
Epoch 16: val_loss did not improve from 2.30399
Epoch 17/30
Epo

Test accuracy:10.0%


### Inference

In [17]:
def prepare_single_video(frames):
    frames = frames[None,...]
    frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")
    
    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            frame_features[i,j,:] = feature_extractor.predict(batch[None,j,:])
        frame_mask[i,:length]=1
    return frame_features, frame_mask

def sequence_prediction(path):
    class_vocab = label_processor.get_vocabulary()
    
    frames = load_video(os.path.join("test", path))
    frame_features, frame_mask = prepare_single_video(frames)
    probabilities = sequence_model.predict([frame_features, frame_mask])[0]
    
    for i in np.argsort(probabilities)[::-1]:
        print(f"{class_vocab[i]} : {probabilities[i]*100:5.2f}%")
    return frames
    
test_video = np.random.choice(test_df["video_name"].values.tolist())
print(f"Test video path : {test_video}")

test_frames = sequence_prediction(test_video)

Test video path : dataset/test/월요일/10.mp4
보다 : 10.01%
보내다 : 10.01%
보관 : 10.01%
가치 : 10.01%
가짜 : 10.01%
가져오다 : 10.01%
가볍다 : 10.01%
안경 :  9.99%
월요일 :  9.99%
알다 :  9.99%
