In [74]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Layer, Dense, GlobalAveragePooling2D, Lambda, Softmax, Dropout, Flatten, BatchNormalization

In [75]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


### Model Creation Functions

In [None]:
class TemporalShift(Layer):
    
    def __init__(self, num_segments=8, fold_div=8, **kwargs):
        super().__init__(**kwargs)
        self.num_segments = num_segments
        self.fold_div = fold_div

    def call(self, x):

        # [B*T, H, W, C]
        bt = tf.shape(x)[0]
        h, w, c = tf.shape(x)[1], tf.shape(x)[2], tf.shape(x)[3]
        batch = bt // self.num_segments
        # [B, T, H, W, C]
        x = tf.reshape(x, (batch, self.num_segments, h, w, c))

        fold = c // self.fold_div
        
        left = tf.roll(x[:, :, :, :, :fold], shift=-1, axis=1)
        right = tf.roll(x[:, :, :, :, fold:2*fold], shift=1, axis=1)
        rest = x[:, :, :, :, 2 * fold:]
        
        x_shift = tf.concat([left, right, rest], axis=-1)
        
        # [B*T, H, W, C]
        return tf.reshape(x_shift, (bt, h, w, c))

    def compute_output_shape(self, input_shape):
        return input_shape

In [78]:
inverted_residual_blocks = [2, 4, 5, 7, 8, 9, 11, 12, 14, 15]

In [79]:
def insert_temporal_shift(mobilenet, num_segments=8, fold_div=8):

    raw_input = tf.keras.Input(shape=(num_segments, 224, 224, 3), name='input')
    flat_input = tf.keras.layers.Lambda(
        lambda z: tf.reshape(z, (-1, 224, 224, 3)), name='flatten_bt'
    )(raw_input)

    layer_outputs = {}
    layer_outputs[mobilenet.layers[0].name] = flat_input

    for layer in mobilenet.layers[1:]:
        inbound_nodes = layer._inbound_nodes
        if not inbound_nodes:
            continue

        inputs = []
        for node in inbound_nodes:
            input_tensors = node.input_tensors
            if not isinstance(input_tensors, (list, tuple)):
                input_tensors = [input_tensors]
            for pred in input_tensors:
                pred_name = pred._keras_history.layer.name
                inputs.append(layer_outputs[pred_name])

        x = inputs[0] if len(inputs) == 1 else inputs

        if isinstance(layer, tf.keras.layers.DepthwiseConv2D) and layer.name.startswith("block"):
            if int(layer.name[6:-10]) in inverted_residual_blocks:
                x = TemporalShift(num_segments=num_segments, fold_div=fold_div)(x)
                
        out = layer(x)
        layer_outputs[layer.name] = out


    model = tf.keras.Model(inputs=raw_input, outputs=out, name="mobilenetv2_tsm")
    return model

### Importing Required Files

In [80]:
annot = pd.read_csv("/home/jupyter/datasphere/project/annotations.tsv", sep="\t")

In [81]:
annot

Unnamed: 0,attachment_id,user_id,text,begin,end,height,width,train,length
0,df5b08f0-41d1-4572-889c-8b893e71069b,185bd3a81d9d618518d10abebf0d17a8,А,36,76,1920,1080,False,150
1,3d2b6a08-131d-40a9-9533-cf45cefb07fd,9a0784a99ea13a8b06cb103c2c8c7f0f,А,31,63,1920,1080,True,78
2,1915f996-71ed-44ae-9a34-ee1959919238,ca6b767f0cccf093ba737ae2fc4fec3d,А,25,81,1920,1080,True,98
3,bfb2d7ae-0f6c-4e20-b087-0965641d34ff,0ab4f8e463cdded2e59d6001f4e1b487,А,18,47,1080,1920,False,82
4,24936cc5-e5bb-43a3-96b8-2a9b4a6d157f,a95892dae1d320bd8b08cbca6a127cd8,А,6,33,1080,1920,False,40
...,...,...,...,...,...,...,...,...,...
3857,c6fd8cec-5984-4101-9616-acf180e89a81,08b8fc47e7a869751c13f64a922e6c15,Ю,44,173,1920,1080,True,236
3858,1f4cdeed-c5a1-46dc-9eaa-af40cf662d86,f15c4e5b9dfe541cf7e6c05733b579e2,Ю,10,96,1920,1080,True,97
3859,4208dbab-bea9-4d81-bbfb-7c7a028a4341,7faa41c4cf0a2602a23390de23574993,Я,80,163,1080,1920,True,197
3860,53f4f947-9f8b-4aba-ac8b-ebc2655d15ff,ff169f3bbab458308b88188be3657125,Я,19,110,1920,1080,True,200


In [82]:
annot.drop(columns=["user_id", "begin", "end", "height", "width", "length"], inplace=False)

Unnamed: 0,attachment_id,text,train
0,df5b08f0-41d1-4572-889c-8b893e71069b,А,False
1,3d2b6a08-131d-40a9-9533-cf45cefb07fd,А,True
2,1915f996-71ed-44ae-9a34-ee1959919238,А,True
3,bfb2d7ae-0f6c-4e20-b087-0965641d34ff,А,False
4,24936cc5-e5bb-43a3-96b8-2a9b4a6d157f,А,False
...,...,...,...
3857,c6fd8cec-5984-4101-9616-acf180e89a81,Ю,True
3858,1f4cdeed-c5a1-46dc-9eaa-af40cf662d86,Ю,True
3859,4208dbab-bea9-4d81-bbfb-7c7a028a4341,Я,True
3860,53f4f947-9f8b-4aba-ac8b-ebc2655d15ff,Я,True


In [83]:
label_map = {"no_event":0, "А":1, "Б":2, "В":3, "Г":4, "Д":5, "Е":6, "Ё":7, "Ж":8, "З":9, "И":10, "Й":11, "К":12, "Л":13, "М":14, "Н":15, "О":16, "П":17, "Р":18, "С":19, "Т":20, "У":21, "Ф":22, "Х":23, "Ц":24, "Ч":25, "Ш":26, "Щ":27, "Ъ":28, "Ы":29, "Ь":30, "Э":31, "Ю":32, "Я":33}

In [84]:
annot["text"] = annot["text"].apply(lambda x: label_map[x])

In [85]:
train_df = annot[annot["train"] == True].drop("train", axis=1)

print(train_df.shape)

(3182, 8)


In [86]:
test_df = annot[annot["train"] == False].drop("train", axis=1)
test_df.shape

(680, 8)

### Dataset Creation

In [87]:
NUM_FRAMES = 8
NUM_CLASSES = 34

In [88]:
ROOT_DIR = "/home/jupyter/datasphere/project/processed"

In [89]:
def sample_frame_paths(video_dir, num_frames=8):

    all_frames = sorted([
        os.path.join(video_dir, fname)
        for fname in os.listdir(video_dir)
        if fname.endswith(".jpg")
    ])
    total = len(all_frames)

    boundaries = np.linspace(0, total, num_frames + 1, dtype=np.int32)
    
    selected_indices = []
    
    for i in range(num_frames):
        start = boundaries[i]
        end = boundaries[i + 1]
        if end - start > 0:
            idx = np.random.randint(start, end)
        else:
            idx = start
        selected_indices.append(idx)
    
    return [all_frames[i] for i in selected_indices]

In [None]:
def load_and_preprocess(path):
    
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)

    return tf.cast(img, tf.float32) / 255.0


In [91]:
def load_video(path, label):
        
    frame_paths = sample_frame_paths(path)

    frames = []
    for p in frame_paths:
        frames.append(load_and_preprocess(p))
    # frames.shape == (8, 224, 224, 3)
    frames = tf.stack(frames)
    return frames, label


In [None]:
data_augmentation = tf.keras.Sequential([
    layers.RandomFlip("horizontal"),
    layers.RandomZoom(-0.2),
    layers.RandomRotation(0.1),
    #tf.keras.layers.RandomTranslation(height_factor=0.05, width_factor=0.05),
])

def augment_sequence(images, labels):
    
    batch_size = tf.shape(images)[0]
    num_frames = 8
    
    # (batch, 8, 224, 224, 3) -> (batch*8, 224, 224, 3)
    images_reshaped = tf.reshape(images, [-1, 224, 224, 3])
    
    augmented = data_augmentation(images_reshaped, training=True)
    
    augmented_sequence = tf.reshape(augmented, [batch_size, num_frames, 224, 224, 3])
    
    return augmented_sequence, labels

In [93]:
def video_generator(df):
    list_video_dirs = df['attachment_id'].apply(lambda x: os.path.join(ROOT_DIR, x)).tolist()
    list_labels = tf.one_hot(df['text'].astype(np.int32).tolist(), 34)
    for path, label in zip(list_video_dirs, list_labels):
        frames, _ = load_video(path, label)
        yield frames, label


def create_dataset(df, batch_size=8, augment=False, shuffle=False):

    dataset = tf.data.Dataset.from_generator(
        lambda: video_generator(df),
        output_signature=(
            tf.TensorSpec(shape=(8, 224, 224, 3)),
            tf.TensorSpec(shape=(34,), dtype=tf.float32)
        ))
    
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(df), reshuffle_each_iteration=True)
    
    dataset = dataset.repeat().batch(batch_size)
    
    if augment:
        dataset = dataset.map(augment_sequence, num_parallel_calls=tf.data.AUTOTUNE)
    
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    
    return dataset

In [94]:
batch_size = 16

train_ds = create_dataset(train_df, batch_size, augment=True, shuffle=True)
test_ds = create_dataset(test_df, batch_size)

In [95]:
'''
for batch in train_ds.take(1):
    frames, labels = batch
    print("Frames shape:", frames.shape)
    print("Labels:", labels.shape)
'''

'\nfor batch in train_ds.take(1):\n    frames, labels = batch\n    print("Frames shape:", frames.shape)\n    print("Labels:", labels.shape)\n'

### Training the model

In [96]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [97]:
from tensorflow.keras.optimizers import SGD, AdamW, Adamax
from tensorflow.keras.losses import CategoricalCrossentropy

In [98]:
num_segments = 8
num_classes = 34

base_model = tf.keras.applications.MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

tsm_model = insert_temporal_shift(base_model, num_segments=num_segments)

for layer in tsm_model.layers:
    layer.trainable = False

out = GlobalAveragePooling2D()(tsm_model.output)
out = Dropout(0.6)(out)

out = Flatten(name='flatten')(out)

logits = Dense(num_classes, use_bias=True, name='logits')(out)

def unflatten(z):
        bt = tf.shape(z)[0]
        batch = bt // num_segments
        return tf.reshape(z, (batch, num_segments, num_classes))

logits = Lambda(unflatten, name='reshape_to_bt_c')(logits)

out = Softmax()(logits)

out = Lambda(lambda z: tf.reduce_mean(z, axis=1), name='segment_mean')(out)

final_model = tf.keras.Model(inputs=tsm_model.input, outputs=out)

# Тестовый пример
test_input = np.random.rand(8, 224, 224, 3).astype(np.float32)
test_output = final_model.predict(np.expand_dims(test_input, axis=0))
print("Output shape:", test_output.shape)
#print("Dataset labels shape:", labels.shape)

Output shape: (1, 34)


In [99]:
for i, layer in enumerate(final_model.layers):
    print(f"{i:03d} | {'Trainable' if layer.trainable else 'Frozen':9} | {layer.name}")

000 | Frozen    | input
001 | Frozen    | flatten_bt
002 | Frozen    | Conv1
003 | Frozen    | bn_Conv1
004 | Frozen    | Conv1_relu
005 | Frozen    | expanded_conv_depthwise
006 | Frozen    | expanded_conv_depthwise_BN
007 | Frozen    | expanded_conv_depthwise_relu
008 | Frozen    | expanded_conv_project
009 | Frozen    | expanded_conv_project_BN
010 | Frozen    | block_1_expand
011 | Frozen    | block_1_expand_BN
012 | Frozen    | block_1_expand_relu
013 | Frozen    | block_1_pad
014 | Frozen    | block_1_depthwise
015 | Frozen    | block_1_depthwise_BN
016 | Frozen    | block_1_depthwise_relu
017 | Frozen    | block_1_project
018 | Frozen    | block_1_project_BN
019 | Frozen    | block_2_expand
020 | Frozen    | block_2_expand_BN
021 | Frozen    | block_2_expand_relu
022 | Frozen    | temporal_shift_10
023 | Frozen    | block_2_depthwise
024 | Frozen    | block_2_depthwise_BN
025 | Frozen    | block_2_depthwise_relu
026 | Frozen    | block_2_project
027 | Frozen    | block_2_project

In [100]:
#tf.keras.utils.plot_model(final_model)

In [101]:
epochs = 10


optimizer = AdamW(learning_rate=3e-4, weight_decay=1e-5)
loss = CategoricalCrossentropy()
metrics = [tf.keras.metrics.CategoricalAccuracy(), tf.keras.metrics.TopKCategoricalAccuracy(k=3, name='top-3-accuracy')]

final_model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics
)

history = final_model.fit(
    train_ds,
    epochs=epochs,
    steps_per_epoch = len(train_df) // batch_size,
    validation_steps = len(test_df) //batch_size,
    validation_data=test_ds
)


2025-05-30 12:44:43.133344: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_13' with dtype resource
	 [[{{node Placeholder/_13}}]]
2025-05-30 12:44:43.133731: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_13' with dtype resource
	 [[{{node Placeholder/_13}}]]


Epoch 1/10


2025-05-30 12:44:57.616874: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 538 of 3182
2025-05-30 12:45:07.594178: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 951 of 3182
2025-05-30 12:45:17.611210: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 1367 of 3182
2025-05-30 12:45:27.613025: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 1780 of 3182
2025-05-30 12:45:37.605938: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 2187 of 3182
2025-05-30 12:45:47.611115: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 2590 of 3182
2025-05-30 12:45:57.603927: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuff



2025-05-30 12:49:08.550953: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]
2025-05-30 12:49:08.551219: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [102]:
for i, layer in enumerate(final_model.layers[115:]):
    if isinstance(layer, BatchNormalization):
        layer.trainable = False
    else:
        layer.trainable = True

In [103]:
for i, layer in enumerate(final_model.layers):
    print(f"{i:03d} | {'Trainable' if layer.trainable else 'Frozen':9} | {layer.name}")

000 | Frozen    | input
001 | Frozen    | flatten_bt
002 | Frozen    | Conv1
003 | Frozen    | bn_Conv1
004 | Frozen    | Conv1_relu
005 | Frozen    | expanded_conv_depthwise
006 | Frozen    | expanded_conv_depthwise_BN
007 | Frozen    | expanded_conv_depthwise_relu
008 | Frozen    | expanded_conv_project
009 | Frozen    | expanded_conv_project_BN
010 | Frozen    | block_1_expand
011 | Frozen    | block_1_expand_BN
012 | Frozen    | block_1_expand_relu
013 | Frozen    | block_1_pad
014 | Frozen    | block_1_depthwise
015 | Frozen    | block_1_depthwise_BN
016 | Frozen    | block_1_depthwise_relu
017 | Frozen    | block_1_project
018 | Frozen    | block_1_project_BN
019 | Frozen    | block_2_expand
020 | Frozen    | block_2_expand_BN
021 | Frozen    | block_2_expand_relu
022 | Frozen    | temporal_shift_10
023 | Frozen    | block_2_depthwise
024 | Frozen    | block_2_depthwise_BN
025 | Frozen    | block_2_depthwise_relu
026 | Frozen    | block_2_project
027 | Frozen    | block_2_project

In [104]:
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True,
    min_delta=0.001,
    verbose=1
)

checkpoint = tf.keras.callbacks.ModelCheckpoint(
    "best_mobilenet2.h5", 
    monitor='val_loss',
    save_freq='epoch', 
    mode='min',
    save_best_only=True
)

reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    verbose=1,
    min_lr=0.0,
)


tfboard = tf.keras.callbacks.TensorBoard(log_dir="./logs2", update_freq=20)

callbacks = [tfboard, checkpoint, early_stop]

In [105]:
epochs = 100

lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=3e-4,
    decay_steps = 30 * (len(train_df) // batch_size)
)


optimizer = AdamW(learning_rate=lr_schedule, weight_decay=3e-4)
loss = CategoricalCrossentropy()
metrics = [tf.keras.metrics.CategoricalAccuracy(), tf.keras.metrics.TopKCategoricalAccuracy(k=3, name='top-3-accuracy')]

final_model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics
)

history = final_model.fit(
    train_ds,
    epochs=epochs,
    steps_per_epoch = len(train_df) // batch_size,
    validation_steps = len(test_df) //batch_size,
    validation_data=test_ds,
    callbacks=callbacks
)


Epoch 1/100


2025-05-30 13:23:20.073934: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 556 of 3182
2025-05-30 13:23:30.083520: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 1115 of 3182
2025-05-30 13:23:40.082988: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 1672 of 3182
2025-05-30 13:23:50.084567: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 2241 of 3182
2025-05-30 13:24:00.081541: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 2805 of 3182
2025-05-30 13:24:07.028115: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:417] Shuffle buffer filled.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100

KeyboardInterrupt: 

### Saving model

In [None]:
final_model.save('mobile_tsm_bukva2')