In [31]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers, models, optimizers, callbacks
from sklearn.model_selection import train_test_split
import cv2
from tqdm import tqdm
import math

In [32]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [33]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Conv2D, DepthwiseConv2D, BatchNormalization, ReLU, Input, Add, Dense, TimeDistributed, GlobalAveragePooling2D, Lambda, Softmax, Dropout, Flatten
from tensorflow.keras.models import Model

In [34]:
class TemporalShift(Layer):
    def __init__(self, num_segments=8, fold_div=8, **kwargs):
        super().__init__(**kwargs)
        self.num_segments = num_segments
        self.fold_div = fold_div

    def call(self, x):
        # [B*T, H, W, C]
        bt = tf.shape(x)[0]
        h, w, c = tf.shape(x)[1], tf.shape(x)[2], tf.shape(x)[3]
        batch = bt // self.num_segments
        # [B, T, H, W, C]
        x = tf.reshape(x, (batch, self.num_segments, h, w, c))

        fold = c // self.fold_div
        left = tf.concat([x[:, 1:, :, :, :fold],
                          tf.zeros_like(x[:, :1, :, :, :fold])], axis=1)
        right = tf.concat([tf.zeros_like(x[:, :1, :, :, fold:2*fold]),
                           x[:, :-1, :, :, fold:2*fold]], axis=1)
        rest = x[:, :, :, :, 2*fold:]
        x_shift = tf.concat([left, right, rest], axis=-1)

        # [B*T, H, W, C]
        return tf.reshape(x_shift, (bt, h, w, c))

    def compute_output_shape(self, input_shape):
        return input_shape

In [35]:
def insert_temporal_shift(mobilenet, num_segments=8, fold_div=8):

    raw_input = tf.keras.Input(shape=(num_segments, 224, 224, 3), name='input')
    flat_input = tf.keras.layers.Lambda(
        lambda z: tf.reshape(z, (-1, 224, 224, 3)), name='flatten_bt'
    )(raw_input)

    layer_outputs = {}
    layer_outputs[mobilenet.layers[0].name] = flat_input

    for layer in mobilenet.layers[1:]:
        inbound_nodes = layer._inbound_nodes
        if not inbound_nodes:
            continue

        inputs = []
        for node in inbound_nodes:
            input_tensors = node.input_tensors
            if not isinstance(input_tensors, (list, tuple)):
                input_tensors = [input_tensors]
            for pred in input_tensors:
                pred_name = pred._keras_history.layer.name
                inputs.append(layer_outputs[pred_name])

        x = inputs[0] if len(inputs) == 1 else inputs


        if isinstance(layer, tf.keras.layers.DepthwiseConv2D):
            x = TemporalShift(num_segments=num_segments, fold_div=fold_div)(x)

        out = layer(x)
        layer_outputs[layer.name] = out


    model = tf.keras.Model(inputs=raw_input, outputs=out, name="mobilenetv2_tsm")
    return model

In [36]:
annot = pd.read_csv("/home/jupyter/datasphere/project/annotations.tsv", sep="\t")

In [37]:
annot

Unnamed: 0,attachment_id,user_id,text,begin,end,height,width,train,length
0,df5b08f0-41d1-4572-889c-8b893e71069b,185bd3a81d9d618518d10abebf0d17a8,А,36,76,1920,1080,False,150
1,3d2b6a08-131d-40a9-9533-cf45cefb07fd,9a0784a99ea13a8b06cb103c2c8c7f0f,А,31,63,1920,1080,True,78
2,1915f996-71ed-44ae-9a34-ee1959919238,ca6b767f0cccf093ba737ae2fc4fec3d,А,25,81,1920,1080,True,98
3,bfb2d7ae-0f6c-4e20-b087-0965641d34ff,0ab4f8e463cdded2e59d6001f4e1b487,А,18,47,1080,1920,False,82
4,24936cc5-e5bb-43a3-96b8-2a9b4a6d157f,a95892dae1d320bd8b08cbca6a127cd8,А,6,33,1080,1920,False,40
...,...,...,...,...,...,...,...,...,...
3857,c6fd8cec-5984-4101-9616-acf180e89a81,08b8fc47e7a869751c13f64a922e6c15,Ю,44,173,1920,1080,True,236
3858,1f4cdeed-c5a1-46dc-9eaa-af40cf662d86,f15c4e5b9dfe541cf7e6c05733b579e2,Ю,10,96,1920,1080,True,97
3859,4208dbab-bea9-4d81-bbfb-7c7a028a4341,7faa41c4cf0a2602a23390de23574993,Я,80,163,1080,1920,True,197
3860,53f4f947-9f8b-4aba-ac8b-ebc2655d15ff,ff169f3bbab458308b88188be3657125,Я,19,110,1920,1080,True,200


In [38]:
annot.drop(columns=["user_id", "begin", "end", "height", "width", "length"], inplace=True)

In [39]:
label_map = {"А":1, "Б":2, "В":3, "Г":4, "Д":5, "Е":6, "Ё":7, "Ж":8, "З":9, "И":10, "Й":11, "К":12, "Л":13, "М":14, "Н":15, "О":16, "П":17, "Р":18, "С":19, "Т":20, "У":21, "Ф":22, "Х":23, "Ц":24, "Ч":25, "Ш":26, "Щ":27, "Ъ":28, "Ы":29, "Ь":30, "Э":31, "Ю":32, "Я":33, "no_event":0}

In [40]:
annot["text"] = annot["text"].apply(lambda x: label_map[x])

In [41]:
from sklearn.model_selection import train_test_split

train_df = annot[annot["train"] == True].drop("train", axis=1)

print(train_df.shape)

(3182, 2)


In [42]:
test_df = annot[annot["train"] == False].drop("train", axis=1)
test_df.shape

(680, 2)

### Создание датасета

In [43]:
NUM_FRAMES = 8
NUM_CLASSES = 34

In [44]:
ROOT_DIR = "/home/jupyter/datasphere/project/processed"

In [45]:
def sample_frame_paths(video_dir, num_frames=NUM_FRAMES):

    all_frames = sorted([
        os.path.join(video_dir, fname)
        for fname in os.listdir(video_dir)
        if fname.endswith(".jpg")
    ])
    total = len(all_frames)

    return [all_frames[i] for i in range(8)]


In [46]:
def load_and_preprocess(path):
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)

    return tf.cast(img, tf.float32) / 255.0


In [47]:
def load_video(path, label):
        
    frame_paths = sample_frame_paths(path)

    frames = []
    for p in frame_paths:
        frames.append(load_and_preprocess(p))
    # frames.shape == (8, 224, 224, 3)
    frames = tf.stack(frames)
    return frames, label


In [48]:
data_augmentation = tf.keras.Sequential([
    layers.RandomFlip("horizontal"),
    layers.RandomZoom(0.1),
    layers.RandomRotation(0.1),
    #tf.keras.layers.RandomTranslation(height_factor=0.05, width_factor=0.05),
])

def augment_sequence(images, labels):
    batch_size = tf.shape(images)[0]
    num_frames = 8
    
    # (batch, 8, 224, 224, 3) -> (batch*8, 224, 224, 3)
    images_reshaped = tf.reshape(images, [-1, 224, 224, 3])
    
    augmented = data_augmentation(images_reshaped, training=True)
    
    augmented_sequence = tf.reshape(augmented, [batch_size, num_frames, 224, 224, 3])
    
    return augmented_sequence, labels

In [49]:
def video_generator(df):
    list_video_dirs = df['attachment_id'].apply(lambda x: os.path.join(ROOT_DIR, x)).tolist()
    list_labels = tf.one_hot(df['text'].astype(np.int32).tolist(), 34)
    for path, label in zip(list_video_dirs, list_labels):
        frames, _ = load_video(path, label)
        yield frames, label


def create_dataset(df, batch_size=8, augment=False, shuffle=False):

    dataset = tf.data.Dataset.from_generator(
        lambda: video_generator(df),
        output_signature=(
            tf.TensorSpec(shape=(8, 224, 224, 3)),
            tf.TensorSpec(shape=(34,), dtype=tf.float32)
        ))
    
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(df), reshuffle_each_iteration=True)
    
    dataset = dataset.repeat().batch(batch_size)
    
    if augment:
        dataset = dataset.map(augment_sequence, num_parallel_calls=tf.data.AUTOTUNE)
    
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    
    return dataset

In [50]:
batch_size = 16

train_ds = create_dataset(train_df, batch_size, augment=True, shuffle=True)
test_ds = create_dataset(test_df, batch_size)

In [51]:
train_ds

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 8, 224, 224, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, 34), dtype=tf.float32, name=None))>

In [52]:
'''
for batch in train_ds.take(1):
    frames, labels = batch
    print("Frames shape:", frames.shape)
    print("Labels:", labels.shape)
'''

'\nfor batch in train_ds.take(1):\n    frames, labels = batch\n    print("Frames shape:", frames.shape)\n    print("Labels:", labels.shape)\n'

### Training the model

In [53]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [54]:
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy

In [55]:
num_segments = 8
num_classes = 34

base_model = tf.keras.applications.MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

tsm_model = insert_temporal_shift(base_model, num_segments=num_segments)

for layer in tsm_model.layers:
    layer.trainable = False

out = GlobalAveragePooling2D()(tsm_model.output)
out = Dropout(0.5)(out)

out = Flatten(name='flatten')(out)

logits = Dense(num_classes, use_bias=True, name='logits')(out)
logits = Dropout(0.3)(logits)

def unflatten(z):
        bt = tf.shape(z)[0]
        batch = bt // num_segments
        return tf.reshape(z, (batch, num_segments, num_classes))

logits = Lambda(unflatten, name='reshape_to_bt_c')(logits)

out = Softmax()(logits)

out = Lambda(lambda z: tf.reduce_mean(z, axis=1), name='segment_mean')(out)

final_model = tf.keras.Model(inputs=tsm_model.input, outputs=out)

# Тестовый пример
test_input = np.random.rand(8, 224, 224, 3).astype(np.float32)

In [56]:
for i, layer in enumerate(final_model.layers):
    print(f"{i:03d} | {'Trainable' if layer.trainable else 'Frozen':9} | {layer.name}")

000 | Frozen    | input
001 | Frozen    | flatten_bt
002 | Frozen    | Conv1
003 | Frozen    | bn_Conv1
004 | Frozen    | Conv1_relu
005 | Frozen    | temporal_shift_17
006 | Frozen    | expanded_conv_depthwise
007 | Frozen    | expanded_conv_depthwise_BN
008 | Frozen    | expanded_conv_depthwise_relu
009 | Frozen    | expanded_conv_project
010 | Frozen    | expanded_conv_project_BN
011 | Frozen    | block_1_expand
012 | Frozen    | block_1_expand_BN
013 | Frozen    | block_1_expand_relu
014 | Frozen    | block_1_pad
015 | Frozen    | temporal_shift_18
016 | Frozen    | block_1_depthwise
017 | Frozen    | block_1_depthwise_BN
018 | Frozen    | block_1_depthwise_relu
019 | Frozen    | block_1_project
020 | Frozen    | block_1_project_BN
021 | Frozen    | block_2_expand
022 | Frozen    | block_2_expand_BN
023 | Frozen    | block_2_expand_relu
024 | Frozen    | temporal_shift_19
025 | Frozen    | block_2_depthwise
026 | Frozen    | block_2_depthwise_BN
027 | Frozen    | block_2_depthwise_

In [57]:
initial_epochs = 5

optimizer = AdamW(learning_rate=1e-3, weight_decay=1e-5)
loss = CategoricalCrossentropy()
metrics = [tf.keras.metrics.CategoricalAccuracy(), tf.keras.metrics.TopKCategoricalAccuracy(k=3, name='top-3-accuracy')]

final_model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics
)

history_head = final_model.fit(
    train_ds,
    epochs=initial_epochs,
    steps_per_epoch = len(train_df) // batch_size,
    validation_steps = len(test_df) // batch_size,
    validation_data=test_ds
)


2025-05-10 15:21:02.670538: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_15' with dtype resource
	 [[{{node Placeholder/_15}}]]
2025-05-10 15:21:02.671160: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_15' with dtype resource
	 [[{{node Placeholder/_15}}]]


Epoch 1/5


2025-05-10 15:21:21.119915: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 420 of 3182
2025-05-10 15:21:31.134766: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 792 of 3182
2025-05-10 15:21:41.144148: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 1164 of 3182
2025-05-10 15:21:51.119395: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 1603 of 3182
2025-05-10 15:22:01.134406: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 1972 of 3182
2025-05-10 15:22:11.120961: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 2342 of 3182
2025-05-10 15:22:21.127591: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuff



2025-05-10 15:24:16.111640: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]
2025-05-10 15:24:16.112165: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [58]:
for layer in final_model.layers[91:]:
        layer.trainable = True


In [59]:
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
    min_delta=0.001,
    verbose=1
)

checkpoint = tf.keras.callbacks.ModelCheckpoint(
    "best_mobilenet.h5", 
    monitor='val_loss',
    save_freq='epoch', 
    mode='min',
    save_best_only=True
)

tfboard = tf.keras.callbacks.TensorBoard(log_dir="./logs3", update_freq=20)

callbacks = [tfboard, checkpoint, early_stop]

In [60]:
epochs = 100

lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=1e-4,
    decay_steps = 80 * (len(train_df) // batch_size),
    alpha=1e-6
)

optimizer = AdamW(learning_rate=lr_schedule, weight_decay=1e-5)
loss = CategoricalCrossentropy()
metrics = [tf.keras.metrics.CategoricalAccuracy(), tf.keras.metrics.TopKCategoricalAccuracy(k=3, name='top-3-accuracy')]

final_model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics
)

history = final_model.fit(
    train_ds,
    epochs=epochs+initial_epochs,
    initial_epoch=initial_epochs,
    steps_per_epoch = len(train_df) // batch_size,
    validation_steps = len(test_df) //batch_size,
    validation_data=test_ds,
    callbacks=callbacks
)


Epoch 6/105


2025-05-10 15:32:40.498229: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 428 of 3182
2025-05-10 15:32:50.504835: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 844 of 3182
2025-05-10 15:33:00.495472: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 1239 of 3182
2025-05-10 15:33:10.491676: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 1627 of 3182
2025-05-10 15:33:20.493608: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 2031 of 3182
2025-05-10 15:33:30.491402: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 2429 of 3182
2025-05-10 15:33:40.503031: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuff

Epoch 7/105
Epoch 8/105
Epoch 9/105
Epoch 10/105
Epoch 11/105
Epoch 12/105
Epoch 13/105
Epoch 14/105
Epoch 15/105
Epoch 16/105
Epoch 17/105
Epoch 18/105
Epoch 19/105
Epoch 20/105
Epoch 21/105
Epoch 22/105
Epoch 23/105
Epoch 24/105
Epoch 25/105
Epoch 26/105
Epoch 27/105
Epoch 28/105
Epoch 29/105
Epoch 30/105
Epoch 31/105
Epoch 32/105
Epoch 33/105
Epoch 34/105
Epoch 35/105
Epoch 36/105
Epoch 37/105
Epoch 38/105
Epoch 39/105
Epoch 40/105
Epoch 41/105
Epoch 42/105
Epoch 43/105
Epoch 43: early stopping


### Saving model

In [61]:
final_model.save('mobile_tsm_bukva')

2025-05-10 16:42:37.068139: I tensorflow/core/common_runtime/executor.cc:1197] [/job:localhost/replica:0/task:0/device:GPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): CANCELLED: RecvAsync is cancelled.
	 [[{{node sequential_1/random_rotation_1/stateful_uniform/Cast_1/_10}}]] [type.googleapis.com/tensorflow.DerivedStatus='']
2025-05-10 16:42:37.068279: I tensorflow/core/common_runtime/executor.cc:1197] [/job:localhost/replica:0/task:0/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): CANCELLED: RecvAsync is cancelled.
	 [[{{node sequential_1/random_rotation_1/stateful_uniform/Cast_1/_10}}]]
	 [[sequential_1/random_rotation_1/stateful_uniform/RngReadAndSkip/_15]] [type.googleapis.com/tensorflow.DerivedStatus='']
2025-05-10 16:42:37.068363: I tensorflow/core/common_runtime/executor.cc:1197] [/job:localhost/replica:0/task:0/device:GPU:0] (DEBUG INFO) Executor s

INFO:tensorflow:Assets written to: mobile_tsm_bukva/assets


INFO:tensorflow:Assets written to: mobile_tsm_bukva/assets


In [62]:
converter = tf.lite.TFLiteConverter.from_saved_model('mobile_tsm_bukva')
tflite_model = converter.convert()

# Сохранение модели
with open('mobilenet_tsm.tflite', 'wb') as f:
    f.write(tflite_model)

2025-05-10 16:43:41.168248: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
2025-05-10 16:43:41.168304: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
2025-05-10 16:43:41.172023: I tensorflow/cc/saved_model/reader.cc:45] Reading SavedModel from: mobile_tsm_bukva
2025-05-10 16:43:41.230160: I tensorflow/cc/saved_model/reader.cc:89] Reading meta graph with tags { serve }
2025-05-10 16:43:41.230216: I tensorflow/cc/saved_model/reader.cc:130] Reading SavedModel debug info (if present) from: mobile_tsm_bukva
2025-05-10 16:43:41.368976: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:353] MLIR V1 optimization pass is not enabled
2025-05-10 16:43:41.409861: I tensorflow/cc/saved_model/loader.cc:231] Restoring SavedModel bundle.
2025-05-10 16:43:42.360080: I tensorflow/cc/saved_model/loader.cc:215] Running initialization op on SavedModel bundle at path: mobile_tsm_bukva
2025-05