In [None]:
import numpy as np
import tensorflow as tf
import pandas as pd
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import activations

from tokenizer.tokenizer import create_numerate_array

import matplotlib.pyplot as plt

In [None]:
# Load the trailers table and strip any column whose header contains
# "Unnamed" (case-insensitive) before previewing the first rows.
trailers_csv = "/Volumes/Seagate/natasha-diploma/trailers.csv"
trailers_df = pd.read_csv(trailers_csv, index_col=None, header=0)
unnamed_trailer_columns = trailers_df.columns[
    trailers_df.columns.str.contains('Unnamed', case=False)
]
trailers_df = trailers_df.drop(columns=unnamed_trailer_columns)
display(trailers_df.head())

In [None]:
# Load the sessions table and strip any column whose header contains
# "Unnamed" (case-insensitive), then show the resulting frame.
sessions_csv = "/Volumes/Seagate/natasha-diploma/sessions.csv"
sessions_df = pd.read_csv(sessions_csv, index_col=None, header=0)
unnamed_session_columns = sessions_df.columns[
    sessions_df.columns.str.contains('Unnamed', case=False)
]
sessions_df = sessions_df.drop(columns=unnamed_session_columns)
display(sessions_df)

In [None]:
# Build the flat stream of movie ids: the sessions in their original row
# order followed by the same sessions with the rows shuffled.
session_matrix = sessions_df.to_numpy()

# np.random.shuffle permutes the rows of its argument IN PLACE, so it must be
# given a real copy. Shuffling an alias would reorder the original as well and
# the concatenation below would just contain the same data twice.
# NOTE(review): no random seed is set, so the shuffled half differs per run.
copy = session_matrix.copy()
np.random.shuffle(copy)
movies_id = np.concatenate((session_matrix.flatten(), copy.flatten()))

# Sorted distinct ids; later cells index into this to map tokens back to ids.
vocab = np.unique(movies_id)
vocab_len = len(vocab)

print(vocab_len)
print(movies_id.shape)

In [None]:
def getVideoInfo(info_df, mean=True):
    """Summarise one video's metrics table as a feature list.

    Args:
        info_df: DataFrame with the columns 'hue', 'brightness',
            'colorfulness', 'energy', 'tempo', 'amplitude' and 'mfcc'.
        mean: When True (default) each of the six scaled metric rows is
            collapsed to its mean, rounded to 3 decimals. When False the six
            scaled rows are returned in full.

    Returns:
        A list of seven entries: the six metric summaries (scalars when
        ``mean`` is True, 1-D arrays otherwise) followed by the most frequent
        hue token.
    """
    tokenized_hue = create_numerate_array(info_df['hue'].to_numpy())
    # Most frequent hue token (np.bincount assumes non-negative integer
    # tokens — TODO confirm against create_numerate_array).
    counts = np.argmax(np.bincount(tokenized_hue))

    metric_columns = ['brightness', 'colorfulness', 'energy', 'tempo', 'amplitude', 'mfcc']
    # Shape: (6 metrics, number of rows in info_df). Build the array
    # explicitly so the in-place nan_to_num below operates on a real ndarray.
    scenes_array = np.array([info_df[column].to_numpy() for column in metric_columns])
    scenes_array = np.nan_to_num(scenes_array, copy=False, nan=0.0, posinf=0.0, neginf=0.0)

    # NOTE(review): MinMaxScaler rescales each COLUMN independently, and the
    # columns here are the rows of info_df, not the metrics. Each row of
    # info_df therefore has its six metrics scaled against one another.
    # Confirm this is intended rather than per-metric scaling.
    scaler = MinMaxScaler((-1, 1))
    scenes_array = scaler.fit_transform(scenes_array)

    if mean:
        scenes_array = [np.around(np.mean(metric_row), 3) for metric_row in scenes_array]
    else:
        # fit_transform returns an ndarray, which has no .append(); convert to
        # a list of rows so the hue token can be appended in both modes.
        scenes_array = list(scenes_array)

    scenes_array.append(counts)

    return scenes_array

In [None]:
# Build the per-position feature list for the id stream.
#   info           : movie id -> feature list returned by getVideoInfo
#   scenes_array   : feature list for every id occurrence that could be loaded
#   del_array_name : every id occurrence whose CSV could not be loaded/processed
info = {}
scenes_array = []
del_array_name = []
info_folder = "/Volumes/Seagate/natasha-diploma/videoinfo"
# Ids that already failed once; avoids re-reading a broken/missing file.
failed_ids = set()
for movie_id in movies_id:
    if movie_id in failed_ids:
        del_array_name.append(movie_id)
        continue
    # Ids repeat many times in the stream; read and process each CSV once.
    if movie_id not in info:
        try:
            info_csv = info_folder + '/' + movie_id + '.csv'
            info_df = pd.read_csv(info_csv, index_col=None, header=0)
            # The first four columns are skipped; only the rest is summarised.
            info[movie_id] = getVideoInfo(info_df.iloc[:, 4:])
        except Exception:
            # Best-effort skip, as before, but without swallowing
            # KeyboardInterrupt/SystemExit the way a bare `except:` does.
            failed_ids.add(movie_id)
            del_array_name.append(movie_id)
            continue
    scenes_array.append(info[movie_id])

In [None]:
# Tokenise the full id stream first so token numbering is unaffected, then
# drop the positions whose features could not be loaded (del_array_name).
# scenes_array omits those positions, and the two sequences are zipped
# element-wise later, so they must stay in step.
# Assumes create_numerate_array returns one token per input element, in
# order — TODO confirm.
skipped_ids = set(del_array_name)
kept_positions = np.array([movie_id not in skipped_ids for movie_id in movies_id], dtype=bool)
tokenized = np.asarray(create_numerate_array(movies_id))[kept_positions]

In [None]:
# Sanity check: features and tokens are zipped element-wise further down,
# so the leading dimensions printed here should be equal.
print(np.array(scenes_array).shape)
print(np.array(tokenized).shape)

In [None]:
# Model hyperparameters.
# Passed to MyModel, which accepts the argument but never reads it.
id_size = 1
# Output dimension of the id embedding. The same constant is reused later as
# the loop bound when printing per-movie feature vectors.
id_embed_size = 7
# Size of the embedding table and of the output layer.
# NOTE(review): hard-coded; presumably tied to vocab_len computed above —
# confirm the relation (and any +1 offset) instead of relying on a literal.
vocab_size = 366
# Number of GRU units.
rnn_units = 366

In [None]:
class MyModel(tf.keras.Model):
    """GRU sequence model over movie ids plus per-movie feature vectors.

    The input is a pair ``(ids, features)``. Ids are embedded, features are
    batch-normalised, both are concatenated on the last axis and fed to a GRU
    whose per-step outputs go through a Dense layer of ``vocab_size`` units.
    """

    def __init__(self, vocab_size, id_size, id_embed_size, rnn_units):
        """Create the layers.

        Args:
            vocab_size: Embedding table size and number of output units.
            id_size: Accepted for interface compatibility; not used.
            id_embed_size: Embedding dimension for ids.
            rnn_units: Number of GRU units.
        """
        # Do not pass `self` explicitly: super() already binds it, and the
        # extra positional argument is rejected by Keras versions whose
        # Layer.__init__ takes keyword-only arguments.
        super().__init__()
        self.id_embedding = tf.keras.layers.Embedding(vocab_size, id_embed_size)
        self.normalization = tf.keras.layers.BatchNormalization()
        self.concat = tf.keras.layers.Concatenate()
        self.gru = tf.keras.layers.GRU(rnn_units,
                                       return_sequences=True,
                                       return_state=True)
        # NOTE(review): the notebook trains with from_logits=True and samples
        # with tf.random.categorical, both of which expect unbounded logits;
        # ReLU clips them at zero. Confirm whether the activation is intended.
        self.dense = tf.keras.layers.Dense(vocab_size, activation=activations.relu)

    def call(self, _input, states=None, return_state=False, training=False):
        """Run the model on ``_input = (ids, features)``.

        Args:
            _input: Indexable pair; ``_input[0]`` holds ids, ``_input[1]``
                holds the feature vectors.
            states: Optional initial GRU state. None lets the GRU start from
                its default (zero) state.
            return_state: When True, also return the final GRU state.
            training: Forwarded to the GRU and Dense layers.

        Returns:
            Per-step Dense outputs, or ``(outputs, states)`` when
            ``return_state`` is True.
        """
        ids = self.id_embedding(_input[0])
        features = self.normalization(_input[1])
        x = self.concat([ids, features])
        # initial_state=None makes the GRU build its own zero state, so an
        # explicit get_initial_state() call is unnecessary.
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        return x

    def predict(self, x, history, batch_size=None, verbose=0, steps=None, callbacks=None, max_queue_size=10,
    workers=1, use_multiprocessing=False,):
        """Predict the next id for every sample, excluding already-seen ids.

        Args:
            x: Model input, as accepted by ``tf.keras.Model.predict``.
            history: Array-like of ids that must not be predicted again. All
                values are pooled, regardless of which sample they belong to.
            batch_size, verbose, steps, callbacks: Forwarded to
                ``tf.keras.Model.predict``.
            max_queue_size, workers, use_multiprocessing: Forwarded only when
                changed from their defaults, so the call also works on Keras
                versions that no longer accept them.

        Returns:
            A list with one predicted value per sample: the arg-max position
            of the masked last-step output, plus one.
        """
        predict_kwargs = dict(batch_size=batch_size, verbose=verbose, steps=steps, callbacks=callbacks)
        if max_queue_size != 10:
            predict_kwargs['max_queue_size'] = max_queue_size
        if workers != 1:
            predict_kwargs['workers'] = workers
        if use_multiprocessing:
            predict_kwargs['use_multiprocessing'] = use_multiprocessing
        predictions = super().predict(x, **predict_kwargs)

        seen = set(np.asarray(history).ravel().tolist())
        predicted_values = []
        for sample_outputs in predictions:
            # The next-item prediction for a sample is its LAST time step.
            # The previous code read predictions[-1][0] (last sample, first
            # step) on every iteration.
            output = sample_outputs[-1]
            # Size the mask from the output itself rather than from a global.
            # NOTE(review): output position p is treated as id p + 1 both
            # here and in the returned value; confirm this offset matches how
            # the training targets are numbered.
            mask = [1 if position + 1 in seen else 0 for position in range(len(output))]
            masked_prediction = np.ma.array(output, mask=mask)
            predicted_values.append(np.argmax(masked_prediction) + 1)

        return predicted_values

In [None]:
# Windowing parameters: each example is seq_length inputs plus one extra
# element so inputs and shifted targets can be cut from the same window.
seq_length = 10
examples_per_epoch = len(tokenized) // (seq_length + 1)

# Positional train/test split of the aligned token and feature sequences.
train_tokenized, test_tokenized = tokenized[:3000], tokenized[3000:4200]
train_scenes, test_scenes = scenes_array[:3000], scenes_array[3000:4200]

# Training datasets: ids, features, and the two zipped element-wise.
ids_ds = tf.data.Dataset.from_tensor_slices(train_tokenized)
features_ds = tf.data.Dataset.from_tensor_slices(train_scenes)
ds = tf.data.Dataset.zip((ids_ds, features_ds))

# Test datasets, built the same way.
ids_ds_test = tf.data.Dataset.from_tensor_slices(test_tokenized)
features_ds_test = tf.data.Dataset.from_tensor_slices(test_scenes)
ds_test = tf.data.Dataset.zip((ids_ds_test, features_ds_test))

# Cut each stream into consecutive windows of seq_length + 1 elements,
# discarding a trailing incomplete window.
sequences = ds.batch(seq_length + 1, drop_remainder=True)
ids_seq = ids_ds.batch(seq_length + 1, drop_remainder=True)
features_seq = features_ds.batch(seq_length + 1, drop_remainder=True)
sequences_test = ds_test.batch(seq_length + 1, drop_remainder=True)

ds

In [None]:
def full_split_input_target(id_sequence, feature_sequence):
    """Split one (ids, features) window into model input and target.

    The input is both sequences without their last element; the target is
    the id sequence without its first element (i.e. shifted by one step).
    """
    trimmed_ids, trimmed_features = id_sequence[:-1], feature_sequence[:-1]
    shifted_ids = id_sequence[1:]
    return (trimmed_ids, trimmed_features), shifted_ids

def split_input_target(seq):
    """Return ``(seq without its last element, seq without its first)``."""
    return seq[:-1], seq[1:]

# Turn every window into an (input, target) pair.
ds = sequences.map(full_split_input_target)
ids_ds = ids_seq.map(split_input_target)
features_ds = features_seq.map(split_input_target)
ds_test = sequences_test.map(full_split_input_target)

BATCH_SIZE = 32
TEST_BATCH_SIZE = 1
BUFFER_SIZE = 10000


def _shuffle_batch_prefetch(dataset, batch_size):
    """Shuffle with BUFFER_SIZE, batch (dropping the remainder) and prefetch."""
    shuffled = dataset.shuffle(BUFFER_SIZE)
    batched = shuffled.batch(batch_size, drop_remainder=True)
    return batched.prefetch(tf.data.experimental.AUTOTUNE)


# The same pipeline is applied to every dataset; only the batch size differs
# for the test set.
ds = _shuffle_batch_prefetch(ds, BATCH_SIZE)
ids_ds = _shuffle_batch_prefetch(ids_ds, BATCH_SIZE)
features_ds = _shuffle_batch_prefetch(features_ds, BATCH_SIZE)
ds_test = _shuffle_batch_prefetch(ds_test, TEST_BATCH_SIZE)

ds

In [None]:
# Display the batched feature-only dataset to inspect its element structure.
features_ds

In [None]:
# Instantiate the model from the hyperparameters defined above.
model = MyModel(
    vocab_size=vocab_size,
    id_size=id_size,
    id_embed_size=id_embed_size,
    rnn_units=rnn_units)


In [None]:
# Smoke test on a single training batch: run the untrained model, print its
# raw output, and sample one id per time step for the first sequence.
for _input, _output in ds.take(1):
    example_batch_predictions = model(_input)
    print(example_batch_predictions, "# (batch_size, sequence_length, vocab_size)")

    first_sequence_samples = tf.random.categorical(example_batch_predictions[0], num_samples=1)
    sampled_indices = tf.squeeze(first_sequence_samples, axis=-1).numpy()

    print(sampled_indices)

In [None]:
# Make tf.function-wrapped code run eagerly from here on.
# NOTE(review): this is a global debugging switch and typically slows
# training considerably — confirm it is still needed.
tf.config.run_functions_eagerly(True)
# The model was already called on one batch in the previous cell.
model.summary()

In [None]:
# Integer-target cross-entropy; from_logits=True means the model output is
# treated as unnormalised scores.
# NOTE(review): MyModel's final Dense layer applies ReLU, so these "logits"
# can never be negative — confirm that is intended.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
# Adam optimiser, the loss defined above, and accuracy as the tracked metric.
model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

In [None]:
# Train for EPOCHS passes over ds. ds_test serves as validation data here and
# is also the set passed to model.evaluate further down.
EPOCHS = 100
history = model.fit(ds, validation_data=ds_test, epochs=EPOCHS)

In [None]:
# Display the batched test dataset to inspect its element structure.
ds_test

In [None]:
# ds_test is already batched with TEST_BATCH_SIZE. Keras documents that
# batch_size must not be specified when the input is a dataset, so it is
# not passed here.
model.evaluate(ds_test, return_dict=True)

In [None]:
# Predict for one test batch, passing the input id window as the "history"
# of ids to exclude, then print the prediction next to the last target id of
# the first sequence in the batch.
for _input, _output in ds_test.take(1):
    values = model.predict(_input, _input[0].numpy())
    print(values, _output.numpy()[0][-1])

In [None]:
# Map the true and the predicted value back to movie ids via vocab.
# NOTE(review): the "- 1" assumes the values are 1-based positions into the
# sorted vocab — confirm against how create_numerate_array numbers tokens.
print(vocab[_output.numpy()[0][-1] - 1])
print(vocab[values[0] - 1])

In [None]:
def getVideoInfoByName(name):
    """Load ``<info_folder>/<name>.csv`` and return its getVideoInfo summary.

    The first four columns of the CSV are skipped before summarising.
    """
    csv_path = '/'.join((info_folder, name + '.csv'))
    table = pd.read_csv(csv_path, index_col=None, header=0)
    metric_columns = table.iloc[:, 4:]
    return getVideoInfo(metric_columns)

In [None]:
# Feature summaries of the true next movie and of the predicted one.
# NOTE(review): same "- 1" offset assumption as in the id lookup above it —
# confirm the values are 1-based positions into vocab.
true_info = getVideoInfoByName(vocab[_output.numpy()[0][-1] - 1])
output_info = getVideoInfoByName(vocab[values[0] - 1])

In [None]:
# Replace NaN and +/-inf entries with 0.0 in both feature summaries.
true_info = np.nan_to_num(true_info, copy=False, nan=0.0, posinf=0.0, neginf=0.0)
output_info = np.nan_to_num(output_info, copy=False, nan=0.0, posinf=0.0, neginf=0.0)

In [None]:
# Print the true and predicted movie's features side by side, entry by entry.
# NOTE(review): the loop bound is the embedding size constant (7); it matches
# the seven entries getVideoInfo returns by default, but the two are
# unrelated quantities — confirm and consider len(true_info) instead.
for i in range(id_embed_size):
    print(true_info[i])
    print(output_info[i])

In [None]:
# Print the feature summary of each of the 10 movies in the first input
# window (10 matches seq_length), repeating the predicted movie's summary
# after each one for comparison.
# input_info is initialised here but never filled or read in this notebook.
input_info = []
for i in range(10):
    print(i, getVideoInfoByName(vocab[_input[0].numpy()[0][i] - 1]))
    print("OUTPUT:", output_info)

In [None]:
# Plot training vs. validation curves, one figure per metric: accuracy first,
# then loss. Titles, axis labels and legend entries are kept as they were.
curve_legend = ['тренировочные данные', 'тестовые данные']
curve_specs = (
    ('accuracy', 'val_accuracy', 'Точность модели', 'точность'),
    ('loss', 'val_loss', 'Потери модели', 'потери'),
)
for train_key, val_key, figure_title, y_label in curve_specs:
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(figure_title)
    plt.ylabel(y_label)
    plt.xlabel('эпоха')
    plt.legend(curve_legend, loc='upper left')
    plt.show()