In [1]:
!pip install pandas
!pip install numpy
!pip install tensorflow



In [67]:
import pandas as pd
import os
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.layers import Embedding, Input, Conv1D, MaxPooling1D, Flatten, Dense, LSTM, Bidirectional

In [3]:
# Read all the data
labels = []
raw_inputs = []
df = pd.read_csv('./raw_data.csv')
max_frames = 0
max_words = 0
max_sequence_length = 0

for index, row in df.iterrows():
    path = os.path.join('./processed_video', 'audio_' + row['video_id'] + '_txt')
    
    if os.path.isfile(path):
        with open(path, 'r') as f:
            try:
                raw_inputs_video = []
                sequence_length = 0
                file_data = json.load(f)
                num_frames = len(file_data)
                
                if num_frames > max_frames:
                    max_frames = num_frames

                for i in range(num_frames):
                    line_data = json.loads(file_data[i])

                    if 'DisplayText' in line_data.keys():
                        sentence = line_data['DisplayText']
                        num_words = len(sentence.split())
                        sequence_length = sequence_length + num_words

                        if num_words > max_words:
                            max_words = num_words
                        raw_inputs_video.append(sentence)

                    del line_data
                
                raw_inputs.append(raw_inputs_video)
                labels.append((row['video_likeCount'] - row['video_dislikeCount'])/row['video_viewCount'])
                
                if sequence_length > max_sequence_length:
                    max_sequence_length = sequence_length

                del raw_inputs_video, file_data, sequence_length
            except ValueError:
                print(path + ' failed processing')

raw_inputs = np.array(raw_inputs)
labels = np.array(labels)
print()
print('Shape of raw_inputs: ', raw_inputs.shape)
print('Shape of labels: ', labels.shape)
print('Max Words: ', max_words)
print('Max Frames: ', max_frames)
print('Max Sequence Lenghth: ', max_sequence_length)
np.save('./text.npy', raw_inputs)

./processed_video/audio_xCgk9nvuCxk_txt failed processing
./processed_video/audio_Q_ouhkdo-ko_txt failed processing

Shape of raw_inputs:  (3389,)
Shape of labels:  (3389,)
Max Words:  172
Max Frames:  186
Max Sequence Lenghth:  3453


In [4]:
label_series = pd.Series(labels)
print(label_series.value_counts(bins=20).sort_index())

(-0.021500000000000002, -0.0122]       1
(-0.0122, -0.00398]                    7
(-0.00398, 0.00421]                  225
(0.00421, 0.0124]                   1094
(0.0124, 0.0206]                    1021
(0.0206, 0.0288]                     613
(0.0288, 0.037]                      264
(0.037, 0.0452]                      103
(0.0452, 0.0533]                      31
(0.0533, 0.0615]                      17
(0.0615, 0.0697]                       5
(0.0697, 0.0779]                       2
(0.0779, 0.0861]                       1
(0.0861, 0.0943]                       1
(0.0943, 0.102]                        0
(0.102, 0.111]                         1
(0.111, 0.119]                         1
(0.119, 0.127]                         0
(0.127, 0.135]                         1
(0.135, 0.143]                         1
dtype: int64


In [5]:
# Prepare data for cnn lstm by concatenating frames, normalizing labels and defining constants
text_inputs = [''.join(row) for row in raw_inputs]
labels = np.clip(labels, 0.00421, 0.037)
min_label = np.min(labels)
max_label = np.max(labels)
scaled_labels = (labels - min_label)/(max_label - min_label)
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = max_sequence_length
VALIDATION_SPLIT = 0.3
GLOVE_DIR = './glove.twitter.27B'
EMBEDDING_DIM = 200

In [6]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(text_inputs)
sequences = tokenizer.texts_to_sequences(text_inputs)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', scaled_labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = scaled_labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
print('Shape of training tensor:', x_train.shape)
print('Shape of training labels:', y_train.shape)
print('Shape of testing tensor:', x_val.shape)
print('Shape of testing labels:', y_val.shape)

Found 38587 unique tokens.
Shape of data tensor: (3389, 3453)
Shape of label tensor: (3389,)
Shape of training tensor: (2373, 3453)
Shape of training labels: (2373,)
Shape of testing tensor: (1016, 3453)
Shape of testing labels: (1016,)


In [7]:
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.twitter.27B.200d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


In [8]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [9]:
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [71]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu', kernel_regularizer=l1_l2(l1=1.e-12, l2=1.e-12))(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu', kernel_regularizer=l1_l2(l1=1.e-12, l2=1.e-12))(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu', kernel_regularizer=l1_l2(l1=1.e-12, l2=1.e-12))(x)
x = MaxPooling1D(35)(x)  # global max pooling
x = Bidirectional(LSTM(max_sequence_length, activation='relu', kernel_regularizer=l1_l2(l1=1.e-12, l2=1.e-12), recurrent_regularizer=l1_l2(l1=1.e-12, l2=1.e-12), dropout=1.e-2))(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(1, activation='linear')(x)

model = Model(sequence_input, preds)
model.compile(loss=tf.keras.losses.MeanSquaredError(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=[tf.keras.metrics.RootMeanSquaredError()])

print(model.summary())

# happy learning!
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=10, batch_size=128)

Model: "model_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_24 (InputLayer)        [(None, 3453)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 3453, 200)         7717600   
_________________________________________________________________
conv1d_64 (Conv1D)           (None, 3449, 128)         128128    
_________________________________________________________________
max_pooling1d_63 (MaxPooling (None, 689, 128)          0         
_________________________________________________________________
conv1d_65 (Conv1D)           (None, 685, 128)          82048     
_________________________________________________________________
max_pooling1d_64 (MaxPooling (None, 137, 128)          0         
_________________________________________________________________
conv1d_66 (Conv1D)           (None, 133, 128)          820

<tensorflow.python.keras.callbacks.History at 0x144c20e50>

In [63]:
predictions = model.predict(x_val)
mse = tf.keras.losses.MeanSquaredError()
loss = mse(y_val, predictions)
print('Loss on validation set is %s' % loss.numpy())

Loss on validation set is 0.08382937


In [64]:
# More fine grained accuracy calculations
huge_score = np.squeeze(np.argwhere(y_val > 0.7))
print('Data points with score more than 0.7 %s:' % len(huge_score))
val_data = x_val[huge_score,:]
y_true = y_val[huge_score]
predictions = model.predict(val_data)
loss = mse(y_true, predictions)
print('Loss on validation set is %s' % loss.numpy())

Data points with score more than 0.7 153:
Loss on validation set is 0.24065192


In [65]:
# More fine grained accuracy calculations
huge_score = np.squeeze(np.argwhere(y_val < 0.3))
print('Data points with score less than 0.3 %s:' % len(huge_score))
val_data = x_val[huge_score,:]
y_true = y_val[huge_score]
predictions = model.predict(val_data)
loss = mse(y_true, predictions)
print('Loss on validation set is %s' % loss.numpy())

Data points with score less than 0.3 460:
Loss on validation set is 0.074392706


In [66]:
# More fine grained accuracy calculations
huge_score = np.squeeze(np.argwhere((y_val > 0.3) & (y_val < 0.7)))
print('Data points with score greater than 0.3 and less than 0.7 %s:' % len(huge_score))
val_data = x_val[huge_score,:]
y_true = y_val[huge_score]
predictions = model.predict(val_data)
loss = mse(y_true, predictions)
print('Loss on validation set is %s' % loss.numpy())

Data points with score greater than 0.3 and less than 0.7 403:
Loss on validation set is 0.023289338
