In [1]:
!pip install pandas
!pip install numpy
!pip install tensorflow
!pip install scikit-learn



In [2]:
import pandas as pd
import os
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.layers import Embedding, Input, Conv1D, MaxPooling1D, Flatten, Dense
from sklearn.metrics import confusion_matrix, classification_report, f1_score

In [3]:
# Read all the data
labels = []
raw_inputs = []
df = pd.read_csv('./raw_data.csv')
max_frames = 0
max_words = 0
max_sequence_length = 0

for index, row in df.iterrows():
    path = os.path.join('./processed_video', 'audio_' + row['video_id'] + '_txt')
    
    if os.path.isfile(path):
        with open(path, 'r') as f:
            try:
                raw_inputs_video = []
                sequence_length = 0
                file_data = json.load(f)
                num_frames = len(file_data)
                
                if num_frames > max_frames:
                    max_frames = num_frames

                for i in range(num_frames):
                    line_data = json.loads(file_data[i])

                    if 'DisplayText' in line_data.keys():
                        sentence = line_data['DisplayText']
                        num_words = len(sentence.split())
                        sequence_length = sequence_length + num_words

                        if num_words > max_words:
                            max_words = num_words
                        raw_inputs_video.append(sentence)

                    del line_data
                
                raw_inputs.append(raw_inputs_video)
                labels.append((row['video_likeCount'] - row['video_dislikeCount'])/row['video_viewCount'])
                
                if sequence_length > max_sequence_length:
                    max_sequence_length = sequence_length

                del raw_inputs_video, file_data, sequence_length
            except ValueError:
                print(path + ' failed processing')

raw_inputs = np.array(raw_inputs)
labels = np.array(labels)
print()
print('Shape of raw_inputs: ', raw_inputs.shape)
print('Shape of labels: ', labels.shape)
print('Max Words: ', max_words)
print('Max Frames: ', max_frames)
print('Max Sequence Lenghth: ', max_sequence_length)
np.save('./text.npy', raw_inputs)

./processed_video/audio_xCgk9nvuCxk_txt failed processing
./processed_video/audio_Q_ouhkdo-ko_txt failed processing

Shape of raw_inputs:  (3389,)
Shape of labels:  (3389,)
Max Words:  172
Max Frames:  186
Max Sequence Lenghth:  3453


In [4]:
label_series = pd.Series(labels)
print(label_series.value_counts(bins=20).sort_index())

(-0.021500000000000002, -0.0122]       1
(-0.0122, -0.00398]                    7
(-0.00398, 0.00421]                  225
(0.00421, 0.0124]                   1094
(0.0124, 0.0206]                    1021
(0.0206, 0.0288]                     613
(0.0288, 0.037]                      264
(0.037, 0.0452]                      103
(0.0452, 0.0533]                      31
(0.0533, 0.0615]                      17
(0.0615, 0.0697]                       5
(0.0697, 0.0779]                       2
(0.0779, 0.0861]                       1
(0.0861, 0.0943]                       1
(0.0943, 0.102]                        0
(0.102, 0.111]                         1
(0.111, 0.119]                         1
(0.119, 0.127]                         0
(0.127, 0.135]                         1
(0.135, 0.143]                         1
dtype: int64


In [5]:
# Prepare data for cnn lstm by concatenating frames, normalizing labels and defining constants
text_inputs = [''.join(row) for row in raw_inputs]
labels = np.clip(labels, 0.00421, 0.037)
min_label = np.min(labels)
max_label = np.max(labels)
scaled_labels = (labels - min_label)/(max_label - min_label)

# convert labels to one-hot vectors
one_hot_labels = np.zeros((len(scaled_labels), 5))

for index, label in enumerate(scaled_labels):
    if label >= 0 and label <= 0.2:
        one_hot_labels[index, 0] = 1
    elif label > 0.2 and label <= 0.4:
        one_hot_labels[index, 1] = 1
    elif label > 0.4 and label <= 0.6:
        one_hot_labels[index, 2] = 1
    elif label > 0.6 and label <= 0.8:
        one_hot_labels[index, 3] = 1
    elif label > 0.8 and label <= 1:
        one_hot_labels[index, 4] = 1

one_hot_labels = np.array(one_hot_labels)

MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = max_sequence_length
VALIDATION_SPLIT = 0.3
GLOVE_DIR = './glove.twitter.27B'
EMBEDDING_DIM = 200

In [6]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(text_inputs)
sequences = tokenizer.texts_to_sequences(text_inputs)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', one_hot_labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = one_hot_labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
print('Shape of training tensor:', x_train.shape)
print('Shape of training labels:', y_train.shape)
print('Shape of testing tensor:', x_val.shape)
print('Shape of testing labels:', y_val.shape)

Found 38587 unique tokens.
Shape of data tensor: (3389, 3453)
Shape of label tensor: (3389, 5)
Shape of training tensor: (2373, 3453)
Shape of training labels: (2373, 5)
Shape of testing tensor: (1016, 3453)
Shape of testing labels: (1016, 5)


In [7]:
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.twitter.27B.200d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


In [8]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [9]:
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [None]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(60, 5, activation='relu', kernel_regularizer=l1_l2(l1=1.e-20, l2=1.e-20))(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(60, 5, activation='relu', kernel_regularizer=l1_l2(l1=1.e-20, l2=1.e-20))(x)
x = MaxPooling1D(5)(x)
x = Conv1D(60, 5, activation='relu', kernel_regularizer=l1_l2(l1=1.e-20, l2=1.e-20))(x)
x = MaxPooling1D(35)(x)  # global max pooling
x = Flatten()(x)
preds = Dense(5, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=[tf.keras.metrics.CategoricalAccuracy()])

print(model.summary())

# happy learning!
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=15, batch_size=128)

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 3453)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 3453, 200)         7717600   
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 3449, 60)          60060     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 689, 60)           0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 685, 60)           18060     
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 137, 60)           0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 133, 60)           1806

In [13]:
y_pred = np.argmax(model.predict(x_val), axis = 1)
y_true = np.argmax(y_val, axis = 1)
print(classification_report(y_true, y_pred, labels = [0,1,2,3,4], zero_division=0))
print()
print("The final F1 micro score for the model based one arly fusion is: ", f1_score(y_true, y_pred, average = 'micro', labels=[0,1,2,3,4]))
print()
print("Detailed Confusion Matrix")
print()
print(confusion_matrix(y_true, y_pred, labels = [0, 1, 2, 3, 4]))

              precision    recall  f1-score   support

           0       0.41      0.56      0.47       324
           1       0.26      0.19      0.22       267
           2       0.24      0.43      0.31       200
           3       0.12      0.01      0.02       122
           4       0.10      0.01      0.02       103

    accuracy                           0.32      1016
   macro avg       0.23      0.24      0.21      1016
weighted avg       0.27      0.32      0.27      1016


The final F1 micro score for the model based one arly fusion is:  0.3169291338582677

Detailed Confusion Matrix

[[183  58  78   3   2]
 [128  51  86   0   2]
 [ 68  40  86   3   3]
 [ 44  19  56   1   2]
 [ 26  27  48   1   1]]
