In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import numpy as np
import os
import pandas as pd
from PIL import Image
# function for getting full transcript from vtt
import webvtt

In [2]:
import pandas as pd
# Load the dataset; the first CSV column is used as the row index.
finished_df = pd.read_csv('finished_df.csv', index_col=0)

In [8]:
# define functions

def img_to_arr(input_image, size : int = 224 ) -> np.ndarray:
    """Resize a PIL image to ``size`` x ``size`` and return it as a NumPy array.

    The image is converted to RGB first so that grayscale, palette and RGBA
    inputs all end up with the same three-channel layout.

    Args:
        input_image: PIL image (any object exposing ``convert`` and ``resize``).
        size: Edge length, in pixels, of the square output.

    Returns:
        Array of shape ``(size, size, 3)`` holding the resized RGB pixels.
    """
    rgb_image = input_image.convert("RGB")
    return np.asarray(rgb_image.resize((size, size)))

def folder_to_array(foldername : str, size: int = 224) -> dict:
    """Load every readable image in ``foldername`` as a ``size`` x ``size`` array.

    Args:
        foldername: Directory to scan (not recursive).
        size: Edge length passed on to ``img_to_arr``.

    Returns:
        Dict mapping each image's file name to its array. Files that cannot be
        opened or converted are reported on stdout and left out of the result.
    """
    arrays = dict()
    # scandir is used as a context manager so the directory handle is released.
    with os.scandir(foldername) as entries:
        for image_file in entries:
            if not image_file.is_file():
                continue  # sub-directories cannot be images
            try:
                # entry.path joins with the platform separator (a hard-coded
                # "\\" only works on Windows); `with` closes the file handle.
                with Image.open(image_file.path) as image_memory:
                    arrays[image_file.name] = img_to_arr(image_memory, size)
            except Exception as e:
                # Best effort: one bad file must not abort the whole load.
                print(f"{image_file.name}: {e}")

    return arrays

In [None]:
# Load every image in the 'thumbnails' folder, keyed by file name (default size 224).
thumbnails_dict = folder_to_array('thumbnails')

In [13]:
thumbnails_dict

{'--6KU6oa9-Y.jpg': array([[[  0,   0,   0],
         [  0,   0,   0],
         [  0,   0,   0],
         ...,
         [  0,   0,   0],
         [  0,   0,   0],
         [  1,   0,   0]],
 
        [[  0,   0,   0],
         [  0,   0,   0],
         [  0,   0,   0],
         ...,
         [  0,   0,   0],
         [  0,   0,   0],
         [  1,   0,   0]],
 
        [[  0,   0,   0],
         [  0,   0,   0],
         [  0,   0,   0],
         ...,
         [  0,   0,   0],
         [  0,   0,   0],
         [  1,   0,   0]],
 
        ...,
 
        [[105, 108,  96],
         [110, 112, 101],
         [110, 113, 103],
         ...,
         [115, 117, 114],
         [ 91,  92,  90],
         [ 43,  44,  41]],
 
        [[ 66,  36,  19],
         [ 64,  34,  18],
         [ 65,  35,  17],
         ...,
         [ 29,  23,  21],
         [ 26,  20,  18],
         [ 16,  10,   8]],
 
        [[ 79,  29,   1],
         [ 77,  28,   0],
         [ 78,  29,   0],
         ...,
         

In [14]:
# Vectorised conversion: stance labels that cannot be parsed become NaN and are
# then filled with 1 (on the original -1/0/1 scale, i.e. before the +1 shift).
finished_df['stance'] = pd.to_numeric(finished_df['stance'], errors='coerce').fillna(1)
finished_df['stance']

0       0
1       0
2       0
3       0
4       0
       ..
2667    1
2668    1
2669   -1
2670    0
2671   -1
Name: stance, Length: 2672, dtype: int64

In [15]:
# Shift the labels by one so the values -1, 0, 1 shown above become 0, 1, 2.
# NOTE(review): this cell is not idempotent — every re-run adds another 1.
finished_df['stance'] = finished_df['stance'] + 1

In [16]:
# Store the transcripts with pandas' 'string' dtype, then drop every row that
# still has a missing value in any column.
finished_df['annotation'] = finished_df['annotation'].astype('string')
finished_df = finished_df.dropna()
finished_df['annotation']

0        ten years after 9/11 a new report out just th...
1        [Music] it happened outside Waco Texas a heav...
2        thanks for coming it's nice to see a good tur...
3        &gt;&gt; Tonight... &gt;&gt; I, Donald John T...
4        - This week on Buzzfeed Unsolved, we discuss ...
                              ...                        
2666     astronauts moving in slow motion is another h...
2667     dr. david groves is a physicist in the UK and...
2668     - Hi, I'm Matt. And I don't believe we landed...
2670     hi everybody what i'd like to talk about for ...
2671     Apparently, there's an organization called "N...
Name: annotation, Length: 2664, dtype: string

In [17]:
finished_df

Unnamed: 0,annotation,stance
0,ten years after 9/11 a new report out just th...,1
1,[Music] it happened outside Waco Texas a heav...,1
2,thanks for coming it's nice to see a good tur...,1
3,"&gt;&gt; Tonight... &gt;&gt; I, Donald John T...",1
4,"- This week on Buzzfeed Unsolved, we discuss ...",1
...,...,...
2666,astronauts moving in slow motion is another h...,2
2667,dr. david groves is a physicist in the UK and...,2
2668,"- Hi, I'm Matt. And I don't believe we landed...",2
2670,hi everybody what i'd like to talk about for ...,1


In [8]:
# exploration
# One indicator column per stance code, relabelled with the class names.
stance_names = {0: 'promote', 1: 'neutral', 2: 'debunk'}
onehot = pd.get_dummies(finished_df['stance']).rename(columns=stance_names)
onehot

Unnamed: 0,promote,neutral,debunk
0,False,True,False
1,False,True,False
2,False,True,False
3,False,True,False
4,False,True,False
...,...,...,...
2666,False,False,True
2667,False,False,True
2668,False,False,True
2670,False,True,False


In [9]:
# Copy each indicator column from `onehot` onto the main frame.
for label in ('promote', 'neutral', 'debunk'):
    finished_df[label] = onehot[label]
finished_df

Unnamed: 0,annotation,stance,promote,neutral,debunk
0,ten years after 9/11 a new report out just th...,1,False,True,False
1,[Music] it happened outside Waco Texas a heav...,1,False,True,False
2,thanks for coming it's nice to see a good tur...,1,False,True,False
3,"&gt;&gt; Tonight... &gt;&gt; I, Donald John T...",1,False,True,False
4,"- This week on Buzzfeed Unsolved, we discuss ...",1,False,True,False
...,...,...,...,...,...
2666,astronauts moving in slow motion is another h...,2,False,False,True
2667,dr. david groves is a physicist in the UK and...,2,False,False,True
2668,"- Hi, I'm Matt. And I don't believe we landed...",2,False,False,True
2670,hi everybody what i'd like to talk about for ...,1,False,True,False


In [10]:
# Label matrix: one row per sample, one column per stance class.
y = onehot.to_numpy()
y

array([[False,  True, False],
       [False,  True, False],
       [False,  True, False],
       ...,
       [False, False,  True],
       [False,  True, False],
       [ True, False, False]])

In [11]:
from sklearn.model_selection import train_test_split

# Fixed seed so that Restart & Run All reproduces the same train/test split.
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    finished_df['annotation'], y, test_size=0.15, shuffle=True,
    random_state=SPLIT_SEED,
)

In [12]:
X_train

1827     vaccine-preventable diseases like polio have ...
575      We're gonna talk about the media, specificall...
1493     [Music] lived off in less than a century Amer...
141      on 911 dr. Benjamin luff prepared Stony Brook...
1970     So I was watching Hannity's show a few weeks ...
                              ...                        
409      oh honey nobody just sounded [Music] on a per...
1551     so is uh come on oh hey columbia columbia hou...
2565     you by a British man in speaking welcome peop...
148      (heart beating) (siren wailing) - This week o...
2119     [Applause] [Music] hello my name is Gavin McI...
Name: annotation, Length: 2264, dtype: string

In [13]:
# TF Hub layers: the preprocessing model that turns raw strings into BERT
# inputs, and the uncased English BERT encoder (L-12_H-768_A-12).
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [14]:
# textual element

# bert layers: a batch of raw strings goes through the preprocessing layer
# and then the encoder
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# nn layers: dropout over the encoder's 'sequence_output', followed by an
# LSTM with 64 units (the 'pooled_output' variant is kept commented out)
# l1 = tf.keras.layers.Dropout(0.1)(outputs['pooled_output'])
l1 = tf.keras.layers.Dropout(0.1)(outputs['sequence_output'])
l2 = tf.keras.layers.LSTM(units=64)(l1)
# l3 = tf.keras.layers.Dropout(0.1)(l2)

# TODO: change the output layer after adjusting the architecture
caption_model = tf.keras.Model(inputs = [text_input], outputs = [l2])

In [None]:
feature_extractor = hub.KerasLayer("https://tfhub.dev/google/imagenet/resnet_v1_101/feature_vector/5")

# image element
# Input() needs an explicit shape; 224x224 RGB matches the default size used by
# img_to_arr / folder_to_array.
# NOTE(review): TF Hub image feature vectors conventionally expect float pixels
# scaled to [0, 1], while the thumbnails are loaded as 0-255 values — confirm
# and rescale before training.
image_input = tf.keras.layers.Input(shape=(224, 224, 3), dtype=tf.float32)
feature_extraction_layer = feature_extractor(image_input)

image_model = tf.keras.Model(inputs = [image_input], outputs = [feature_extraction_layer])

In [None]:
# concatenation
# Concatenate is a layer: instantiate it, then call it on the two branch
# outputs (the text LSTM output and the image feature vector).
combination_layer = tf.keras.layers.Concatenate()([l2, feature_extraction_layer])
# NOTE(review): this hidden layer has no activation, so it is purely linear —
# consider adding one (e.g. relu).
hidden_dense = tf.keras.layers.Dense(units=64)(combination_layer)
# Classify from the fused features. This was previously wired to the text-only
# `l2`, which left the image branch and hidden_dense unused. No activation is
# set, so the three outputs are raw logits.
combined_output = tf.keras.layers.Dense(units=3)(hidden_dense)

combined_model = tf.keras.Model(inputs = [image_input, text_input], outputs = [combined_output])

In [16]:
combined_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_mask': (Non  0           ['input_1[0][0]']                
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [17]:
# from_logits=True because the final Dense(units=3) layer sets no activation;
# the categorical form matches the one-hot encoded labels in `y`.
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

In [18]:
# Configure training: Adam optimizer, the `loss` object, accuracy as the metric.
combined_model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

In [19]:
# Cast the one-hot training labels to int64.
y_train = np.array(y_train, dtype='int64')

In [20]:
y_train

array([[1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       ...,
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 0]], dtype=int64)

In [22]:
# TODO: build the image batch (same row order as X_train) from thumbnails_dict;
# fit() cannot run while this placeholder is None.
X_train_images = None
# Inputs are matched by position, so they must follow the order used when
# combined_model was built: inputs=[image_input, text_input].
history = combined_model.fit([X_train_images, X_train], y_train)

