In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.layers import Input, Lambda, Dense
from tensorflow.keras import layers
from tensorflow.keras.models import Model
import keras.backend as K

Using TensorFlow backend.


In [2]:
# Load the movie dataset; later cells read its `tags` and `plot_synopsis` columns.
df = pd.read_csv("../data/mpst_full_data.csv")

In [3]:
# Universal Sentence Encoder v4, loaded from a local directory
# (presumably a downloaded copy of the TF Hub model below — confirm).
# Source: https://tfhub.dev/google/universal-sentence-encoder/4
embed = hub.KerasLayer("../models/use/")

In [4]:
def UniversalEmbeddingfn(x):
    """Embed a column of strings with the module-level ``embed`` encoder.

    Args:
        x: array-like of strings with shape (batch, 1), one string per row.

    Returns:
        Whatever ``embed`` returns for the 1-D batch of strings.
    """
    # Squeeze only axis 1. A bare tf.squeeze removes every size-1 dimension,
    # so a single-row (1, 1) batch would collapse to a scalar instead of a
    # 1-D batch of one string.
    return embed(tf.squeeze(tf.cast(x, tf.string), axis=1))

In [6]:
# Three sample strings, one per row, used to try out the embedding code below.
inputs = np.array([["comedy"], ["funny"],["http://example.com"]])
inputs.shape

(3, 1)

In [7]:
# Smoke test: embed the three sample strings with the plain function.
UniversalEmbeddingfn(inputs)

<tf.Tensor: shape=(3, 512), dtype=float32, numpy=
array([[-0.05970578,  0.03881067,  0.05916519, ..., -0.05454944,
         0.0762557 ,  0.0176069 ],
       [-0.0169152 ,  0.04611888,  0.03401598, ..., -0.06357985,
         0.01241454,  0.01053265],
       [-0.00988075, -0.00547772,  0.05957529, ..., -0.02145737,
         0.03282751, -0.04534928]], dtype=float32)>

In [8]:
class UniversalEmbedding(layers.Layer):
    """Keras layer that feeds a (batch, 1) string input through a sentence encoder.

    Args:
        embed: callable applied to the 1-D batch of strings (here a
            ``hub.KerasLayer``).
        **kwargs: standard Keras layer arguments (e.g. ``name``), forwarded
            to ``layers.Layer``.
    """

    def __init__(self, embed, **kwargs):
        # Forward the standard layer arguments so callers can set e.g. `name`.
        super(UniversalEmbedding, self).__init__(**kwargs)
        self.embed = embed

    def call(self, x):
        """Drop the size-1 axis of ``x`` and return the encoder's output."""
        # Squeeze axis 1 only, so a batch of one stays a 1-D batch.
        sentences = tf.squeeze(tf.cast(x, tf.string), axis=1)
        return self.embed(sentences)

In [9]:
# Check the preprocessing used inside the layer: squeezing axis 1 turns the
# (3, 1) string array into a 1-D tensor of 3 strings.
# To try a single-row batch instead: x = np.array([inputs[0]])
x = inputs
tf.squeeze(tf.cast(x, tf.string), axis=1)

<tf.Tensor: shape=(3,), dtype=string, numpy=array([b'comedy', b'funny', b'http://example.com'], dtype=object)>

In [11]:
# Run the layer eagerly on a batch holding only the first sample string.
# (To try all three samples instead: y = linear_layer(inputs))
linear_layer = UniversalEmbedding(embed)
y = linear_layer(np.array([inputs[0]]))

In [12]:
# Classifier: one string per example -> sentence embedding -> 256-unit ReLU
# layer -> softmax over 10 outputs.
input_text = Input(shape=(1,), dtype=tf.string)
embedding = UniversalEmbedding(embed)(input_text)
# Earlier Lambda-based variant, kept for reference:
# embedding = Lambda(UniversalEmbeddingfn, output_shape=(512, ))(input_text)
dense = Dense(256, activation='relu')(embedding)
# NOTE(review): 10 is hard-coded; presumably it must equal the number of
# genres in bit_mapping — confirm.
pred = Dense(10, activation='softmax')(dense)
model = Model(inputs=[input_text], outputs=pred)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
# Inspect layer output shapes and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
universal_embedding_2 (Unive (None, 512)               256797824 
_________________________________________________________________
dense (Dense)                (None, 256)               131328    
_________________________________________________________________
dense_1 (Dense)              (None, 10)                2570      
Total params: 256,931,722
Trainable params: 133,898
Non-trainable params: 256,797,824
_________________________________________________________________


In [14]:
# Flatten every movie's ", "-separated tag string into one list of raw tags
# (repeats kept, in dataset order).
tags = [raw_tag for tag_string in df.tags for raw_tag in tag_string.split(", ")]

In [15]:
# Coarse genre -> raw dataset tags folded into that genre.
mapping = {
    "action": ["action", "good versus evil", "revenge", "suicidal"],
    "comedy": ["comedy", "comic", "humor", "prank", "satire"],
    "cult": ["cult", "melodrama"],
    "fantasy": ["avant garde", "fantasy", "home movie", "magical realism", "whimsical"],
    "thriller": [
        "clever", "insanity", "intrigue", "murder",
        "mystery", "neo noir", "psychological", "suspenseful",
    ],
    "horror": ["gothic", "grindhouse film", "haunting", "horror", "paranormal"],
    "romantic": ["cute", "queer", "romantic"],
    "sci-fi": ["alternate reality", "atmospheric", "sci-fi"],
    "drama": [
        "absurd", "boring", "adult comedy", "autobiographical", "dramatic",
        "entertaining", "feel-good", "flashback", "historical", "historical fiction",
        "inspiring", "non fiction", "philosophical", "plot twist", "realism",
        "sentimental", "storytelling", "stupid", "thought-provoking", "tragedy",
        "western", "christian film",
    ],
    "dark": [
        "bleak", "claustrophobic", "blaxploitation", "cruelty", "dark",
        "depressing", "sadist", "violence", "psychedelic",
    ],
}

In [16]:
def tag_finder(mapping, tag):
    """Return the first key of ``mapping`` whose value collection contains ``tag``.

    Keys are checked in the dict's iteration order. Returns ``None`` when no
    value contains the tag.
    """
    return next(
        (genre for genre, members in mapping.items() if tag in members),
        None,
    )

In [17]:
# Translate each movie's ", "-separated raw tags into the coarse genres of
# `mapping`. Raw tags with no genre are dropped; repeated genres are kept.
# The column is built in a single assignment: writing row by row with
# `df.tags_final[i] = ...` is chained assignment, which pandas may apply to a
# temporary copy (SettingWithCopyWarning), and it treated the loop position
# as an index label.
df["tags_final"] = df["tags"].apply(
    lambda raw_tags: [
        genre
        for genre in (tag_finder(mapping, raw) for raw in raw_tags.split(", "))
        if genre is not None
    ]
)

In [18]:
# Drop every movie whose mapped tag list came out empty.
df = df.loc[df.tags_final.apply(len) != 0]

In [19]:
# Genre -> integer index; the list order fixes the index of each genre.
bit_mapping = {
    genre: position
    for position, genre in enumerate(
        [
            "action", "comedy", "cult", "fantasy", "thriller",
            "horror", "romantic", "sci-fi", "drama", "dark",
        ]
    )
}

In [20]:
# Zero matrix with one row per row of df and one column per entry of bit_mapping.
y_targets = np.zeros((len(df), len(bit_mapping)))

In [21]:
# Turn each movie's genre list into a soft-label row: every listed genre adds
# 1/len(list) to its column, so repeated genres weigh more and each row sums to 1.
# Reset first so re-running this cell does not add the weights a second time.
y_targets[:] = 0
# The loop variable is deliberately not called `tags`, so the flat `tags`
# list built earlier is not overwritten.
for row, movie_genres in enumerate(df.tags_final):
    share = 1 / len(movie_genres)
    for genre in movie_genres:
        y_targets[row, bit_mapping[genre]] += share

In [22]:
# Cap each synopsis at its first 1000 characters; shorter ones pass through whole.
texts = [synopsis[0:1000] for synopsis in df.plot_synopsis]

In [23]:
# Model input: the truncated synopses as a NumPy array.
# This rebinds `x`, which an earlier cell used for the sample strings.
x = np.array(texts)

In [24]:
# Train on all synopses for 12 epochs in batches of 32.
# NOTE(review): no validation data is passed, so the metrics reported during
# training are measured on the training set only.
model.fit(x, y_targets, batch_size=32, epochs=12)

Train on 14719 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<tensorflow.python.keras.callbacks.History at 0x7facf4091f50>

In [25]:
# Model outputs for the same synopses that were passed to model.fit.
y_pred = model.predict(x)
y_pred 

array([[0.06352345, 0.0251822 , 0.0918977 , ..., 0.01170416, 0.11871923,
        0.1569769 ],
       [0.256484  , 0.01851137, 0.06536669, ..., 0.00804719, 0.04597718,
        0.32175803],
       [0.05428148, 0.10323779, 0.02313492, ..., 0.04784635, 0.1588573 ,
        0.04198252],
       ...,
       [0.10741819, 0.0275405 , 0.09650864, ..., 0.02954132, 0.12667611,
        0.19399863],
       [0.0780264 , 0.02716865, 0.03704562, ..., 0.00568172, 0.09172072,
        0.0977466 ],
       [0.21965103, 0.06991829, 0.05230707, ..., 0.0032561 , 0.25996262,
        0.08700375]], dtype=float32)

In [27]:
# Predict for one synopsis (position 25 of x), wrapped in an array so it forms a batch of one.
model.predict(np.array([x[25]]))

array([[0.1713618 , 0.03452521, 0.08965068, 0.05014998, 0.13980821,
        0.01821835, 0.02472308, 0.02285463, 0.1425099 , 0.30619818]],
      dtype=float32)