In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re
import random
from nltk import ngrams
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re

* First Name & Last Name Check - bigrams !!
* spell check
* Prepare dataset to extract actors and directors based on verbs

In [5]:
file_path = 'dataset/moviedata.csv'
root_path = 'datset/'

In [6]:
df = pd.read_csv(file_path)
df = df[['Rank','Title','Genre','Director','Actors']]

In [7]:
genres = set()
directors = set()
actors = set()
for index,row in df.iterrows():
  entry = row["Genre"]
  for e in entry.split(","):
    genres.add(e.lower().strip())
  entry = row["Director"]
  for e in entry.split(","):
    directors.add(e.lower().strip())
  entry = row["Actors"]
  for e in entry.split(","):
    actors.add(e.lower().strip())
print("Found %d genres"%len(genres))
print("Found %d directors"%len(directors))
print("Found %d actors"%len(actors))

Found 20 genres
Found 644 directors
Found 1985 actors


In [8]:
training_data = []
for entry in genres:
  training_data.append([entry,0])
for entry in actors:
  training_data.append([entry,1])
for entry in directors:
  training_data.append([entry,2])
training_data = np.array(training_data)
training_data,len(training_data)

(array([['adventure', '0'],
        ['animation', '0'],
        ['thriller', '0'],
        ...,
        ['jon s. baird', '2'],
        ['steven r. monroe', '2'],
        ['dennis gansel', '2']], dtype='<U32'),
 2649)

In [9]:
batch_size = 32
batch_constant = int(len(training_data)/batch_size)
epochs = 10
learning_rate = 0.0005
dropout = 0.3
max_features = 1000
embedding_dim = 128
sequence_length = 1
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)
vectorize_layer.adapt(training_data[:,0])

In [10]:
X_tr = training_data[:batch_constant*batch_size,0]
Y_tr = training_data[:batch_constant*batch_size,1].astype('int')

In [11]:
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

text_input = tf.keras.Input(shape=(1,), dtype=tf.string, name='text')
x = vectorize_layer(text_input)
x = layers.Embedding(max_features + 1, embedding_dim)(x)
x = layers.Dropout(dropout)(x)

# Conv1D + global max pooling
x = layers.Conv1D(embedding_dim, 1, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(embedding_dim, 1, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# We add a vanilla hidden layer:
x = layers.Dense(embedding_dim, activation="relu")(x)
x = layers.Dropout(dropout)(x)

# We project onto a single unit output layer, and squash it with a sigmoid:
y_pred = layers.Dense(3, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(text_input, y_pred)

# Compile the model with binary crossentropy loss and an adam optimizer.
model.compile(loss="sparse_categorical_crossentropy", optimizer=Adam(lr=learning_rate), metrics=["accuracy"])

In [12]:
# model.fit(np.array(X_tr),Y_tr, batch_size=batch_size,epochs=epochs,verbose=1)

In [13]:
# model.predict(["vin diesel"])

In [14]:
data = df.copy()
data=data.set_index(data.columns.drop('Genre',1).tolist()).Genre.str.split(',', expand=True).stack().reset_index().rename(columns={0:'Genre'}).loc[:, data.columns]
data=data.set_index(data.columns.drop('Director',1).tolist()).Director.str.split(',', expand=True).stack().reset_index().rename(columns={0:'Director'}).loc[:, data.columns]
data=data.set_index(data.columns.drop('Actors',1).tolist()).Actors.str.split(',', expand=True).stack().reset_index().rename(columns={0:'Actors'}).loc[:, data.columns]

In [15]:
data

Unnamed: 0,Rank,Title,Genre,Director,Actors
0,1,Guardians of the Galaxy,Action,James Gunn,Chris Pratt
1,1,Guardians of the Galaxy,Action,James Gunn,Vin Diesel
2,1,Guardians of the Galaxy,Action,James Gunn,Bradley Cooper
3,1,Guardians of the Galaxy,Action,James Gunn,Zoe Saldana
4,1,Guardians of the Galaxy,Adventure,James Gunn,Chris Pratt
...,...,...,...,...,...
10213,1000,Nine Lives,Family,Barry Sonnenfeld,Cheryl Hines
10214,1000,Nine Lives,Fantasy,Barry Sonnenfeld,Kevin Spacey
10215,1000,Nine Lives,Fantasy,Barry Sonnenfeld,Jennifer Garner
10216,1000,Nine Lives,Fantasy,Barry Sonnenfeld,Robbie Amell


In [105]:
data["Query"] = data["Genre"].str.cat(data[["Director", "Actors"]].astype(str), sep=" ")
training_data = data[["Title", "Query"]]
#training_data["similarity"] = 1

In [106]:
import itertools
lists = [movies,queries]
combo = pd.DataFrame(list(itertools.product(*lists)), columns=['Title','Query'])
#combo['Similarity'] = 0

In [107]:
combo

Unnamed: 0,Title,Query
0,"Hail, Caesar!",Drama Quentin Tarantino Christoph Waltz
1,"Hail, Caesar!",Crime Jean-François Richet Erin Moriarty
2,"Hail, Caesar!",Action Duncan Jones Paula Patton
3,"Hail, Caesar!",Action Mario Van Peebles Thomas Jane
4,"Hail, Caesar!",Thriller Fede Alvarez Jane Levy
...,...,...
9908077,The Judge,Sci-Fi Darren Aronofsky Ellen Burstyn
9908078,The Judge,Drama Alexi Pappas Chase Offerle
9908079,The Judge,Animation John Lasseter Emily Mortimer
9908080,The Judge,Thriller Christopher Smith Bel Powley


In [108]:
training_data

Unnamed: 0,Title,Query
0,Guardians of the Galaxy,Action James Gunn Chris Pratt
1,Guardians of the Galaxy,Action James Gunn Vin Diesel
2,Guardians of the Galaxy,Action James Gunn Bradley Cooper
3,Guardians of the Galaxy,Action James Gunn Zoe Saldana
4,Guardians of the Galaxy,Adventure James Gunn Chris Pratt
...,...,...
10213,Nine Lives,Family Barry Sonnenfeld Cheryl Hines
10214,Nine Lives,Fantasy Barry Sonnenfeld Kevin Spacey
10215,Nine Lives,Fantasy Barry Sonnenfeld Jennifer Garner
10216,Nine Lives,Fantasy Barry Sonnenfeld Robbie Amell


In [82]:
i=0
for index, row in training_data.iterrows():
    movie = row['Title']
    query = row['Query']
    row_indexer = combo[((combo['Title'] == movie) & (combo['Query'] == query))].index
    combo.loc[row_indexer,'Similarity'] = 1
    i += 1
    clear_output()
    print("Iteration : %d"% i, flush=True) 

Iteration : 48


KeyboardInterrupt: 

In [66]:
((combo['Title'] == 'Hail, Caesar!') & (combo['Query'] == 'Crime Jean-François Richet  Erin Moriarty')).any()

True

In [78]:
row_indexer = combo[((combo['Title'] == 'Hail, Caesar!') & (combo['Query'] == 'Crime Jean-François Richet  Erin Moriarty'))].index
combo.loc[row_indexer,'Similarity'] = 1

In [None]:
import sys
from IPython.display import clear_output 

movies = set(df['Title'])
queries = set(data['Query'])
i = 0
for query in queries:
  for movie in movies:
    if not ((training_data['Title'] == movie) & (training_data['Query'] == query)).any():
      training_data.loc[len(training_data)] = [movie,query,0]
    i += 1
    clear_output()
    print("Iteration : %d"% i, flush=True) 

In [None]:
training_data

In [None]:
training_data.to_csv(root_path+"movie_prep_data.csv")

In [None]:
((data['Title'] == 'Guardians of the Galaxy') & (data['Query'] == 'Action James Gunn Chris Pratt')).any()

In [84]:
combo[((combo['Title'] == training_data['Title']) & (combo['Query'] == training_data['Query']))].index

ValueError: Can only compare identically-labeled Series objects

In [36]:
  text_input = tf.keras.Input(shape=(1,), dtype=tf.string, name='text')
x = vectorize_layer(text_input)
x = layers.Embedding(max_features + 1, embedding_dim)(x)
x = layers.Dropout(dropout)(x)
lstm_layer = layers.Bidirectional(tf.keras.layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2))
emb = layers.Embedding(max_features + 1, embedding_dim)(x)
input1 = tf.keras.Input(shape=(300,))
e1 = emb(input1)
x1 = lstm_layer(e1)
input2 = tf.keras.Input(shape=(300,))
e2 = emb(input2)
x2 = lstm_layer(e2)
mhd = lambda x: tf.keras.backend.abs(x[0] - x[1])
merged = tf.keras.layers.Lambda(function=mhd, output_shape=lambda x: x[0],
name='L1_distance')([x1, x2])
preds = tf.keras.layers.Dense(1, activation='sigmoid')(merged)
model = tf.keras.Model(inputs=[input1, input2], outputs=preds)
model.compile(loss='mse', optimizer='adam')