In [None]:
import json
import csv
import os
import argparse
import tensorflow as tf
import random
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import CuDNNLSTM
from keras.layers import Embedding
from numpy import array
import time
from ibm_watson import DiscoveryV1

In [None]:
def get_discovery_data(director, origin, year, genre):
    """Collect movie titles and plots from a Watson Discovery collection.

    Two queries are sent to the same environment/collection:

    1. An unfiltered query whose text is ``director``; only hits whose
       ``result_metadata.score`` is greater than 6 are kept.
    2. A query whose text is ``genre``, filtered with ``Origin:<origin>``;
       hits whose title has already been collected are skipped.

    Args:
        director: Query text for the first search.
        origin: Value appended to the ``Origin:`` filter of the second search.
        year: Not used by this function; kept so existing callers keep working.
        genre: Query text for the second search.

    Returns:
        A list of ``{'title': ..., 'plot': ...}`` dicts, director hits first.
    """
    # Enter Discovery ID here, or export the corresponding environment
    # variables so that credentials do not have to live in the notebook.
    env_id = os.environ.get('DISCOVERY_ENVIRONMENT_ID', '')
    col_id = os.environ.get('DISCOVERY_COLLECTION_ID', '')
    discovery = DiscoveryV1(
        version = '2018-08-01',
        # Enter Discovery Key (or export DISCOVERY_IAM_APIKEY)
        iam_apikey = os.environ.get('DISCOVERY_IAM_APIKEY', ' '),
        url = 'https://gateway.watsonplatform.net/discovery/api'
    )
    num = 100
    movies = []
    # Exact-match record of collected titles. The previous substring test
    # against str(movies) also matched titles that merely occurred inside a
    # stored plot (or inside a longer title) and dropped those movies.
    seen_titles = set()

    # Pass 1: free-text search on the director, keeping confident hits only.
    response = discovery.query(environment_id = env_id, collection_id = col_id, filter = None, query = director, count = num)
    for movie in response.result['results']:
        if movie['result_metadata']['score'] > 6:
            movies.append({'title': movie['Title'], 'plot': movie['Plot']})
            seen_titles.add(movie['Title'])

    # Pass 2: genre search restricted to the requested origin.
    response = discovery.query(environment_id = env_id, collection_id = col_id, filter = 'Origin:' + origin, query = genre, count = num)
    for movie in response.result['results']:
        movie_title = movie['Title']
        if movie_title in seen_titles:
            continue
        seen_titles.add(movie_title)
        movies.append({'title': movie_title, 'plot': movie['Plot']})

    return movies

In [None]:
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    """Extend ``seed_text`` with ``n_words`` greedily predicted words.

    On every step the text produced so far is encoded with ``tokenizer``,
    pre-padded/truncated to ``max_length`` tokens, fed to ``model`` and the
    word with the highest predicted score is appended.

    Args:
        model: Object exposing ``predict(encoded, verbose=0)`` that returns an
            array of per-class scores for each input row.
        tokenizer: Object exposing ``texts_to_sequences`` and ``word_index``
            (word -> integer index).
        max_length: Number of tokens the model takes as input.
        seed_text: Starting text.
        n_words: Number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. A predicted index that has no entry in ``word_index``
        contributes an empty word (i.e. just the separating space).
    """
    # Invert the vocabulary once instead of scanning it for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # Sequential.predict_classes() is deprecated and absent from newer
        # Keras/TensorFlow releases; argmax over predict() is the documented
        # replacement for multi-class (softmax) outputs.
        scores = model.predict(encoded, verbose=0)
        predicted_index = int(scores.argmax(axis=-1)[0])
        # map predicted word index to word and append to input
        in_text += ' ' + index_to_word.get(predicted_index, '')
    return in_text

def sequence_line(line, n):
    """Return every contiguous window of ``n`` items taken from ``line``.

    Windows are produced left to right with a stride of one, so a line of
    ``k >= n`` items yields ``k - n + 1`` windows, including the one that
    ends on the last item. (The upper bound used to be ``len(line)``, which
    silently dropped that final window and returned nothing for a line of
    exactly ``n`` items.)

    Args:
        line: Sliceable sequence (e.g. a list of token ids).
        n: Window length; expected to be a positive integer.

    Returns:
        A list of slices of ``line``, each of length ``n``. Empty when
        ``line`` holds fewer than ``n`` items.
    """
    return [line[end - n:end] for end in range(n, len(line) + 1)]

In [None]:
def generate_model (title, year, genre, director, origin):
    """Load a cached next-word model for ``title`` or train a new one.

    Steps performed:

    * Read ``movies.json`` from the working directory, one JSON object per
      line (blank lines are skipped).
    * Keep up to 100 movies whose ``"Genre"`` value contains at least one of
      the comma-separated names in ``genre`` (spaces are stripped first).
    * Add the plots returned by ``get_discovery_data``.
    * Fit a ``Tokenizer`` on all plots and cut the encoded plots into 3-token
      windows; the first two tokens are the input, the third is the target.
    * If ``<title without spaces>_3_1000_model.h5`` exists it is loaded;
      otherwise an Embedding -> LSTM -> softmax model is trained for 1000
      epochs and saved under that name.

    Args:
        title: Movie title; only used to build the cache file name.
        year: Year as a string (it is concatenated into a log message and
            converted with ``int``).
        genre: Comma-separated genre names.
        director: Forwarded to ``get_discovery_data``.
        origin: Forwarded to ``get_discovery_data``.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    ngram = 3
    epochs_num = 1000
    limit = 100
    gen = genre  # untouched text, used as the Discovery query
    genres = genre.replace(' ','').split(',')

    data = []
    # `with` closes the file even when a line fails to parse.
    with open('movies.json','r') as f:
        for raw_line in f:
            # A blank line used to be handed to json.loads and raised.
            if raw_line.strip():
                data.append(json.loads(raw_line))
    print("There are "+str(len(data))+" movies")

    # NOTE(review): the +-5 year window is only reported below; the genre
    # selection that follows still draws from `data`, exactly as before.
    # Confirm whether it was meant to draw from `year_data` instead.
    # The comparison is done on strings, as in the original - presumably
    # "Year" is stored as a 4-digit string; verify against movies.json.
    first_year = str(int(year)-5)
    end_year = str(int(year)+5)
    year_data = [a for a in data if a["Year"]>=first_year and a["Year"]<end_year]
    random.shuffle(year_data)
    print("There are "+year+"+-5 "+str(len(year_data))+" movies")

    # any(...) keeps each movie at most once. Appending once per matching
    # genre duplicated plots and let the duplicates consume the limit.
    limited_data = [
        pre_data for pre_data in data
        if any(g in pre_data["Genre"] for g in genres)
    ]
    limited_data = limited_data[0:limit]

    disdata = get_discovery_data(director, origin, year, gen)
    plots = [movie['Plot'] for movie in limited_data]
    plots.extend(d['plot'] for d in disdata)

    print("Using {0} movie plots".format(len(plots)))

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(plots)
    encoded = tokenizer.texts_to_sequences(plots)

    # +1 because the model's output layer also needs a slot for index 0,
    # which never appears as a value in word_index here.
    vocab_size = len(tokenizer.word_index)+1
    print('vocab size = {0}'.format(vocab_size))

    print('processing {0}_grams'.format(ngram))

    sequences = list()
    for line in encoded:
        sequences.extend(sequence_line(line, ngram))

    sequences = array(sequences)
    # All columns but the last are the context, the last one is the target.
    x, y = sequences[:,0:-1], sequences[:,-1]

    # The cache is keyed by title only. The tokenizer above is refit on every
    # call, so a cached model presumably only matches it while the plots are
    # unchanged - TODO confirm.
    filename = title.replace(' ','') + "_" + str(ngram) + "_" + str(epochs_num) + "_model.h5"
    if os.path.isfile(filename):
        model = load_model(filename)
    else:
        model = Sequential()
        model.add(Embedding(input_dim=vocab_size, output_dim=10, input_length = ngram-1))
        model.add(LSTM(50))
        model.add(Dense(vocab_size, activation='softmax'))
        model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.fit(x,y, epochs=epochs_num, verbose=2, batch_size=256)
        model.save(filename)

    return model, tokenizer

In [None]:
# Implementation
# Describe the movie whose style should be imitated.
title = 'Good Will Hunting'
director = 'Gus Van Sant'
origin = 'American'
year = '1997'
genre = 'Drama, Romance'.lower()
# Window size; the model is fed ngram-1 words and predicts the next one.
ngram = 3

model, tokenizer = generate_model (title, year, genre, director, origin)

# Each entry is (seed text, number of words to generate).
for seed_text, n_words in (('A rich man', 24), ('He loves', 24), ('They married in', 10)):
    print(generate_seq(model, tokenizer, ngram-1, seed_text, n_words))