In [1]:
import json

data_path = './Artificial-Intelligence/AI_reboot/Keras/Datas/RNN/wp_movies_10k.ndjson'

with open(data_path) as fin:
    movies = [json.loads(rep) for rep in fin]


In [48]:
from collections import Counter
link_counts = Counter()

for movie in movies:
    link_counts.update(movie[2])
link_counts.most_common(3)

[('Rotten Tomatoes', 9393),
 ('Category:English-language films', 5882),
 ('Category:American films', 5867)]

In [49]:
top_links = [link for link, c in link_counts.items() if c>=3]
link_2_idx = {link: idx for idx, link in enumerate(top_links)}
movie_2_idx = {movie[0] : idx for idx, movie in enumerate(movies)}

In [51]:
pair = []
for movie in movies:
    pair.extend((link_2_idx[link], movie_2_idx[movie[0]])
                for link in movie[2] if link in link_2_idx)
pair_set = set(pair)

In [52]:
from keras.layers import Input, Embedding, Reshape
from keras.models import Model
import numpy as np
from keras.layers.merge import Dot
import random

def movie_embedding_model(embedding_size=30):
    link = Input(name = 'link', shape=(1,))
    movie = Input(name = 'movie', shape = (1,))
    link_embedding = Embedding(name = 'link_embedding', input_dim = len(top_links), output_dim=embedding_size)(link)
    movie_embedding = Embedding(name = 'movie_embedding', input_dim = len(movie_2_idx), output_dim=embedding_size)(movie)
    dot = Dot(name = 'dot_product',axes = 2, normalize=True)([link_embedding, movie_embedding])
    merged = Reshape((1,))(dot)
    model = Model(inputs=[link, movie], outputs=[merged])
    model.compile(optimizer='nadam', loss='mse')
    return model

model = movie_embedding_model()
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
link (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
movie (InputLayer)              (None, 1)            0                                            
__________________________________________________________________________________________________
link_embedding (Embedding)      (None, 1, 30)        2007390     link[0][0]                       
__________________________________________________________________________________________________
movie_embedding (Embedding)     (None, 1, 30)        300000      movie[0][0]                      
____________________________________________________________________________________________

In [53]:
random.seed(5)

def batchifier(pair, positive_samples = 50, negative_ratio = 5):
    batch_size = positive_samples*(1+negative_ratio)
    batch = np.zeros((batch_size, 3))
    
    while True:
        for idx, (link_id, movie_id) in enumerate(random.sample(pair, positive_samples)):
            batch[idx, :] = (link_id, movie_id, 1)
        idx = positive_samples
        
        while idx < batch_size:
            movie_id = random.randrange(len(movie_2_idx))
            link_id = random.randrange(len(top_links))
            
            if not (link_id, movie_id) in pair_set:
                batch[idx, :] = (link_id, movie_id, -1)
                idx += 1
            np.random.shuffle(batch)
            yield {'link': batch[:, 0], 'movie': batch[:,1]}, batch[:,2]
                

In [18]:
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config = config)

positive_sample_per_batch = 512
model.fit_generator(batchifier(pair, positive_samples = positive_sample_per_batch, negative_ratio=10),
                    epochs = 25, steps_per_epoch=len(pair) // positive_sample_per_batch,
                    verbose=2)

Epoch 1/25


 - 32s - loss: 0.0012


Epoch 2/25


 - 31s - loss: 0.0012


Epoch 3/25


 - 31s - loss: 0.0013


Epoch 4/25


 - 31s - loss: 0.0015


Epoch 5/25


 - 31s - loss: 0.0012


Epoch 6/25


 - 31s - loss: 0.0015


Epoch 7/25


 - 31s - loss: 0.0016


Epoch 8/25


 - 31s - loss: 0.0014


Epoch 9/25


 - 31s - loss: 0.0015


Epoch 10/25


 - 31s - loss: 0.0014


Epoch 11/25


 - 31s - loss: 0.0014


Epoch 12/25


 - 31s - loss: 0.0018


Epoch 13/25


 - 31s - loss: 0.0013


Epoch 14/25


 - 31s - loss: 0.0017


Epoch 15/25


 - 34s - loss: 0.0019


Epoch 16/25


 - 31s - loss: 0.0013


Epoch 17/25


 - 32s - loss: 0.0015


Epoch 18/25


 - 33s - loss: 0.0022


Epoch 19/25


 - 36s - loss: 0.0016


Epoch 20/25


 - 40s - loss: 0.0028


Epoch 21/25


 - 36s - loss: 0.0020


Epoch 22/25


 - 39s - loss: 0.0018


Epoch 23/25


 - 32s - loss: 0.0021


Epoch 24/25


 - 34s - loss: 0.0017


Epoch 25/25


 - 33s - loss: 0.0018


<keras.callbacks.History at 0x1f660517320>

In [54]:
movie = model.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]

lens = np.linalg.norm(movie_weights, axis=1)
normalized = (movie_weights.T/lens).T

lens, normalized

(array([0.17623569, 0.15901437, 0.14038229, ..., 0.17390047, 0.15246683,
        0.15833506], dtype=float32),
 array([[-0.04233225, -0.19315684,  0.27852142, ...,  0.17255147,
         -0.17729752,  0.2398816 ],
        [-0.03761582,  0.16209969,  0.10093085, ..., -0.17638569,
          0.219261  ,  0.26952153],
        [-0.02847338,  0.264888  ,  0.05882081, ...,  0.28944448,
         -0.29681388, -0.28397623],
        ...,
        [ 0.20009045,  0.22480172, -0.17271577, ...,  0.28425655,
         -0.26119947,  0.03757563],
        [ 0.31772807,  0.12799498,  0.17518868, ...,  0.14878696,
          0.24458723, -0.06421117],
        [ 0.0102815 , -0.25760958,  0.03863136, ...,  0.12341172,
         -0.00209643, -0.27583605]], dtype=float32))

In [55]:
def neighbors(movie):
    digits = np.dot(normalized, normalized[movie_2_idx[movie]])
    closest = np.argsort(digits)[-10:]
    for rep in reversed(closest):
        print(rep, movies[rep][0], digits[rep])
neighbors('The Danish Girl (film)')

27 The Danish Girl (film) 1.0
1885 Push (2009 film) 0.694112
9693 The Banger Sisters 0.6342791
8095 Justin Bieber's Believe 0.6220367
1398 Weird Science (film) 0.5574963
2805 Dorian Gray (2009 film) 0.5505457
6068 The Mirror Crack'd 0.54453766
6677 Luck (film) 0.54203284
2779 American Pastoral (film) 0.5384179
4967 Last Man Standing (film) 0.536665


In [56]:
best = ['Star Wars: The Force Awakens', 'The Martian (film)', 'Tangerine (film)', 'Straight Outta Compton (film)',
        'Brooklyn (film)', 'Carol (film)', 'Spotlight (film)']
worst = ['American Ultra', 'The Cobbler (2014 film)', 'Entourage (film)', 'Fantastic Four (2015 film)',
         'Get Hard', 'Hot Pursuit (2015 film)', 'Mortdecai (film)', 'Serena (2014 film)', 'Vacation (2015 film)']

In [57]:
y = np.asarray([1 for _ in best]+[0 for _ in worst])
x = np.asarray([normalized[movie_2_idx[movie]] for movie in best+worst])

from sklearn import svm
clf = svm.SVC(kernel='linear')
clf.fit(x, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [58]:
estimated_movie_ratings = clf.decision_function(normalized)
best = np.argsort(estimated_movie_ratings)
print('best:')
for c in reversed(best[-5:]):
    print(c, movies[c][0], estimated_movie_ratings[c])
    
print('worst:')
for c in best[:5]:
    print(c, movies[c][0], estimated_movie_ratings[c])
    

best:
6702 Cradle Will Rock 1.3137620125649017
2217 Agora (film) 1.17759131949704
3556 The 9th Life of Louis Drax 1.0847499258300148
468 Dirty Dancing 1.0783623481743891
7787 Essex Boys 1.0633604996513664
worst:
6461 One More Chance (2007 film) -2.1407119086851503
7448 Cannibal! The Musical -2.009132021579877
9446 Angel (2007 film) -1.9226658879690866
8498 California Suite (film) -1.9133332742126727
8748 Manmadhan (film) -1.8508453390155672


In [60]:
rotten_y = np.asarray([float(movie[-2][:-1]) / 100
                       for movie in movies if movies[-2]])
rotten_x = np.asarray([normalized[movie_2_idx[movie[0]]]
                       for movie in movies if movies[-2]])

TypeError: 'NoneType' object is not subscriptable

In [45]:
from sklearn.linear_model import LinearRegression

training_cut = int(len(rotten_x) * 0.8)
regr = LinearRegression()
regr.fit(rotten_x[:training_cut], rotten_y[:training_cut])

error = (regr.predict(rotten_x[training_cut:]) - rotten_y[training_cut:])

f'mean square error {error}'

NameError: name 'rotten_x' is not defined