In [1]:
import numpy as np
import urllib
import pandas as pd 

import requests
import os
import json

from IPython.display import Image
from IPython.display import HTML
from IPython.display import display
from PIL import Image

from keras.applications import VGG16
from keras.preprocessing import image as kimage
from keras.applications.vgg16 import preprocess_input

Using TensorFlow backend.


<h3>1. Load the movie data</h3>

In [2]:
# Load the movie table from df.csv; the file is ISO-8859-1 (Latin-1) encoded.
df = pd.read_csv('df.csv', encoding='iso-8859-1')

<h3>2. Define function to query poster urls for movies</h3>

In [3]:
"""Input: IMDB movie ID
   Outpu: poster url for that movie"""

def getPoster(imdbid):
    """Return the TMDB poster URL for a movie, or "" when none can be found.

    Input:  imdbid -- IMDB movie id without the "tt" prefix (e.g. '0114709').
    Output: URL of the 185px-wide poster of the first movie TMDB returns for
            that id. An empty string is returned when the request fails, the
            response is not valid JSON, no movie matches, or the movie has no
            poster.
    """
    # NOTE(security): the fallback key is the one that was hard-coded in this
    # notebook. Prefer exporting TMDB_API_KEY and removing/rotating the literal.
    api_key = os.environ.get('TMDB_API_KEY', 'bb3beb7ec7af6d1c0c23ca7381b62a89')
    url = "https://api.themoviedb.org/3/find/tt{:}?api_key={:}&external_source=imdb_id".format(imdbid, api_key)
    try:
        # A timeout keeps one hung connection from stalling a long batch of lookups.
        response = requests.get(url, timeout=10)
        path = json.loads(response.text)
        image_root = path['movie_results'][0]['poster_path']
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Network error, non-JSON body, error payload without 'movie_results',
        # or an empty result list. Anything else (KeyboardInterrupt, NameError,
        # ...) is deliberately allowed to propagate.
        return ""
    if not image_root:
        # TMDB reports a missing poster as null.
        return ""
    # poster_path already starts with '/', so strip it to avoid 'w185//...'.
    return 'http://image.tmdb.org/t/p/w185/' + image_root.lstrip('/')

<h3>3. Query movie posters</h3>

In [29]:
# Zero-pad every IMDB id to 7 digits and drop duplicates.
# The ids are sorted because the position in this list becomes each movie's
# `idx` (and poster file name) further down; plain list(set(...)) gives no
# ordering guarantee, so that mapping could change from one run to the next.
imdbid = sorted({str(i).zfill(7) for i in df.imdbId})

In [None]:
# Look up the poster URL of every unique movie, in the same order as `imdbid`.
image_url = [getPoster(movie_id) for movie_id in imdbid]

<h3>4. Put IMDB IDs, genres, poster urls into a dataframe</h3>

In [56]:
# Map IMDB ids to genre lists.
# `imdbidGenre` was referenced here without ever being defined in the notebook.
# It has to line up row-by-row with df.genres, so it is rebuilt as the
# zero-padded id of *every* row of df (no de-duplication at this point).
imdbidGenre = [str(i).zfill(7) for i in df.imdbId]
# Genres come as one '|'-separated string per movie. Non-ASCII characters are
# dropped, and decoding back to text keeps .split('|') working on Python 3 too.
genreList = [g.encode('ascii', 'ignore').decode('ascii').split('|') for g in df.genres]
genreDict = pd.DataFrame({'imdbid': imdbidGenre, 'genre': genreList},
                         columns=['imdbid', 'genre'])
# Keep the first genre list seen for each movie.
genreDict = genreDict.drop_duplicates(['imdbid'])

In [7]:
# One row per unique movie: IMDB id, poster URL, and `idx`, which is simply the
# movie's position in `imdbid`.
df_image = pd.DataFrame(dict(
    imdbid=imdbid,
    image_url=image_url,
    idx=range(len(imdbid)),
))

In [59]:
# Attach the genre list to each movie; the inner join keeps only ids that are
# present in both df_image and genreDict.
df_image = df_image.merge(genreDict, on='imdbid', how='inner')

In [62]:
# Persist df_image with IPython's %store magic so another kernel session can reload it.
%store df_image

Stored 'df_image' (DataFrame)


In [5]:
# Restore the df_image that was previously saved with %store.
%store -r df_image

<h3>5. Download movie posters to local directory</h3>

In [6]:
# Folder the poster JPEGs are stored in. The trailing slash is kept because
# file names are appended to this string by plain concatenation.
poster_path = os.getcwd() + "/posters/"
# Create the folder up front; without it every poster download (and the
# placeholder save that follows a failed download) would fail.
if not os.path.isdir(poster_path):
    os.makedirs(poster_path)

In [10]:
""""""
# Placeholder used for movies without a poster: a 185x278 (width x height)
# image of gray noise with pixel values in [120, 130).
# A dedicated, fixed-seed generator makes the placeholder identical on every
# run without touching NumPy's global random state.
arr = np.random.RandomState(0).randint(120, 130, size=(278, 185))
gray = Image.fromarray(arr.astype('uint8'))

In [None]:
# Save the poster of row i of df_image as posters/<i>.jpg.
# `requests` (already used for the TMDB lookups) replaces urllib.urlretrieve,
# which only exists on Python 2; together with the old bare `except:` that
# would silently turn every poster into a gray image on Python 3.
for i in range(len(df_image)):
    target = poster_path + str(i) + '.jpg'
    try:
        response = requests.get(df_image.image_url[i], timeout=30)
        # Treat HTTP errors (404, 5xx, ...) as "no poster" instead of saving
        # the error page under a .jpg name.
        response.raise_for_status()
        with open(target, 'wb') as poster_file:
            poster_file.write(response.content)
    except (requests.RequestException, IOError):
        # Empty/invalid URL, network failure, HTTP error or write failure:
        # fall back to the gray placeholder. Catching only these keeps the
        # loop interruptible (a bare except also swallowed KeyboardInterrupt).
        gray.convert('RGB').save(target, 'JPEG')

<h3>6. Extract poster features with a pre-trained VGG16 model</h3>

In [18]:
def _prepare_poster(path):
    """Load one poster file and return it as a pre-processed VGG16 input batch.

    The image is resized to 224x224, converted to an array, given a leading
    batch axis of size 1 and passed through Keras' VGG16 `preprocess_input`.
    """
    poster = kimage.load_img(path, target_size=(224, 224))
    batch = np.expand_dims(kimage.img_to_array(poster), axis=0)
    return preprocess_input(batch)


# include_top=False removes the final fully connected layers, so predict()
# returns the output of the last convolutional block instead of class scores.
model = VGG16(include_top=False, weights='imagenet')

# One row of flattened VGG16 features per poster (25088 values per image).
# Posters are processed one at a time and written straight into matrix_res:
# the previous version kept every decoded image, every input array and a
# second copy of all predictions (`img`, `x`, `pred`) alive at once, plus an
# unused `pred_norm` list, which multiplies memory use for no benefit.
matrix_res = np.zeros([len(df_image), 25088])
for i in range(len(df_image)):
    matrix_res[i, :] = model.predict(_prepare_poster(poster_path + str(i) + ".jpg")).ravel()

In [22]:
# Persist the poster feature matrix with IPython's %store magic.
%store matrix_res

Stored 'sim' (ndarray)
Stored 'matrix_res' (ndarray)
Stored 'pred' (list)


In [7]:
# Restore the matrix_res that was previously saved with %store.
%store -r matrix_res

<h3>7. Write recommendation function</h3>

In [88]:
def _search_tmdb_movie(title):
    """Return TMDB's first search result (a dict) for `title`; IndexError if none."""
    # NOTE(security): the fallback key is the one that was hard-coded in this
    # notebook. Prefer exporting TMDB_API_KEY and removing/rotating the literal.
    api_key = os.environ.get('TMDB_API_KEY', 'bb3beb7ec7af6d1c0c23ca7381b62a89')
    response = requests.get(
        'https://api.themoviedb.org/3/search/movie',
        params={'api_key': api_key, 'language': 'en-US', 'query': title,
                'page': 1, 'include_adult': 'false'},
        timeout=10)
    results = json.loads(response.text).get('results') or []
    if not results:
        raise IndexError("TMDB returned no movie matching {!r}".format(title))
    return results[0]


def _save_input_poster(posterurl, target):
    """Download `posterurl` to `target`, or write a gray placeholder on failure."""
    folder = os.path.dirname(target)
    if not os.path.isdir(folder):
        os.makedirs(folder)
    try:
        if not posterurl:
            raise IOError("movie has no poster")
        response = requests.get(posterurl, timeout=30)
        response.raise_for_status()
        with open(target, 'wb') as poster_file:
            poster_file.write(response.content)
    except (requests.RequestException, IOError):
        # Same placeholder as for missing dataset posters: 185x278 gray noise
        # (fixed seed so the result is reproducible).
        arr = np.random.RandomState(0).randint(120, 130, size=(278, 185))
        Image.fromarray(arr.astype('uint8')).convert('RGB').save(target, 'JPEG')


def _vgg16_features(image_file):
    """Return the flattened VGG16 (include_top=False) features of one image file."""
    # Building VGG16 is expensive, so the model is created on first use and
    # then reused by later calls.
    if getattr(_vgg16_features, 'model', None) is None:
        _vgg16_features.model = VGG16(include_top=False, weights='imagenet')
    img = kimage.load_img(image_file, target_size=(224, 224))
    x = preprocess_input(np.expand_dims(kimage.img_to_array(img), axis=0))
    return _vgg16_features.model.predict(x).ravel()


def _cosine_similarity(matrix, vector):
    """Cosine similarity between every row of `matrix` and `vector` (0 for zero rows)."""
    dots = matrix.dot(vector)
    # einsum computes the row-wise squared norms without a matrix-sized temporary.
    norms = np.sqrt(np.einsum('ij,ij->i', matrix, matrix)) * np.sqrt(vector.dot(vector))
    sims = np.zeros(len(dots))
    nonzero = norms > 0
    sims[nonzero] = dots[nonzero] / norms[nonzero]
    return sims


def recommendation(myinput, top_n=3):
    """Recommend movies whose posters look most like the poster of `myinput`.

    The title is searched on TMDB, the poster of the first hit is saved as
    posters/myinput.jpg (a gray placeholder is written if it cannot be
    downloaded), its VGG16 features are compared with every row of the global
    `matrix_res`, and the best matches are mapped back to IMDB ids through the
    global `df_image` ('idx' -> 'imdbid').

    Parameters:
        myinput -- movie title to search for.
        top_n   -- number of recommendations to return (default 3).

    Returns:
        (rec_movie_imdbid, posterurl): the list of recommended IMDB ids, most
        similar first, and the poster URL of the input movie ("" if TMDB has
        no poster for it).

    Raises:
        IndexError if TMDB finds no movie for `myinput`.

    Changes from the previous version:
      * similarity is a true cosine similarity; the old code divided by the
        square roots of the dot products, which built an N x N matrix and
        effectively ranked by raw dot product;
      * the input movie is excluded by id (via df.tmdbId -> df.imdbId) instead
        of assuming it is always the single best match, and `top_n` results
        are returned whether or not the movie is in the dataset (previously 1
        or 3);
      * the genre lookup was dropped: its result was only used by code that
        was commented out, so it cost an extra network request for nothing.
    """
    movie = _search_tmdb_movie(myinput)
    poster_root = movie.get('poster_path')
    posterurl = 'http://image.tmdb.org/t/p/w185' + poster_root if poster_root else ""

    # save and process the input movie poster
    target = os.getcwd() + "/posters/" + 'myinput' + '.jpg'
    _save_input_poster(posterurl, target)
    mypre = _vgg16_features(target)

    mysim = _cosine_similarity(matrix_res, mypre)

    # IMDB ids (zero-padded like df_image.imdbid) of the input movie itself,
    # if it is part of the dataset.
    own_ids = set(str(i).zfill(7) for i in df.loc[df.tmdbId == movie['id'], 'imdbId'])

    # idx -> imdbid lookup; the first row wins should an idx ever repeat.
    idx_to_imdbid = {}
    for idx, movie_id in zip(df_image['idx'], df_image['imdbid']):
        idx_to_imdbid.setdefault(idx, movie_id)

    rec_movie_imdbid = []
    for row in np.argsort(mysim)[::-1]:  # most similar first
        if len(rec_movie_imdbid) >= top_n:
            break
        candidate = idx_to_imdbid.get(int(row))
        if candidate is None or candidate in own_ids:
            continue
        rec_movie_imdbid.append(candidate)

    return rec_movie_imdbid, posterurl


In [29]:
def display_recommendation_poster(myinput):
    """Show the poster found for `myinput`, then the posters of its recommendations.

    Calls recommendation(myinput) for the recommended IMDB ids and the input
    poster URL, looks up each recommended poster with getPoster, and renders
    everything as inline HTML <img> tags.
    """
    img_tag = "<img style='width: 110px; margin: 0px; float: left; border: 1px solid black;' src='%s' />"
    imdbidList, input_posterurl = recommendation(myinput)

    # print the input movie poster first
    print('Input Movie: %s' % myinput)
    display(HTML(img_tag % input_posterurl))

    # then print the recommendations
    rec_movie_url = [getPoster(i) for i in imdbidList]
    print("Recommendations:")
    # Skip movies for which no poster URL came back: an <img> with an empty
    # src only renders as a broken image.
    images = ''.join(img_tag % url for url in rec_movie_url if url)
    display(HTML(images))

In [91]:
# randomly create a list of movies 
movieList = ['Tron', 'Toy Story', 'Forest Gump', 'Batman', 'Superman']
movieList

['Tron', 'Toy Story', 'Forest Gump', 'Batman', 'Superman']

In [92]:
# Poster-based recommendations for the first title in movieList ('Tron').
display_recommendation_poster(myinput=movieList[0])

Input Moive: Tron


Recommendations:


In [93]:
# Poster-based recommendations for the second title in movieList ('Toy Story').
display_recommendation_poster(myinput=movieList[1])

Input Moive: Toy Story


Recommendations:


In [94]:
# Poster-based recommendations for the third title in movieList ('Forest Gump').
display_recommendation_poster(myinput=movieList[2])

Input Moive: Forest Gump


Recommendations:


In [95]:
# Poster-based recommendations for the fourth title in movieList ('Batman').
display_recommendation_poster(myinput=movieList[3])

Input Moive: Batman


Recommendations:


In [96]:
# Poster-based recommendations for the fifth title in movieList ('Superman').
display_recommendation_poster(myinput=movieList[4])

Input Moive: Superman


Recommendations:
