# Data

In [1]:
import os

import urllib.request
import pandas as pd
import numpy as np
import tmdbsimple as tmdb
from IPython.display import display, HTML, Image

### For img scraping
from lib.utility import tmdb_img_download,display_images,cosine_matrix

### For deep learning feature extraction
from lib.extractor import ImgExtractor

from IPython.display import display

from scipy import sparse
import pyspark
from pyspark.sql import SQLContext
from pyspark.mllib.recommendation import ALS,MatrixFactorizationModel, Rating
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
import multiprocessing
from multiprocessing import Process, Manager

  from ._conv import register_converters as _register_converters


### Rating data

In [2]:
"""movie rating data: Only run it one time""" 

# Fetch the MovieLens "latest-small" archive and unpack it under data/.
# Each step is skipped when its result already exists, so re-running the cell
# (or Restart & Run All) does not download or extract the data again.
ratingUrl = "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
ratingZip = "data/ml-latest-small.zip"

# urlretrieve does not create missing parent directories.
os.makedirs("data", exist_ok=True)
if not os.path.exists(ratingZip):
    # Download to a temporary name first so an interrupted transfer never
    # leaves a truncated archive that later runs would mistake for a complete one.
    partialZip = ratingZip + ".part"
    urllib.request.urlretrieve(ratingUrl, partialZip)
    os.replace(partialZip, ratingZip)

if not os.path.isdir("data/ml-latest-small"):
    # -a converts text files to native line endings, -n never overwrites existing files.
    unzipStatus = os.system("unzip -a -n data/ml-latest-small.zip -d data/")
    if unzipStatus != 0:
        # os.system reports failure only through its return value; make it visible.
        print("unzip exited with status {}".format(unzipStatus))
os.listdir("data/")

['.DS_Store', 'ml-latest-small.zip', 'ml-latest-small', 'poster']

### Poster data

In [3]:
### Poster data
# Look up one movie on TMDB and build the URL of its poster image.
id = 505  # TMDB movie id used as the running example in the next cells
tmdbPath = 'http://image.tmdb.org/t/p/w185/' # check https://developers.themoviedb.org/3/getting-started/image-languages

# The API key is a credential: read it from the environment instead of
# committing it to the notebook.
key_v3 = os.environ.get('TMDB_API_KEY')
if not key_v3:
    raise RuntimeError("Set the TMDB_API_KEY environment variable to your TMDB API key")
tmdb_connector = tmdb
tmdb_connector.API_KEY = key_v3

movieInfo = tmdb.Movies(id).info()
posterPath = movieInfo['poster_path']
if not posterPath:
    # Without this guard the concatenation below fails with an opaque TypeError.
    raise ValueError("TMDB movie {} has no poster".format(id))
fullPath = tmdbPath + posterPath

In [4]:
# Store the example poster under data/poster/; the cell echoes urlretrieve's
# (local filename, response headers) tuple.
posterFile = "data/poster/{}.jpg".format(id)
urllib.request.urlretrieve(fullPath, posterFile)

('data/poster/505.jpg', <http.client.HTTPMessage at 0x10cd86198>)

### Show data

In [5]:
# Render the example poster inline as a small bordered thumbnail.
# The template is split across two literals; the run of spaces is part of the
# original markup and is kept as-is.
imgTemplate = (
    "<img style='width: 100px; margin: 0px; "
    "        float: left; border: 1px solid black;' src='%s' />"
)
images = imgTemplate % fullPath

display(HTML(images))

# Part 1: Recommendation based on movie rating data
#### Tools: Spark, ALS model

In [6]:
# Load the MovieLens ratings and the table linking movieId to imdbId / tmdbId.
rating_df = pd.read_csv('data/ml-latest-small/ratings.csv')
linkes_df = pd.read_csv('data/ml-latest-small/links.csv')

# Attach the external ids to every rating, drop rows with any missing value,
# and store tmdbId as an integer column.
df_merged = (
    pd.merge(rating_df, linkes_df, on=['movieId'])
    .dropna(how="any")
    .assign(tmdbId=lambda frame: frame['tmdbId'].astype(int))
)
print(df_merged.head())
print(df_merged.shape)

   userId  movieId  rating   timestamp  imdbId  tmdbId
0       1        1     4.0   964982703  114709     862
1       5        1     4.0   847434962  114709     862
2       7        1     4.5  1106635946  114709     862
3      15        1     2.5  1510577970  114709     862
4      17        1     4.5  1305696483  114709     862
(100823, 6)


In [7]:
# Number of distinct tmdbId values among the merged rating rows.
len(df_merged.tmdbId.unique())

9715

In [8]:
"""Generate Rating matrix""" 
# Keep only the (user, item, rating) triple that the recommender consumes.
ratingColumns = ['userId', 'tmdbId', 'rating']
df_rating = df_merged[ratingColumns]
df_rating.head()

Unnamed: 0,userId,tmdbId,rating
0,1,862,4.0
1,5,862,4.0
2,7,862,4.5
3,15,862,2.5
4,17,862,4.5


In [9]:
"""Create SparkContext"""
# Reuse the SparkContext if one already exists in this kernel, otherwise start one.
sc = pyspark.SparkContext.getOrCreate()
# sqlContext is used below to build Spark DataFrames.
sqlContext = SQLContext(sc)

In [10]:
"""Create Spark dataframe"""
# Convert the pandas rating frame into a Spark DataFrame and print the column types Spark inferred.
df_spark = sqlContext.createDataFrame(df_rating)
df_spark.printSchema()

root
 |-- userId: long (nullable = true)
 |-- tmdbId: long (nullable = true)
 |-- rating: double (nullable = true)



In [11]:
# Sanity check: the first five rows of the Spark DataFrame.
df_spark.head(5)

[Row(userId=1, tmdbId=862, rating=4.0),
 Row(userId=5, tmdbId=862, rating=4.0),
 Row(userId=7, tmdbId=862, rating=4.5),
 Row(userId=15, tmdbId=862, rating=2.5),
 Row(userId=17, tmdbId=862, rating=4.5)]

In [12]:
"""Train ALS model """
# ALS hyper-parameters: size of each latent feature vector and number of iterations.
rank = 5
numIterations = 10
# ALS starts from a random initialisation; fixing the seed makes the learned
# factors, and every recommendation derived from them, reproducible across runs.
alsSeed = 42
model = ALS.train(df_spark, rank, numIterations, seed=alsSeed)

### ALS: One of the Collaborative filtering methods

1. User features: S

2. Bridge: V

3. Product features: D

In [13]:
"""Recommendation based on Users & Products"""
# Peek at one latent-feature vector from each side of the factorisation.
display('features for product one', model.productFeatures().first())
display('features for user one',model.userFeatures().first())

# For product 242, find the 10 users predicted to rate it highest.
# NOTE: the variable name is historical; the list holds users for one product.
productForUser = model.recommendUsers(242,10)
print("Top 10 users to recommend product 242 to: \n")
display(productForUser)

# For user 196, find the 10 products predicted to be rated highest.
# NOTE: the variable name is historical; the list holds products for one user.
userForProduct = model.recommendProducts(196,10)
print("Top 10 movie recommendations for user 196")
display(userForProduct)

# Predict the rating of a single (user, product) pair.
ratingPredict= model.predict(196,242)
print("prediction of rate for User 196 --> movie 242")
display(ratingPredict)

productFeatures = model.productFeatures()

'features for product one'

(16,
 array('d', [3.310121536254883, 0.4654504656791687, 1.167807698249817, 0.4197458028793335, -0.7567285895347595]))

'features for user one'

(8,
 array('d', [0.3368377387523651, 1.1229897737503052, 0.06548604369163513, -1.1799126863479614, -0.10284101963043213]))

Top 10 moive recommendations for user 242: 



[Rating(user=197, product=242, rating=5.304480973911692),
 Rating(user=543, product=242, rating=5.281337931425638),
 Rating(user=53, product=242, rating=5.219841377152572),
 Rating(user=276, product=242, rating=5.0326547707896285),
 Rating(user=130, product=242, rating=4.969749880493975),
 Rating(user=413, product=242, rating=4.961759087180705),
 Rating(user=494, product=242, rating=4.934886006319921),
 Rating(user=92, product=242, rating=4.895501165461492),
 Rating(user=162, product=242, rating=4.894090989016076),
 Rating(user=127, product=242, rating=4.888482917438072)]

Top 10 users to recommend for product 196


[Rating(user=196, product=62215, rating=15.389093776936207),
 Rating(user=196, product=11337, rating=13.504014434024938),
 Rating(user=196, product=10937, rating=13.38470019294921),
 Rating(user=196, product=8766, rating=12.841970982151164),
 Rating(user=196, product=81796, rating=12.77287059602358),
 Rating(user=196, product=26317, rating=12.085729238941493),
 Rating(user=196, product=17745, rating=11.594710376727903),
 Rating(user=196, product=179144, rating=11.539015361647948),
 Rating(user=196, product=24266, rating=11.286366718534119),
 Rating(user=196, product=10646, rating=11.036652072424218)]

prediction of rate for User 196 --> movie 242


4.067075144922935

### Evaluation

In [14]:
"""Get Mean Square Error"""
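# Example: true rating 4.6, predicted 5 --> squared error (5 - 4.6)**2 = 0.4**2

# Background on ALS: (1) the rating matrix is sparse, (2) only a few ratings are observed,
# (3) from those few ratings the model estimates all the missing ratings.
# Sparse matrix: rows are users, columns are movie ids, e.g. 600 users x 100,000 movies
# with only about 4,000 observed ratings (the ground truth).
# Training process: 1. compute a rating for every (user, movie) pair.

# Interview note: ALS is decomposition based; the rating matrix is decomposed into
# user features, product features and a transformation matrix.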

'Get Mean Square Error'

# Recommendation based on Posters (CNN)

In [15]:
"""Download poster data for tmdb movies"""

# Collect every distinct tmdbId from Spark and keep the values in a set.
tmdbIds = df_spark.select('tmdbId').distinct().collect()
tmdbIdsPool = {row.asDict()['tmdbId'] for row in tmdbIds}

In [16]:
# Number of distinct TMDB ids collected from the rating data.
len(tmdbIdsPool)

9715

In [18]:
def helper(id):
    """Download the w185 poster of one TMDB movie to data/poster/<id>.jpg.

    Args:
        id: TMDB movie id passed to ``tmdb.Movies``.

    Returns:
        The local path the poster was saved to.

    Raises:
        ValueError: if the movie record has no ``poster_path``.
    """
    tmdbPath = 'http://image.tmdb.org/t/p/w185/' # check https://developers.themoviedb.org/3/getting-started/image-languages

    movieInfo = tmdb.Movies(id).info()
    posterPath = movieInfo.get('poster_path')
    if not posterPath:
        # Without this guard the concatenation below fails with an opaque TypeError.
        raise ValueError("no poster available for TMDB movie {}".format(id))

    fullPath = tmdbPath + posterPath
    localPath = "data/poster/{}.jpg".format(id)
    urllib.request.urlretrieve(fullPath, localPath)
    print(id,"downloaded successfully")
    # Callers assign the result, so hand back where the file was stored.
    return localPath

In [None]:
# Download every poster one after another and remember the ids that failed.
notAvailable = []
for movieId in tmdbIdsPool:
    try:
        helper(movieId)
    except Exception as err:
        # `except Exception` rather than a bare `except:` so that interrupting
        # the kernel still stops this long loop, while bad ids are skipped.
        notAvailable.append(movieId)
        print(movieId, "failed:", err)

In [19]:
"""Scrape movie poster from restful API with multiprocess"""
# Download posters concurrently with a pool of 10 threads.
# ex.map() on its own never surfaces worker exceptions when its results are
# not consumed, so each id is submitted and its future inspected instead.
failedDownloads = []
with ThreadPoolExecutor(max_workers=10) as ex:
    pending = {ex.submit(helper, movieId): movieId for movieId in tmdbIdsPool}
    for future, movieId in pending.items():
        # Future.exception() blocks until that download has finished.
        if future.exception() is not None:
            failedDownloads.append(movieId)
print("{} posters could not be downloaded".format(len(failedDownloads)))

14 downloaded successfully
12 downloaded successfully
5 downloaded successfully
16 downloaded successfully
2 downloaded successfully
15 downloaded successfully
13 downloaded successfully
11 downloaded successfully
6 downloaded successfully
65550 downloaded successfully
18 downloaded successfully
19 downloaded successfully
24 downloaded successfully
20 downloaded successfully
25 downloaded successfully
21 downloaded successfully
33 downloaded successfully
28 downloaded successfully
3522  downloaded successfully
downloaded successfully
38 downloaded successfully
196649 downloaded successfully
294963 downloaded successfully
58 downloaded successfully
32823 downloaded successfully
55 downloaded successfully
65595 downloaded successfully
63 downloaded successfully
62 downloaded successfully
59 downloaded successfully
64 downloaded successfully
65 downloaded successfully
32836 downloaded successfully
67 downloaded successfully
66 downloaded successfully
69 downloaded successfully
68 download

16771 downloaded successfully
16780 downloaded successfully
49538 downloaded successfully
278927 downloaded successfully
16784 downloaded successfully
16806 downloaded successfully
49565 downloaded successfully
16820 downloaded successfully
278990 downloaded successfully
16858 downloaded successfully
16859 downloaded successfully
16857 downloaded successfully
16804 downloaded successfully
16866 downloaded successfully
49636 downloaded successfully
16869 downloaded successfully
16885 downloaded successfully
16871 downloaded successfully
16876 downloaded successfully
74998 downloaded successfully
16889 downloaded successfully
16888 downloaded successfully
115199 downloaded successfully
16905 downloaded successfully
49680 downloaded successfully
16933 downloaded successfully
49689 downloaded successfully
16938 downloaded successfully
16934 downloaded successfully
49721 downloaded successfully
16939 downloaded successfully
16909 downloaded successfully
49730 downloaded successfully
16958 d

In [23]:
# List the poster folder, skipping hidden entries (e.g. macOS's .DS_Store)
# that are not posters; sorted so the order is the same on every run.
posterIds = sorted(
    name for name in os.listdir('data/poster/') if not name.startswith('.')
)

extractor = ImgExtractor(model="VGG16")

In [25]:
# Number of entries currently in the poster folder.
len(posterIds)

400

In [27]:
# Smoke test: extract features for the single example poster.
features = extractor.get_features("data/poster/505.jpg")

load Image locally


In [29]:
# Shape of the features extracted for one poster.
features.shape

(1, 1000)

### Compare similarity
#### Cosine similarity

In [31]:
# Given one poster you like, find the most similar posters

In [4]:
"""Define function to extract features from folder"""
def helper(id,posterPath):
    """Extract image features for one entry of the poster folder.

    NOTE: this reuses the name ``helper`` of the poster-download function
    defined earlier in the notebook; whichever cell ran last is the one in effect.

    Args:
        id: entry name inside ``posterPath`` - either an image file
            (e.g. ``505.jpg``) or a sub-folder that contains the image.
        posterPath: poster folder, resolved against the current working directory.

    Returns:
        Whatever ``extractor.get_features`` returns for the image.

    Raises:
        FileNotFoundError: if the entry is a folder with nothing in it.
    """
    path = os.path.join(os.getcwd(),posterPath,str(id))
    if os.path.isdir(path):
        # Folder-per-movie layout: use the first entry (sorted for a stable pick).
        filenames = sorted(os.listdir(path))
        if not filenames:
            raise FileNotFoundError("no image found in {}".format(path))
        imgPath = os.path.join(path,filenames[0])
    else:
        # Flat layout (data/poster/<id>.jpg): the entry already is the image file;
        # calling os.listdir on it would raise NotADirectoryError.
        imgPath = path
    feature = extractor.get_features(imgPath)
    print("Load Image {} success".format(id))
    return feature

In [None]:
# Map every poster entry to its extracted features.
# The loop variable is not called `id`, so it neither shadows the builtin nor
# overwrites the module-level `id` set at the top of the notebook.
dicFeatures = {}
for posterId in posterIds:
    dicFeatures[posterId] = helper(posterId,posterPath="data/poster")