In [9]:
import numpy as np
import pandas as pd
import re, requests, json, os, itertools

# API credentials: prefer environment variables so real keys stay out of the notebook.
# NOTE(review): the literal fallbacks only keep existing runs working; these values
# are visible to anyone who can read this file, so rotate them and drop the fallbacks.
open_api_key = os.environ.get('OMDB_API_KEY', '94f4c06')  # OMDb key, read by getIMDbID
tmdb_api_key = os.environ.get('TMDB_API_KEY', '31f0da8fbaf03a2e236153216f870b8a')  # TMDb key

In [2]:
# Load movielens movie dataset, indexed by its first column (movieId).
# `pd.DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0;
# `pd.read_csv(..., index_col=0)` is its documented replacement.
# NOTE(review): parseTitleAndYear repairs sequences such as "Ã©", which suggests this
# file is really UTF-8 — confirm before changing the encoding, because the title regex
# is written against the current ISO-8859-1 decoding.
df_movielens = pd.read_csv('data/ml-latest/movies.csv', index_col=0, encoding="ISO-8859-1")
df_movielens

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
6,Heat (1995),Action|Crime|Thriller
7,Sabrina (1995),Comedy|Romance
8,Tom and Huck (1995),Adventure|Children
9,Sudden Death (1995),Action
10,GoldenEye (1995),Action|Adventure|Thriller


In [3]:
# Patterns and the repair table are built once instead of on every call.
_TITLE_PARSER = re.compile(r'([a-zA-Z:Ã©¡³º¨±¶€¤\´\' 0-9,-]+)')
_ARTICLE_PARSER = re.compile(r'.+, ([a-zA-Z]+$)')
_YEAR_PARSER = re.compile(r'.+\((\d*)\)$')
# Two-character mis-decoded sequences and the accented letter each one stands for.
_MOJIBAKE = {"Ã´":"ô","Ã¡":"á","Ã©":"é","Ã³":"ó","Ãº":"ú","Ã¨":"è","Ã±":"ñ","Ã¶":"ö","Ã€":"À","Ã¤":"ä","Ã":"í"}
# Longest keys first: the bare "Ã" entry must never pre-empt a longer sequence,
# whatever order the dict happens to iterate in.
_MOJIBAKE_PARSER = re.compile(
    '|'.join(re.escape(key) for key in sorted(_MOJIBAKE, key=len, reverse=True)))


def parseTitleAndYear(t):
    """Split a MovieLens title into OMDb query-string fragments.

    Parameters
    ----------
    t : str
        Raw title, optionally ending in a parenthesised year, e.g.
        ``"City of Lost Children, The (1995)"``. Non-string values (such as NaN)
        are treated as unparseable.

    Returns
    -------
    tuple of (str, str)
        ``("&t=<title>", "&y=<year>")``. Either element is ``''`` when that part
        could not be extracted.

    Notes
    -----
    * The title is the leading run of characters allowed by the title pattern, so
      it stops at the first character outside that set (for example ``(`` or ``.``).
    * A trailing ``", <word>"`` is moved to the front (``"Heat, The"`` becomes
      ``"The Heat"``).
    * Only spaces are percent-encoded; every other character is passed through.
    """
    if not isinstance(t, str):
        return ('', '')

    title = ''
    raw_title = _TITLE_PARSER.match(t)
    if raw_title is not None:
        title = raw_title.group(1).strip()
        article = _ARTICLE_PARSER.match(title)
        if article is not None:
            article = article.group(1)
            # Drop the trailing ", <article>" (two separator chars plus the word).
            title = article + " " + title[:-(len(article) + 2)]
        title = _MOJIBAKE_PARSER.sub(lambda found: _MOJIBAKE[found.group()], title)
        title = title.replace(" ", "%20")
        if title:
            title = "&t=" + title

    year = ''
    # The year pattern is anchored at the end of the string, so trailing whitespace
    # in the raw title would otherwise hide the year.
    raw_year = _YEAR_PARSER.match(t.strip())
    if raw_year is not None and raw_year.group(1):
        year = "&y=" + raw_year.group(1)
    return (title, year)

def getIMDbID(c):
    """Look up the IMDb ID of a MovieLens title through the OMDb API.

    Parameters
    ----------
    c : str
        Raw MovieLens title; it is parsed with ``parseTitleAndYear``.

    Returns
    -------
    str or float
        The ``imdbID`` reported by OMDb, or ``np.nan`` when the title cannot be
        parsed, the request fails, or OMDb reports no match.
    """
    title, year = parseTitleAndYear(c)
    if not title:
        # Without a "&t=" fragment the query has no search term, so skip the call.
        return np.nan

    api_call = "http://www.omdbapi.com/?apikey=" + open_api_key + title + year
    try:
        # The timeout keeps one stalled connection from hanging a whole column map.
        r = requests.get(api_call, timeout=10)
        if r.status_code != 200:
            return np.nan
        payload = json.loads(r.text)
    except (requests.RequestException, ValueError):
        # Network failure or non-JSON body: report "not found" like any other miss
        # instead of aborting the caller's loop.
        return np.nan

    if isinstance(payload, dict) and payload.get("Response") == "True":
        return payload.get('imdbID', np.nan)
    return np.nan

In [4]:
# Build/Load (ML ID - IMDb ID) dataset
# Building it costs one OMDb request per movie, so the result is cached on disk.
if os.path.isfile('backup/df_ml_imdbid.csv'):
    # `DataFrame.from_csv` no longer exists in pandas >= 1.0. read_csv with
    # index_col=0 restores the first column as the index; the encoding matches
    # the UTF-8 used when the cache is written below.
    df_ml_imdbid = pd.read_csv('backup/df_ml_imdbid.csv', index_col=0, encoding="utf-8")

else:
    # The column is still added to df_movielens itself, as before, in case later
    # cells rely on it.
    df_movielens['imdbid'] = df_movielens['title'].map(getIMDbID)
    df_ml_imdbid = df_movielens.drop(['title','genres'], axis=1)
    os.makedirs('backup', exist_ok=True)  # the cache directory may not exist yet
    df_ml_imdbid.to_csv('backup/df_ml_imdbid.csv', encoding="utf-8")
df_ml_imdbid

Unnamed: 0_level_0,imdbid
movieId,Unnamed: 1_level_1
1,tt0114709
2,tt0113497


In [5]:
# Spot check: the trailing ", An" should be moved to the front of the title.
parseTitleAndYear('Ace Ventura: When Nature Calls, An (1995)')

('&t=An%20Ace%20Ventura:%20When%20Nature%20Calls', '&y=1995')

In [6]:
# Build/Load TMDb movie json
# One request per movie, so the responses are cached on disk and reused.
if os.path.isfile('backup/movie_data.json'):
    with open('backup/movie_data.json','r') as movie_data_file:
        movie_data = json.load(movie_data_file)

else:
    movie_data = {}
    movie_fetch_failures = []
    # Rows whose IMDb lookup failed hold NaN; dropping them avoids querying "/movie/nan".
    with requests.Session() as tmdb_session:  # reuse one connection pool for all calls
        for imdb_id in df_ml_imdbid["imdbid"].dropna().astype(str):
            payload = None
            try:
                response = tmdb_session.get(
                    'https://api.themoviedb.org/3/movie/' + imdb_id +
                    '?api_key=' + tmdb_api_key + '&language=en-US&append_to_response=keywords',
                    timeout=10)
                if response.status_code == 200:
                    payload = response.json()
            except (requests.RequestException, ValueError):
                payload = None
            if payload is None:
                # Error payloads are not stored: they lack the title/genres/keywords
                # fields that the movie table is built from.
                movie_fetch_failures.append(imdb_id)
            else:
                movie_data[imdb_id] = payload
    if movie_fetch_failures:
        print('TMDb movie lookup failed for %d id(s), e.g. %s'
              % (len(movie_fetch_failures), movie_fetch_failures[:5]))
    os.makedirs('backup', exist_ok=True)  # the cache directory may not exist yet
    with open('backup/movie_data.json',"w") as movie_data_file:
        json.dump(movie_data, movie_data_file)
movie_data

{'tt0113497': {'adult': False,
  'backdrop_path': '/7k4zEgUZbzMHawDaMc9yIkmY1qR.jpg',
  'belongs_to_collection': {'backdrop_path': '/dsaUpJoxSqYJh96s5Z9hHzoRKQ3.jpg',
   'id': 495527,
   'name': 'Jumanji Collection',
   'poster_path': '/7E2DerSdIVeh1vHwkV1J56Kto3x.jpg'},
  'budget': 65000000,
  'genres': [{'id': 12, 'name': 'Adventure'},
   {'id': 14, 'name': 'Fantasy'},
   {'id': 10751, 'name': 'Family'}],
  'homepage': '',
  'id': 8844,
  'imdb_id': 'tt0113497',
  'keywords': {'keywords': [{'id': 10090, 'name': 'board game'},
    {'id': 10941, 'name': 'disappearance'},
    {'id': 15101, 'name': "based on children's book"},
    {'id': 33467, 'name': 'new home'},
    {'id': 158086, 'name': 'recluse'},
    {'id': 158091, 'name': 'giant insect'}]},
  'original_language': 'en',
  'original_title': 'Jumanji',
  'overview': "When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside

In [28]:
# Generate movie dataset from json
# One row per key of movie_data, keeping only the fields used downstream.
df_movies = pd.DataFrame.from_dict(movie_data, orient="index")[['title','genres','keywords','overview']]

# Reduce the nested TMDb structures to plain lists of names. Entries that lack
# these fields surface here as NaN rather than a dict/list, so they become an
# empty list instead of raising TypeError.
df_movies['keywords'] = df_movies['keywords'].map(
    lambda d: [k['name'] for k in d.get('keywords', [])] if isinstance(d, dict) else [])
df_movies['genres'] = df_movies['genres'].map(
    lambda l: [g['name'] for g in l] if isinstance(l, list) else [])
os.makedirs('backup', exist_ok=True)  # the output directory may not exist yet
df_movies.to_csv('backup/df_movies.csv')
df_movies

Unnamed: 0,title,genres,keywords,overview
tt0113497,Jumanji,"[Adventure, Fantasy, Family]","[board game, disappearance, based on children'...",When siblings Judy and Peter discover an encha...
tt0114709,Toy Story,"[Animation, Comedy, Family]","[jealousy, toy, boy, friendship, friends, riva...","Led by Woody, Andy's toys live happily in his ..."


In [16]:
# Build/Load TMDb people json
# One credits request per movie, so the responses are cached on disk and reused.
if os.path.isfile('backup/people_data.json'):
    with open('backup/people_data.json','r') as people_data_file:
        people_data = json.load(people_data_file)

else:
    people_data = {}
    people_fetch_failures = []
    # Rows whose IMDb lookup failed hold NaN; dropping them avoids querying "/movie/nan/credits".
    with requests.Session() as tmdb_session:  # reuse one connection pool for all calls
        for imdb_id in df_ml_imdbid["imdbid"].dropna().astype(str):
            payload = None
            try:
                # NOTE(review): `append_to_response=keywords` looks carried over from the
                # movie-details request; confirm whether the credits endpoint needs it.
                response = tmdb_session.get(
                    'https://api.themoviedb.org/3/movie/' + imdb_id + '/credits' +
                    '?api_key=' + tmdb_api_key + '&language=en-US&append_to_response=keywords',
                    timeout=10)
                if response.status_code == 200:
                    payload = response.json()
            except (requests.RequestException, ValueError):
                payload = None
            if payload is None:
                # Error payloads are not stored: they carry no 'cast'/'crew' lists.
                people_fetch_failures.append(imdb_id)
            else:
                people_data[imdb_id] = payload
    if people_fetch_failures:
        print('TMDb credits lookup failed for %d id(s), e.g. %s'
              % (len(people_fetch_failures), people_fetch_failures[:5]))
    os.makedirs('backup', exist_ok=True)  # the cache directory may not exist yet
    with open('backup/people_data.json',"w") as people_data_file:
        json.dump(people_data, people_data_file)
people_data

{'tt0113497': {'cast': [{'cast_id': 1,
    'character': 'Alan Parrish',
    'credit_id': '52fe44bfc3a36847f80a7c73',
    'gender': 2,
    'id': 2157,
    'name': 'Robin Williams',
    'order': 0,
    'profile_path': '/5KebSMXT8uj2D0gkaMFJ8VEp53.jpg'},
   {'cast_id': 8,
    'character': 'Samuel Alan Parrish / Van Pelt',
    'credit_id': '52fe44bfc3a36847f80a7c99',
    'gender': 2,
    'id': 8537,
    'name': 'Jonathan Hyde',
    'order': 1,
    'profile_path': '/7il5D76vx6QVRVlpVvBPEC40MBi.jpg'},
   {'cast_id': 2,
    'character': 'Judy Shepherd',
    'credit_id': '52fe44bfc3a36847f80a7c77',
    'gender': 1,
    'id': 205,
    'name': 'Kirsten Dunst',
    'order': 2,
    'profile_path': '/wBXvh6PJd0IUVNpvatPC1kzuHtm.jpg'},
   {'cast_id': 24,
    'character': 'Peter Shepherd',
    'credit_id': '52fe44c0c3a36847f80a7ce7',
    'gender': 0,
    'id': 145151,
    'name': 'Bradley Pierce',
    'order': 3,
    'profile_path': '/j6iW0vVA23GQniAPSYI6mi4hiEW.jpg'},
   {'cast_id': 10,
    'charact

In [19]:
def get_directors(movie_id, credits=None):
    """Return the directors of one movie.

    Parameters
    ----------
    movie_id : str
        Key of the movie in the credits mapping.
    credits : dict, optional
        Mapping of movie id to a credits payload containing a ``'crew'`` list.
        Defaults to the notebook-level ``people_data``.

    Returns
    -------
    list of tuple
        One ``(movie_id, person_id, name, gender)`` tuple per crew member whose
        ``'job'`` is ``'Director'``; empty when the payload has no ``'crew'``.

    Raises
    ------
    KeyError
        If ``movie_id`` is not present in the credits mapping.
    """
    source = people_data if credits is None else credits
    # Index the mapping directly: going through df_movies.loc[...].name only
    # echoed movie_id back and failed outright on a duplicated index.
    crew = source[movie_id].get('crew', [])
    return [(movie_id, member["id"], member["name"], member["gender"])
            for member in crew if member.get('job') == 'Director']

def get_cast(movie_id, top_n=3, credits=None):
    """Return the first-listed cast members of one movie.

    Parameters
    ----------
    movie_id : str
        Key of the movie in the credits mapping.
    top_n : int, optional
        How many entries to take from the start of the ``'cast'`` list (default 3).
    credits : dict, optional
        Mapping of movie id to a credits payload containing a ``'cast'`` list.
        Defaults to the notebook-level ``people_data``.

    Returns
    -------
    list of tuple
        Up to ``top_n`` tuples of ``(movie_id, person_id, name, gender)``; empty
        when the payload has no ``'cast'``.

    Raises
    ------
    KeyError
        If ``movie_id`` is not present in the credits mapping.
    """
    source = people_data if credits is None else credits
    # Index the mapping directly: going through df_movies.loc[...].name only
    # echoed movie_id back and failed outright on a duplicated index.
    billed = source[movie_id].get('cast', [])[:top_n]
    return [(movie_id, member["id"], member["name"], member["gender"]) for member in billed]

In [21]:
# Build lists for directors and cast
# Each list is flat: one (movie_id, person_id, name, gender) tuple per credited
# person, gathered movie by movie in index order.
directors = [record for movie_id in df_movies.index.values for record in get_directors(movie_id)]
cast = [record for movie_id in df_movies.index.values for record in get_cast(movie_id)]

In [27]:
# Turn the flat credit tuples into frames indexed by movie id and tag each with its role.
df_directors = pd.DataFrame.from_records(directors, columns = ["movie_id", "id", "name", "gender"], index = "movie_id")
df_directors["rol"] = "director"

df_cast = pd.DataFrame.from_records(cast, columns = ["movie_id", "id", "name", "gender"], index = "movie_id")
df_cast["rol"] = "actor"

# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the two frames the same way (directors first, then actors).
df_people = pd.concat([df_directors, df_cast])
os.makedirs('backup', exist_ok=True)  # the output directory may not exist yet
df_people.to_csv('backup/df_people.csv')

In [26]:
# Display the combined table of directors and actors.
df_people

Unnamed: 0_level_0,id,name,gender,rol
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tt0113497,4945,Joe Johnston,2,director
tt0114709,7879,John Lasseter,2,director
tt0113497,2157,Robin Williams,2,actor
tt0113497,8537,Jonathan Hyde,2,actor
tt0113497,205,Kirsten Dunst,1,actor
tt0114709,31,Tom Hanks,2,actor
tt0114709,12898,Tim Allen,2,actor
tt0114709,7167,Don Rickles,2,actor
