In [210]:
import numpy as np
import pandas as pd
import re, requests, json, os

# API credentials. Environment variables take precedence so the keys can live
# outside the notebook.
# NOTE(review): the literal fallbacks below are credentials stored in plain
# text; they are kept only so existing runs keep working. Rotate them and
# remove the fallbacks once the environment variables are set everywhere.
open_api_key = os.environ.get('OMDB_API_KEY', '94f4c06')  # sent to www.omdbapi.com
tmdb_api_key = os.environ.get('TMDB_API_KEY', '31f0da8fbaf03a2e236153216f870b8a')  # sent to api.themoviedb.org

In [205]:
# Load movielens movie dataset, indexed by the first CSV column.
# DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0 is
# the replacement.
# NOTE(review): parseTitleAndYear's patterns are written against titles decoded
# as ISO-8859-1 (two-character sequences such as "Ã©"); confirm before
# changing the encoding here.
df_movielens = pd.read_csv('data/ml-latest/movies.csv', index_col=0, encoding="ISO-8859-1")
# Show a bounded preview instead of the whole frame.
df_movielens.head(10)

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
6,Heat (1995),Action|Crime|Thriller
7,Sabrina (1995),Comedy|Romance
8,Tom and Huck (1995),Adventure|Children
9,Sudden Death (1995),Action
10,GoldenEye (1995),Action|Adventure|Thriller


In [167]:
# Patterns and the replacement table are built once here instead of on every
# call; parseTitleAndYear is applied to every title in the dataset.
_TITLE_PARSER = re.compile(r'([a-zA-Z:Ã©¡³º¨±¶€¤\´\' 0-9,-]+)')
_ARTICLE_PARSER = re.compile(r'.+, ([a-zA-Z]+$)')
_YEAR_PARSER = re.compile(r'.+\((\d*)\)$')
# Maps mis-decoded two-character sequences back to the accented letter.
# NOTE(review): the last key shows up as a bare "Ã"; the second character of a
# mis-decoded "í" would be a soft hyphen, which may be invisible or missing
# here — verify against the real data.
_UNICODES = {"Ã´":"ô","Ã¡":"á","Ã©":"é","Ã³":"ó","Ãº":"ú","Ã¨":"è","Ã±":"ñ","Ã¶":"ö","Ã€":"À","Ã¤":"ä","Ã":"í"}
_REPLACES = re.compile('|'.join(_UNICODES.keys()))


def parseTitleAndYear(t):
    """Turn a raw title such as 'Toy Story (1995)' into query-string fragments.

    Parameters
    ----------
    t : str
        Raw title, optionally followed by a parenthesised year at the very end.
        Leading and trailing whitespace is ignored.

    Returns
    -------
    tuple of (str, str)
        ``(title, year)``. ``title`` is ``'&t=<title>'`` with spaces written as
        ``%20``, or ``''`` when no leading run of accepted characters is found.
        ``year`` is ``'&y=<digits>'``, or ``''`` when the input does not end
        with ``(<digits>)``.
    """
    # The year pattern is anchored at the end of the string, so a trailing
    # space after "(1995)" would otherwise make the year silently disappear.
    t = t.strip()

    title = ''
    raw_title = _TITLE_PARSER.match(t)
    if raw_title is not None:
        # The match stops at the first character outside the accepted set,
        # which in practice is the "(" that opens the year.
        title = raw_title.group(1).strip()
        article = _ARTICLE_PARSER.match(title)
        if article is not None:
            # "Name, The" -> "The Name": drop the trailing ", <word>" (the 2
            # accounts for the comma and the space) and prepend the word.
            # NOTE(review): any trailing ", <word>" is moved, not only
            # grammatical articles.
            article = article.group(1).strip()
            title = article + " " + title[:len(title)-2-len(article)]
        title = _REPLACES.sub(lambda x: _UNICODES[x.group()], title)
        title = title.replace(" ","%20")
        if len(title) > 0:
            title = "&t=" + title

    year = ''
    raw_year = _YEAR_PARSER.match(t)
    if raw_year is not None:
        # The capture group is digits only, so no further escaping is needed.
        year = raw_year.group(1)
        if len(year) > 0:
            year = "&y=" + year
    return (title,year)

def getIMDbID(c):
    """Look up the IMDb ID for a raw movie title through the OMDb API.

    Parameters
    ----------
    c : str
        Raw title, passed to parseTitleAndYear to build the query fragments.

    Returns
    -------
    str or float
        The ``imdbID`` field of the OMDb answer, or ``np.nan`` when no title
        could be parsed, the HTTP status is not 200, or OMDb does not report
        ``"Response": "True"``.

    Raises
    ------
    requests.RequestException
        On network failures, including the request timing out.
    """
    title, year = parseTitleAndYear(c)
    if not title:
        # No title fragment means there is nothing to search for; skip the call.
        return np.nan

    api_call = "http://www.omdbapi.com/?apikey="+open_api_key + title + year
    # Bounded wait: without a timeout one stalled connection blocks the whole run.
    r = requests.get(api_call, timeout=10)
    if r.status_code == 200:
        payload = json.loads(r.text)
        # .get() so an answer without these fields yields NaN instead of KeyError.
        if payload.get("Response") == "True":
            return payload.get('imdbID', np.nan)
    return np.nan

In [224]:
# Build/Load (ML ID - IMDb ID) dataset
if os.path.isfile('backup/df_ml_imdbid.csv'):
    # Cached mapping from an earlier run. DataFrame.from_csv was removed in
    # pandas 1.0; read_csv(index_col=0) replaces it. The file is read with the
    # same encoding it is written with below.
    df_ml_imdbid = pd.read_csv('backup/df_ml_imdbid.csv', index_col=0, encoding="utf-8")

else:
    # Otherwise fetch the data: one getIMDbID call per title.
    df_movielens['imdbid'] = df_movielens['title'].map(getIMDbID)
    df_ml_imdbid = df_movielens.drop(['title','genres'], axis=1)
    # Make sure the cache directory exists before writing into it.
    os.makedirs('backup', exist_ok=True)
    df_ml_imdbid.to_csv('backup/df_ml_imdbid.csv', encoding="utf-8")
# Show a bounded preview instead of the whole frame.
df_ml_imdbid.head(10)

Unnamed: 0_level_0,imdbid
movieId,Unnamed: 1_level_1
1,tt0114709
2,tt0113497


In [278]:
# Manual check: a trailing ", An" should be moved to the front of the title
# and the year returned as a separate "&y=" fragment.
parseTitleAndYear('Ace Ventura: When Nature Calls, An (1995)')

('&t=An%20Ace%20Ventura:%20When%20Nature%20Calls', '&y=1995')

In [240]:
# Build/Load TMDb movie json
if os.path.isfile('backup/movie_data.json'):
    # Cached responses from an earlier run.
    with open('backup/movie_data.json','r') as movie_data_file:
        movie_data = json.load(movie_data_file)

else:
    movie_data = {}
    movie_lookups_failed = 0
    # Rows without an IMDb ID are dropped up front; they used to be requested
    # as the literal string 'nan'.
    for imdb_id in df_ml_imdbid['imdbid'].dropna().astype(str):
        if imdb_id in movie_data:
            # Same ID seen earlier in the frame; one request is enough.
            continue
        response = requests.get(
            'https://api.themoviedb.org/3/movie/' + imdb_id,
            params={'api_key': tmdb_api_key,
                    'language': 'en-US',
                    'append_to_response': 'keywords'},
            timeout=10,  # bounded wait so one stalled connection cannot hang the run
        )
        if response.status_code != 200:
            # Error payloads lack the movie fields that later cells read, so
            # they are counted and left out rather than stored.
            movie_lookups_failed += 1
            continue
        movie_data[imdb_id] = response.json()
    if movie_lookups_failed:
        print('TMDb movie lookups skipped (non-200 response):', movie_lookups_failed)
    # Make sure the cache directory exists before writing into it.
    os.makedirs('backup', exist_ok=True)
    with open('backup/movie_data.json',"w") as movie_data_file:
        json.dump(movie_data, movie_data_file)
# Preview a single entry instead of dumping every response.
dict(list(movie_data.items())[:1])

{'tt0113497': {'adult': False,
  'backdrop_path': '/7k4zEgUZbzMHawDaMc9yIkmY1qR.jpg',
  'belongs_to_collection': {'backdrop_path': '/dsaUpJoxSqYJh96s5Z9hHzoRKQ3.jpg',
   'id': 495527,
   'name': 'Jumanji Collection',
   'poster_path': '/7E2DerSdIVeh1vHwkV1J56Kto3x.jpg'},
  'budget': 65000000,
  'genres': [{'id': 12, 'name': 'Adventure'},
   {'id': 14, 'name': 'Fantasy'},
   {'id': 10751, 'name': 'Family'}],
  'homepage': '',
  'id': 8844,
  'imdb_id': 'tt0113497',
  'keywords': {'keywords': [{'id': 10090, 'name': 'board game'},
    {'id': 10941, 'name': 'disappearance'},
    {'id': 15101, 'name': "based on children's book"},
    {'id': 33467, 'name': 'new home'},
    {'id': 158086, 'name': 'recluse'},
    {'id': 158091, 'name': 'giant insect'}]},
  'original_language': 'en',
  'original_title': 'Jumanji',
  'overview': "When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside

In [279]:
# Generate movie dataset from json: one row per movie_data key, keeping only
# the columns used downstream.
df_movie_data = pd.DataFrame.from_dict(movie_data, orient="index")[['title','genres','keywords','overview']]

# Flatten the nested structures to plain lists of names:
#   keywords: {'keywords': [{'id': ..., 'name': ...}, ...]} -> [name, ...]
#   genres:   [{'id': ..., 'name': ...}, ...]               -> [name, ...]
# Rows where the field is missing (e.g. NaN from an entry without it) get an
# empty list instead of raising.
df_movie_data['keywords'] = df_movie_data['keywords'].map(
    lambda d: [k['name'] for k in d.get('keywords', [])] if isinstance(d, dict) else [])
df_movie_data['genres'] = df_movie_data['genres'].map(
    lambda l: [g['name'] for g in l] if isinstance(l, list) else [])
# Show a bounded preview instead of the whole frame.
df_movie_data.head(10)

Unnamed: 0,title,genres,keywords,overview
tt0113497,Jumanji,"[Adventure, Fantasy, Family]","[board game, disappearance, based on children'...",When siblings Judy and Peter discover an encha...
tt0114709,Toy Story,"[Animation, Comedy, Family]","[jealousy, toy, boy, friendship, friends, riva...","Led by Woody, Andy's toys live happily in his ..."


In [None]:
# Build/Load TMDb people json
if os.path.isfile('backup/people_data.json'):
    # Cached responses from an earlier run.
    with open('backup/people_data.json','r') as people_data_file:
        people_data = json.load(people_data_file)

else:
    people_data = {}
    people_lookups_failed = 0
    # Rows without an IMDb ID are dropped up front; they used to be requested
    # as the literal string 'nan'.
    for imdb_id in df_ml_imdbid['imdbid'].dropna().astype(str):
        if imdb_id in people_data:
            # Same ID seen earlier in the frame; one request is enough.
            continue
        response = requests.get(
            'https://api.themoviedb.org/3/movie/' + imdb_id + '/credits',
            # NOTE(review): append_to_response=keywords is carried over from
            # the movie-details request; confirm it has any effect on /credits.
            params={'api_key': tmdb_api_key,
                    'language': 'en-US',
                    'append_to_response': 'keywords'},
            timeout=10,  # bounded wait so one stalled connection cannot hang the run
        )
        if response.status_code != 200:
            # Keep error payloads out of the cache; count them instead.
            people_lookups_failed += 1
            continue
        people_data[imdb_id] = response.json()
    if people_lookups_failed:
        print('TMDb credits lookups skipped (non-200 response):', people_lookups_failed)
    # Make sure the cache directory exists before writing into it.
    os.makedirs('backup', exist_ok=True)
    with open('backup/people_data.json',"w") as people_data_file:
        json.dump(people_data, people_data_file)
# Preview a single entry instead of dumping every response.
dict(list(people_data.items())[:1])