### Part 1. Scrapping movie and actor data from TMDB API

In [1]:
import requests
import json
from datetime import datetime
import csv

In [2]:
# movie list
with open('movie_list.txt', encoding='utf8') as file:
    lines = file.readlines()
movie_list = [l.strip() for l in lines]
movie_list = list(set(movie_list))
print(movie_list)
print(len(movie_list))

['Jumanji: Welcome to the Jungle', 'Alice in Wonderland', 'The Croods', "Ocean's Eleven", "The Sorcerer's Apprentice", 'Passengers', 'I Am Legend', 'Pirates of the Caribbean: The Curse of the Black Pearl', 'Furious 7', 'The Iron Giant', 'The Matrix', 'Lara Croft: Tomb Raider', 'The Da Vinci Code', 'Suicide Squad', 'Horrible Bosses', 'Ghost Rider', 'Django Unchained', 'Inglourious Basterds', 'Her', 'Once Upon a Time… in Hollywood', 'Star Wars: Episode I - The Phantom Menace', 'National Treasure', 'War of the Worlds', 'Men in Black', 'Greyhound', 'Marriage Story', 'Bruce Almighty', 'Edward Scissorhands', 'Ghost World', 'The Perks of Being a Wallflower', "We're the Millers", 'The Amazing Spider-Man', 'The Pursuit of Happyness', 'Red Notice', 'The Bling Ring', 'Looper', 'The Departed', 'Se7en', 'The Green Mile', 'Kick-Ass', 'John Wick', 'Léon: The Professional', 'World War Z', 'Unbreakable', 'Mission: Impossible - Ghost Protocol', 'Just Go with It', 'Oblivion', 'The Hunger Games', 'Marley 

In [3]:
# actor list
actor_list = ["Brad Pitt", "Samuel L. Jackson", "Tom Hanks", "Tom Cruise", "Johnny Depp", "Will Smith", "Dwayne Johnson",
              "Matt Damon", "Bruce Willis", "Nicolas Cage", "Keanu Reeves", "Scarlett Johansson", "Jennifer Aniston", 
              "Angelina Jolie", "Emma Watson", "Jennifer Lawrence", "Emma Stone", "Natalie Portman"]

search_people_dir = "https://api.themoviedb.org/3/search/person?query="
get_people_dir = "https://api.themoviedb.org/3/person/"
api_dir = "api_key=******************************&language=en-US"

# scrap actor info
count = 0
actor_info = []
movie_ids = []
for actor in actor_list:
    search_dir = search_people_dir + actor + "&" + api_dir
    response1 = requests.get(search_dir).json()
    print(response1['results'][0]['id'])
    get_dir = get_people_dir + str(response1['results'][0]['id']) + "?" + api_dir
    response2 = requests.get(get_dir).json()
    response2['actor_id'] = response2['id']
    response2.pop('also_known_as', None)
    response2['features'] = []
    print(response2['name'])
    get_credit_dir = get_people_dir + str(response1['results'][0]['id']) + "/movie_credits?" + api_dir
    response3 = requests.get(get_credit_dir).json()
    for m in response3['cast']:
        if m['title'] in movie_list:
            print(m['title'])
            if m['id'] not in movie_ids:
                movie_ids.append(m['id'])
            response2['features'].append({'title': m['title'], 'character': m['character'], 'movie_id': m['id']})
            count += 1
    actor_info.append(response2)
print(count)

287
Brad Pitt
Mr. & Mrs. Smith
The Curious Case of Benjamin Button
World War Z
Fury
Inglourious Basterds
Ocean's Eleven
Fight Club
Se7en
Once Upon a Time… in Hollywood
2231
Samuel L. Jackson
Pulp Fiction
Unbreakable
Glass
Django Unchained
The Avengers
Star Wars: Episode I - The Phantom Menace
Inglourious Basterds
Iron Man
31
Tom Hanks
Forrest Gump
The Da Vinci Code
The Green Mile
Saving Private Ryan
Greyhound
Catch Me If You Can
500
Tom Cruise
War of the Worlds
Minority Report
Mission: Impossible
Mission: Impossible - Ghost Protocol
Oblivion
Mission: Impossible - Rogue Nation
Edge of Tomorrow
85
Johnny Depp
Charlie and the Chocolate Factory
Edward Scissorhands
Pirates of the Caribbean: The Curse of the Black Pearl
Alice in Wonderland
Fantastic Beasts and Where to Find Them
2888
Will Smith
The Pursuit of Happyness
I, Robot
I Am Legend
Suicide Squad
Men in Black
Aladdin
18918
Dwayne Johnson
Baywatch
San Andreas
Jumanji: Welcome to the Jungle
Red Notice
Fast & Furious 6
Furious 7
1892
Mat

In [4]:
# scrap movie info
get_movie_dir = "https://api.themoviedb.org/3/movie/"
movie_info = []
for mid in movie_ids:
    get_dir = get_movie_dir + str(mid) + "?" + api_dir
    response = requests.get(get_dir).json()
    response['movie_id'] = response['id']
    movie_info.append(response)
print(len(movie_info))

90


In [5]:
#actor and movie example data
print(actor_info[0])
print()
print(movie_info[0])

{'adult': False, 'biography': "William Bradley Pitt is an American actor and film producer. He has received multiple awards, including two Golden Globe Awards and an Academy Award for his acting, in addition to another Academy Award and a Primetime Emmy Award as producer under his production company, Plan B Entertainment.\n\nPitt first gained recognition as a cowboy hitchhiker in the road movie Thelma & Louise (1991). His first leading roles in big-budget productions came with the drama films A River Runs Through It (1992) and Legends of the Fall (1994), and the horror film Interview with the Vampire (1994). He gave critically acclaimed performances in the crime thriller Seven (1995) and the science fiction film 12 Monkeys (1995), the latter earning him a Golden Globe Award for Best Supporting Actor and an Academy Award nomination.\n\nHe starred in Fight Club (1999) and the heist film Ocean's Eleven (2001), as well as its sequels, Ocean's Twelve (2004) and Ocean's Thirteen (2007). His 

In [18]:
# movie genre dict
genre_dict = {}
for movie in movie_info:
    genre_l = []
    for g in movie['genres']:
        genre_l.append(g['name'])
    genre_dict[movie['id']] = genre_l
genre_dict[787]

['Action', 'Comedy', 'Drama', 'Thriller']

### Part 2. Load Data into DynamoDB Table

In [6]:
import boto3
from boto3.dynamodb.conditions import Key
import time
from decimal import Decimal

In [7]:
# get DynamoDB resource
# I work this code in local, so I need access key and secret key
# sorry, I can't put this information in the submitted code
dynamodb = boto3.resource('dynamodb', aws_access_key_id='********', 
                          aws_secret_access_key='********',
                          region_name="us-west-2")
list(dynamodb.tables.all())

[dynamodb.Table(name='actors'),
 dynamodb.Table(name='movies'),
 dynamodb.Table(name='yelp-restaurants')]

In [8]:
# put data to actor table in DynamoDB
actor_table = dynamodb.Table('actors')

for actor in actor_info:
    ddb_data = json.loads(json.dumps(actor), parse_float=Decimal)

    actor_table.put_item(
        Item=ddb_data
    )

In [9]:
# try a potential query code
response = actor_table.query(
    KeyConditionExpression=Key('actor_id').eq(6384)
)
print(response['Items'][0])

{'deathday': None, 'adult': False, 'actor_id': Decimal('6384'), 'profile_path': '/rRdru6REr9i3WIHv2mntpcgxnoY.jpg', 'imdb_id': 'nm0000206', 'place_of_birth': 'Beirut, Lebanon', 'name': 'Keanu Reeves', 'gender': Decimal('2'), 'features': [{'title': 'Constantine', 'movie_id': Decimal('561'), 'character': 'John Constantine'}, {'title': 'John Wick', 'movie_id': Decimal('245891'), 'character': 'Jonathan "John" Wick'}, {'title': 'The Matrix', 'movie_id': Decimal('603'), 'character': 'Thomas A. Anderson / Neo'}], 'biography': 'Keanu Charles Reeves is a Canadian actor. Reeves is known for his roles in Bill & Ted\'s Excellent Adventure, Speed, Point Break, and The Matrix trilogy as Neo. He has collaborated with major directors such as Stephen Frears (in the 1988 period drama Dangerous Liaisons); Gus Van Sant (in the 1991 independent film My Own Private Idaho); and Bernardo Bertolucci (in the 1993 film Little Buddha). Referring to his 1991 film releases, The New York Times\' critic, Janet Maslin

In [10]:
result = actor_table.get_item(Key={'actor_id': 6384})['Item']
result['name']

'Keanu Reeves'

In [11]:
# put data to movie table in DynamoDB
movie_table = dynamodb.Table('movies')

for movie in movie_info:
    ddb_data = json.loads(json.dumps(movie), parse_float=Decimal)

    movie_table.put_item(
        Item=ddb_data
    )

In [12]:
# try a potential query code
response = movie_table.query(
    KeyConditionExpression=Key('movie_id').eq(954)
)
print(response['Items'][0])

{'movie_id': Decimal('954'), 'original_title': 'Mission: Impossible', 'belongs_to_collection': {'name': 'Mission: Impossible Collection', 'backdrop_path': '/jYl0UuJFcmhymv9ZNO14lPLDY1Z.jpg', 'id': Decimal('87359'), 'poster_path': '/geEjCGfdmRAA1skBPwojcdvnZ8A.jpg'}, 'budget': Decimal('80000000'), 'spoken_languages': [{'name': 'English', 'iso_639_1': 'en', 'english_name': 'English'}, {'name': 'Français', 'iso_639_1': 'fr', 'english_name': 'French'}, {'name': 'Český', 'iso_639_1': 'cs', 'english_name': 'Czech'}], 'status': 'Released', 'imdb_id': 'tt0117060', 'vote_average': Decimal('6.9'), 'backdrop_path': '/pbaAkR1FDvgndTVFgGRIzf9o49r.jpg', 'revenue': Decimal('457731198'), 'id': Decimal('954'), 'poster_path': '/lirMXnE7NoVLDr3qeqf48fR1mk4.jpg', 'production_countries': [{'iso_3166_1': 'US', 'name': 'United States of America'}], 'adult': False, 'runtime': Decimal('110'), 'overview': "When Ethan Hunt, the leader of a crack espionage team whose perilous operation has gone awry with no expla

### Part 3. Format Data and Upload to OpenSearch

In [13]:
from opensearchpy import OpenSearch, RequestsHttpConnection
from requests.auth import HTTPBasicAuth

In [19]:
# put indices in Opensearch
count = 0
for actor in actor_info:
    for f in actor['features']:
        index_json = {
            "actor_id": actor['actor_id'],
            "actor_name": actor['name'],
            "movie_id": f['movie_id'],
            "movie_title": f['title'],
            "movie_character": f['character'],
            "movie_genre": genre_dict[f['movie_id']]
        }
        count += 1
        # print(index_json)
        host = 'https://search-movies-zhkkpomplgb457xjek35mxp5dy.us-west-2.es.amazonaws.com'
        index = 'movies'
        url = str(host + '/' + index + '/' + '_doc')
        headers = {'Content-Type': "application/json", 'Accept': "application/json"}
        r = requests.post(url, json=index_json, headers=headers, auth = HTTPBasicAuth('******', '******'))
        print(r.text)
print(count)

{"_index":"movies","_type":"_doc","_id":"eZsj2n0BoAqYEq1IMmr8","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":0,"_primary_term":1}
{"_index":"movies","_type":"_doc","_id":"epsj2n0BoAqYEq1IN2oB","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":0,"_primary_term":1}
{"_index":"movies","_type":"_doc","_id":"e5sj2n0BoAqYEq1IOGow","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":0,"_primary_term":1}
{"_index":"movies","_type":"_doc","_id":"fJsj2n0BoAqYEq1IOWp1","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":0,"_primary_term":1}
{"_index":"movies","_type":"_doc","_id":"fZsj2n0BoAqYEq1IPGor","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":0,"_primary_term":1}
{"_index":"movies","_type":"_doc","_id":"fpsj2n0BoAqYEq1IPWrc","_version":1,"result":"created","_shards":{"total":2,"successful":1

{"_index":"movies","_type":"_doc","_id":"qZsj2n0BoAqYEq1Ib2oS","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":14,"_primary_term":1}
{"_index":"movies","_type":"_doc","_id":"qpsj2n0BoAqYEq1IcGo0","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":8,"_primary_term":1}
{"_index":"movies","_type":"_doc","_id":"q5sj2n0BoAqYEq1IcWpS","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":4,"_primary_term":1}
{"_index":"movies","_type":"_doc","_id":"rJsj2n0BoAqYEq1Icmp9","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":15,"_primary_term":1}
{"_index":"movies","_type":"_doc","_id":"rZsj2n0BoAqYEq1Ic2rQ","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":5,"_primary_term":1}
{"_index":"movies","_type":"_doc","_id":"rpsj2n0BoAqYEq1IdGry","_version":1,"result":"created","_shards":{"total":2,"successful"

{"_index":"movies","_type":"_doc","_id":"2Jsj2n0BoAqYEq1Ipmoh","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":14,"_primary_term":1}
{"_index":"movies","_type":"_doc","_id":"2Zsj2n0BoAqYEq1Ip2pv","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":13,"_primary_term":1}
{"_index":"movies","_type":"_doc","_id":"2psj2n0BoAqYEq1IqGqa","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":15,"_primary_term":1}
{"_index":"movies","_type":"_doc","_id":"25sj2n0BoAqYEq1IqWro","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":20,"_primary_term":1}
{"_index":"movies","_type":"_doc","_id":"3Jsj2n0BoAqYEq1Iq2pC","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":16,"_primary_term":1}
100


In [24]:
# query by movie_title
host = 'https://search-movies-zhkkpomplgb457xjek35mxp5dy.us-west-2.es.amazonaws.com'
index = 'movies'
url = str(host + '/' + index + '/' + '_search')
headers = {'Content-Type': "application/json", 'Accept': "application/json"}
search_json = {
    "query": {
        "match": {
          "movie_title": "John Wick"
        }
    }
}
r = requests.get(url, json=search_json, headers=headers, auth = HTTPBasicAuth('******', '*******'))
response_dict = json.loads(r.text)
hits = response_dict['hits']['hits']
hits[0]['_source']

{'actor_id': 6384,
 'actor_name': 'Keanu Reeves',
 'movie_id': 245891,
 'movie_title': 'John Wick',
 'movie_character': 'Jonathan "John" Wick',
 'movie_genre': ['Action', 'Thriller']}

In [26]:
# query by actor_name
host = 'https://search-movies-zhkkpomplgb457xjek35mxp5dy.us-west-2.es.amazonaws.com'
index = 'movies'
url = str(host + '/' + index + '/' + '_search')
headers = {'Content-Type': "application/json", 'Accept': "application/json"}
search_json = {
    "query": {
        "match": {
          "actor_name": "Brad Pitt"
        }
    }
}
r = requests.get(url, json=search_json, headers=headers, auth = HTTPBasicAuth('******', '*******'))
response_dict = json.loads(r.text)
hits = response_dict['hits']['hits']
print(len(hits))
hits[0]['_source']

9


{'actor_id': 287,
 'actor_name': 'Brad Pitt',
 'movie_id': 4922,
 'movie_title': 'The Curious Case of Benjamin Button',
 'movie_character': 'Benjamin Button',
 'movie_genre': ['Drama', 'Fantasy', 'Romance']}

In [31]:
# query by movie genre
host = 'https://search-movies-zhkkpomplgb457xjek35mxp5dy.us-west-2.es.amazonaws.com'
index = 'movies'
url = str(host + '/' + index + '/' + '_search')
headers = {'Content-Type': "application/json", 'Accept': "application/json"}
search_json = {
    "query": {
        "match": {
            "movie_genre": "Drama"
        }
    }
}
r = requests.get(url, json=search_json, headers=headers, auth = HTTPBasicAuth('*******', '*******'))
response_dict = json.loads(r.text)
hits = response_dict['hits']['hits']
print(len(hits))
for h in hits:
    print(h['_source'])

10
{'actor_id': 10990, 'actor_name': 'Emma Watson', 'movie_id': 84892, 'movie_title': 'The Perks of Being a Wallflower', 'movie_character': 'Sam', 'movie_genre': ['Drama']}
{'actor_id': 1245, 'actor_name': 'Scarlett Johansson', 'movie_id': 1548, 'movie_title': 'Ghost World', 'movie_character': 'Rebecca', 'movie_genre': ['Comedy', 'Drama']}
{'actor_id': 10990, 'actor_name': 'Emma Watson', 'movie_id': 96936, 'movie_title': 'The Bling Ring', 'movie_character': 'Nicki Moore', 'movie_genre': ['Drama', 'Crime']}
{'actor_id': 1892, 'actor_name': 'Matt Damon', 'movie_id': 489, 'movie_title': 'Good Will Hunting', 'movie_character': 'Will Hunting', 'movie_genre': ['Drama']}
{'actor_id': 2888, 'actor_name': 'Will Smith', 'movie_id': 1402, 'movie_title': 'The Pursuit of Happyness', 'movie_character': 'Chris Gardner', 'movie_genre': ['Drama']}
{'actor_id': 31, 'actor_name': 'Tom Hanks', 'movie_id': 857, 'movie_title': 'Saving Private Ryan', 'movie_character': 'Captain John H. Miller', 'movie_genre'

In [48]:
# query by movie genre
host = 'https://search-movies-zhkkpomplgb457xjek35mxp5dy.us-west-2.es.amazonaws.com'
index = 'movies'
url = str(host + '/' + index + '/' + '_search')
headers = {'Content-Type': "application/json", 'Accept': "application/json"}
search_json = {
    "query": {
        "bool": {
          "must": [
            {
              "match": {
                "actor_name": "Brad Pitt"
              }
            },
            {
              "match": {
                "movie_genre": "drama"
              }
            }
          ]
        }
    }
}
r = requests.get(url, json=search_json, headers=headers, auth = HTTPBasicAuth('*******', '*******'))
response_dict = json.loads(r.text)
print(response_dict)
hits = response_dict['hits']['hits']
print(len(hits))
for h in hits:
    print(h['_source'])

{'took': 10, 'timed_out': False, '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 7, 'relation': 'eq'}, 'max_score': 6.5736403, 'hits': [{'_index': 'movies', '_type': '_doc', '_id': 'epsj2n0BoAqYEq1IN2oB', '_score': 6.5736403, '_source': {'actor_id': 287, 'actor_name': 'Brad Pitt', 'movie_id': 4922, 'movie_title': 'The Curious Case of Benjamin Button', 'movie_character': 'Benjamin Button', 'movie_genre': ['Drama', 'Fantasy', 'Romance']}}, {'_index': 'movies', '_type': '_doc', '_id': 'e5sj2n0BoAqYEq1IOGow', '_score': 5.702986, '_source': {'actor_id': 287, 'actor_name': 'Brad Pitt', 'movie_id': 72190, 'movie_title': 'World War Z', 'movie_character': 'Gerry Lane', 'movie_genre': ['Action', 'Drama', 'Horror', 'Science Fiction', 'Thriller']}}, {'_index': 'movies', '_type': '_doc', '_id': 'gZsj2n0BoAqYEq1IQWpZ', '_score': 5.2926884, '_source': {'actor_id': 287, 'actor_name': 'Brad Pitt', 'movie_id': 466272, 'movie_title': 'Once Upon a Time… in 

In [15]:
# delete all indices
#host = 'https://search-movies-zhkkpomplgb457xjek35mxp5dy.us-west-2.es.amazonaws.com'
#index = 'movies'
#url = str(host + '/' + index)
#r = requests.delete(url, auth = HTTPBasicAuth('SuperJerry', 'SuperJerry1!'))
#r.text

'{"error":{"root_cause":[{"type":"index_not_found_exception","reason":"no such index [movies]","resource.type":"index_or_alias","resource.id":"movies","index_uuid":"_na_","index":"movies"}],"type":"index_not_found_exception","reason":"no such index [movies]","resource.type":"index_or_alias","resource.id":"movies","index_uuid":"_na_","index":"movies"},"status":404}'