# TheMovieDatabase (TMBD) API

### Import modules

In [1]:
import numpy as np
import pandas as pd

from scipy import spatial

import matplotlib.pyplot as plt

import itertools
import networkx as nx
from pyvis.network import Network

from tqdm import tqdm

import requests
import json
import time

In [2]:
global api_key
api_key = '' #INSERT API KEY

### Custom API functions

In [3]:
def get_db(content_type, start_date, num_pages):
    
    url = f'https://api.themoviedb.org/3/discover/{content_type}?primary_release_date.gte={{}}&api_key={{}}&language=en-UK&page={{}}'
    content_df = pd.DataFrame({})
    for page in tqdm(range(1,num_pages+1)):
        formatted_url = url.format(start_date, api_key, page)
        response = requests.get(formatted_url)
        results = response.json()
        df = pd.json_normalize(results['results'])
        if content_df.empty:
            content_df = df
        else:
            content_df = pd.concat([content_df,df])
        time.sleep(1)
    
    if content_type=='movie':
        content_df = content_df[['id','title','overview']]
    elif content_type=='tv':
        content_df = content_df[['id','name','overview']].rename(columns={'id':'id','name':'title','overview':'overview'})
        
    return content_df


def get_cast(content_type, id):
    
    try:
        url = f'https://api.themoviedb.org/3/{content_type}/{id}/credits?api_key={api_key}&language=en-UK'
        response = requests.get(url)
        results = response.json()
        df = pd.json_normalize(results['cast'])
        return list(df['name'])
    except:
        return 'n/a'
    
    
def get_reviews(content_type, id):
    
    try:
        url = f'https://api.themoviedb.org/3/{content_type}/{id}/reviews?api_key={api_key}&language=en-UK'
        response = requests.get(url)
        results = response.json()
        if results['total_results'] != 0:
            return [result['content'] for result in results['results']]
        else:
            return []
    except:
        return []

### Get content from API

In [4]:
%%time

num_pages = 100
movie_df = get_db('movie','1970-01-01',num_pages)
movie_df['content_type'] = 'movie'
tv_df = get_db('tv','1970-01-01',num_pages)
tv_df['content_type'] = 'tv'
df = pd.concat((movie_df,tv_df))

100%|██████████| 100/100 [03:03<00:00,  1.84s/it]
100%|██████████| 100/100 [02:26<00:00,  1.47s/it]

CPU times: user 3.48 s, sys: 196 ms, total: 3.68 s
Wall time: 5min 30s





### Get cast list from API

In [5]:
df['cast'] = df.apply(lambda x : get_cast(x['content_type'], x['id']), axis=1)
print(f"{round((len(df[df.cast == 'n/a'])/len(df))*100,2)}% of casts unavailable from API")
df = df[df.cast != 'n/a']

3.57% of casts unavailable from API


### Final table

In [6]:
df.shape

(3857, 5)

In [7]:
df.columns

Index(['id', 'title', 'overview', 'content_type', 'cast'], dtype='object')

In [8]:
df[['title','overview','content_type','cast']]

Unnamed: 0,title,overview,content_type,cast
0,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,movie,"[Tom Holland, Zendaya, Benedict Cumberbatch, J..."
1,Encanto,"The tale of an extraordinary family, the Madri...",movie,"[Stephanie Beatriz, María Cecilia Botero, John..."
2,The King's Man,As a collection of history's worst tyrants and...,movie,"[Ralph Fiennes, Harris Dickinson, Gemma Artert..."
3,Scream,Twenty-five years after a streak of brutal mur...,movie,"[Neve Campbell, Courteney Cox, David Arquette,..."
4,The Ice Age Adventures of Buck Wild,The fearless one-eyed weasel Buck teams up wit...,movie,"[Simon Pegg, Vincent Tong, Aaron Harris, Utkar..."
...,...,...,...,...
15,The Wonderful World of Mickey Mouse,It's nothing but fun and excitement for Mickey...,tv,"[Chris Diamantopoulos, Kaitlyn Robrock, Bill F..."
16,Shadow and Bone,In a world cleaved in two by a massive barrier...,tv,"[Jessie Mei Li, Archie Renaux, Freddy Carter, ..."
17,The Little Mermaid,Disney's The Little Mermaid is an American ani...,tv,"[Jodi Benson, Jim Cummings, Samuel E. Wright, ..."
18,Utopia,A group of young adults who met online are mer...,tv,"[John Cusack, Rainn Wilson, Dan Byrd, Cory Mic..."


### Cast similarity analysis

In [9]:
permutations = list(itertools.combinations(df['title'],2))
network_df = pd.DataFrame(permutations, columns=['Content_1','Content_2'])

In [10]:
network_df = network_df.merge(df[['title','cast']], left_on='Content_1', right_on='title').drop('title',axis=1)
network_df = network_df.merge(df[['title','cast']], left_on='Content_2', right_on='title').drop('title',axis=1)

In [11]:
network_df['common'] = network_df.apply(lambda x : set(x['cast_x']).intersection(set(x['cast_y'])), axis=1)
network_df['num_common'] = network_df.apply(lambda x : len(x['common']), axis=1)
network_df['cast_similarity'] = network_df.apply(lambda x : round(x['num_common'] / max(len(x['cast_x']),len(x['cast_y'])),2), axis=1)
network_df

Unnamed: 0,Content_1,Content_2,cast_x,cast_y,common,num_common,cast_similarity
0,Spider-Man: No Way Home,Encanto,"[Tom Holland, Zendaya, Benedict Cumberbatch, J...","[Stephanie Beatriz, María Cecilia Botero, John...",{},0,0.00
1,Spider-Man: No Way Home,The King's Man,"[Tom Holland, Zendaya, Benedict Cumberbatch, J...","[Ralph Fiennes, Harris Dickinson, Gemma Artert...",{Rhys Ifans},1,0.01
2,Encanto,The King's Man,"[Stephanie Beatriz, María Cecilia Botero, John...","[Ralph Fiennes, Harris Dickinson, Gemma Artert...",{},0,0.00
3,Spider-Man: No Way Home,Scream,"[Tom Holland, Zendaya, Benedict Cumberbatch, J...","[Neve Campbell, Courteney Cox, David Arquette,...",{},0,0.00
4,Spider-Man: No Way Home,Scream,"[Tom Holland, Zendaya, Benedict Cumberbatch, J...","[David Arquette, Neve Campbell, Courteney Cox,...",{},0,0.00
...,...,...,...,...,...,...,...
8300489,Les échangistes,Humans,[Pénélope McQuade],"[Katherine Parkinson, Gemma Chan, Emily Berrin...",{},0,0.00
8300490,Third Watch,Humans,"[Coby Bell, Nia Long, Molly Price, Anthony Rui...","[Katherine Parkinson, Gemma Chan, Emily Berrin...",{},0,0.00
8300491,The Wonderful World of Mickey Mouse,Humans,"[Chris Diamantopoulos, Kaitlyn Robrock, Bill F...","[Katherine Parkinson, Gemma Chan, Emily Berrin...",{},0,0.00
8300492,Shadow and Bone,Humans,"[Jessie Mei Li, Archie Renaux, Freddy Carter, ...","[Katherine Parkinson, Gemma Chan, Emily Berrin...",{},0,0.00


### Network analysis

In [15]:
tr = 4

network_df['num_common'] = network_df['num_common'].map(lambda x : x if x >= tr else 0)
network_df = network_df[network_df['num_common']!=0]

network_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Content_1,Content_2,cast_x,cast_y,common,num_common,cast_similarity
15,Scream,Scream,"[Neve Campbell, Courteney Cox, David Arquette,...","[Neve Campbell, Courteney Cox, David Arquette,...","{Sonia Ammar, Boomer Mays, Dylan Minnette, Dre...",32,1.0
16,Scream,Scream,"[Neve Campbell, Courteney Cox, David Arquette,...","[David Arquette, Neve Campbell, Courteney Cox,...","{Neve Campbell, Drew Barrymore, Skeet Ulrich, ...",9,0.22
17,Scream,Scream,"[David Arquette, Neve Campbell, Courteney Cox,...","[Neve Campbell, Courteney Cox, David Arquette,...","{Neve Campbell, Drew Barrymore, Skeet Ulrich, ...",9,0.22
18,Scream,Scream,"[David Arquette, Neve Campbell, Courteney Cox,...","[David Arquette, Neve Campbell, Courteney Cox,...","{Drew Barrymore, Lisa Beach, Courteney Cox, Ja...",41,1.0
1132,Uncharted,Uncharted,"[Tom Holland, Mark Wahlberg, Antonio Banderas,...","[Tom Holland, Mark Wahlberg, Antonio Banderas,...","{Pilou Asbæk, Georgia Goodman, Steven Waddingt...",27,1.0


In [16]:
G = nx.from_pandas_edgelist(network_df, source='Content_1', target='Content_2', edge_attr='cast_similarity')

In [17]:
net = Network(notebook=True)
net.from_nx(G)
net.show('example.html')