In [21]:
import numpy as np
import pandas as pd
from scipy import spatial 

import matplotlib.pyplot as plt

import itertools
import networkx as nx
from  pyvis.network import Network

from tqdm import tqdm

import requests
import json
import time


In [22]:
global api_key
api_key = '25293e7b9b64aff7d01c3231f221c6a9'

In [23]:
def get_db(content_type, start_date, num_pages):

    #URL for GET request, updated for each function call.
    URL = f'https://api.themoviedb.org/3/discover/{content_type}?primary_release_date.gte={{}}&api_key={{}}&language=en-UK&page={{}}'

    content_df = pd.DataFrame()

    #Generate a request for each page. 
    for page in tqdm(range(1, num_pages+1)):
        formatted_url = URL.format(start_date, api_key, num_pages)
        response = requests.get(formatted_url)

        #Convert the response into the JSON data strucutre. 
        results = response.json()
        df = pd.json_normalize(results['results'])

        #Append data of each request into main dataframe. 
        if content_df.empty:
            content_df = df
        else:
            content_df = pd.concat([content_df, df])
        
        time.sleep(1) #too many requests at once will lead to timeout. 

    #Format the requests for consistency
    if content_type == "movie":
        content_df = content_df[['id', 'title', 'overview']]

    elif content_type == "tv":
        content_df = content_df[['id', 'name', 'overview']].rename(columns={'id':'id', 'name': 'title', 'overview':'overview'})

    return content_df


def get_cast(content_type, id):
    
    try:
        #Get request 
        url = f'https://api.themoviedb.org/3/{content_type}/{id}/credits?api_key={api_key}&language=en-UK'
        response = requests.get(url)
        
        #Convert the response into JSON data structure and transform into a table. 
        results = response.json()
        df = pd.json_normalize(results['cast'])
        
        return list(df['name'])
    
    except:
        return 'n/a'
    
    

def get_reviews(content_type, id):
    
    try:
        #Get request
        url = f'https://api.themoviedb.org/3/{content_type}/{id}/reviews?api_key={api_key}&language=en-UK'
        response = requests.get(url)

        #Convert the response into JSON data structure and transform into a table.
        results = response.json()

        if results['total_results'] != 0:
            return [result['content'] for result in results['results']]
        
        else:
            return []
        
    except:
        return []

In [24]:
%%time

num_pages = 10

#Get data on movies.
movie_df = get_db('movie','1970-01-01',num_pages)
movie_df['content_type'] = 'movie'

#Get data on tv shows.
tv_df = get_db('tv','1970-01-01',num_pages)
tv_df['content_type'] = 'tv'

df = pd.concat((movie_df,tv_df))

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:11<00:00,  1.11s/it]
100%|██████████| 10/10 [00:11<00:00,  1.11s/it]

CPU times: total: 406 ms
Wall time: 22.3 s





In [25]:
#Get cast data for every TV and Movie data point. 
df['cast'] = df.apply(lambda x : get_cast(x['content_type'], x['id']), axis=1)

#Compute number of missing cast data
print(f"{round((len(df[df.cast == 'n/a'])/len(df))*100,2)}% of casts unavailable from API")
df = df[df.cast != 'n/a']

2.5% of casts unavailable from API


In [26]:
df.shape
df.columns

Index(['id', 'title', 'overview', 'content_type', 'cast'], dtype='object')

In [27]:
df[['title','overview','content_type','cast']]

Unnamed: 0,title,overview,content_type,cast
0,Heroic,"Luis, an 18-year-old boy with Indigenous roots...",movie,"[Santiago Sandoval, Fernando Cuautle, Mónica d..."
1,M3GAN,A brilliant toy company roboticist uses artifi...,movie,"[Allison Williams, Violet McGraw, Ronny Chieng..."
2,Harry Potter and the Chamber of Secrets,"Cars fly, trees fight back, and a mysterious h...",movie,"[Daniel Radcliffe, Rupert Grint, Emma Watson, ..."
3,Greenland,"John Garrity, his estranged wife and their you...",movie,"[Gerard Butler, Morena Baccarin, David Denman,..."
4,Shadow Master,"After being slain by a group of criminals, a m...",movie,"[D.Y. Sao, Layton Matthews, Brian Le, Craig Ng..."
...,...,...,...,...
15,Wednesday,"Wednesday Addams is sent to Nevermore Academy,...",tv,"[Jenna Ortega, Emma Myers, Joy Sunday, Percy H..."
16,Spartacus,"Torn from his homeland and the woman he loves,...",tv,"[Liam McIntyre, Dustin Clare, Manu Bennett, Cy..."
17,Dragon Ball Z,The adventures of Earth's martial arts defende...,tv,"[Masako Nozawa, Ryou Horikawa, Toshio Furukawa..."
18,The X-Files,The exploits of FBI Special Agents Fox Mulder ...,tv,"[David Duchovny, Gillian Anderson, Mitch Pileggi]"


In [28]:
permutations = list(itertools.combinations(df['title'],2))
network_df = pd.DataFrame(permutations, columns=['Content_1','Content_2'])

In [29]:
network_df = network_df.merge(df[['title','cast']], left_on='Content_1', right_on='title').drop('title',axis=1)
network_df = network_df.merge(df[['title','cast']], left_on='Content_2', right_on='title').drop('title',axis=1)

In [30]:
network_df['common'] = network_df.apply(lambda x : set(x['cast_x']).intersection(set(x['cast_y'])), axis=1)
network_df['num_common'] = network_df.apply(lambda x : len(x['common']), axis=1)
network_df['cast_similarity'] = network_df.apply(lambda x : round(x['num_common'] / max(len(x['cast_x']),len(x['cast_y'])),2), axis=1)
network_df

Unnamed: 0,Content_1,Content_2,cast_x,cast_y,common,num_common,cast_similarity
0,Heroic,M3GAN,"[Santiago Sandoval, Fernando Cuautle, Mónica d...","[Allison Williams, Violet McGraw, Ronny Chieng...",{},0,0.0
1,Heroic,M3GAN,"[Santiago Sandoval, Fernando Cuautle, Mónica d...","[Allison Williams, Violet McGraw, Ronny Chieng...",{},0,0.0
2,Heroic,M3GAN,"[Santiago Sandoval, Fernando Cuautle, Mónica d...","[Allison Williams, Violet McGraw, Ronny Chieng...",{},0,0.0
3,Heroic,M3GAN,"[Santiago Sandoval, Fernando Cuautle, Mónica d...","[Allison Williams, Violet McGraw, Ronny Chieng...",{},0,0.0
4,Heroic,M3GAN,"[Santiago Sandoval, Fernando Cuautle, Mónica d...","[Allison Williams, Violet McGraw, Ronny Chieng...",{},0,0.0
...,...,...,...,...,...,...,...
7585495,Modern Family,Modern Family,"[Ed O'Neill, Sofía Vergara, Julie Bowen, Ty Bu...","[Ed O'Neill, Sofía Vergara, Julie Bowen, Ty Bu...","{Ariel Winter, Ed O'Neill, Eric Stonestreet, J...",13,1.0
7585496,Modern Family,Modern Family,"[Ed O'Neill, Sofía Vergara, Julie Bowen, Ty Bu...","[Ed O'Neill, Sofía Vergara, Julie Bowen, Ty Bu...","{Ariel Winter, Ed O'Neill, Eric Stonestreet, J...",13,1.0
7585497,Modern Family,Modern Family,"[Ed O'Neill, Sofía Vergara, Julie Bowen, Ty Bu...","[Ed O'Neill, Sofía Vergara, Julie Bowen, Ty Bu...","{Ariel Winter, Ed O'Neill, Eric Stonestreet, J...",13,1.0
7585498,Modern Family,Modern Family,"[Ed O'Neill, Sofía Vergara, Julie Bowen, Ty Bu...","[Ed O'Neill, Sofía Vergara, Julie Bowen, Ty Bu...","{Ariel Winter, Ed O'Neill, Eric Stonestreet, J...",13,1.0


In [31]:
tr = 4

network_df['num_common'] = network_df['num_common'].map(lambda x : x if x >= tr else 0)
network_df = network_df[network_df['num_common']!=0]

network_df.head()

Unnamed: 0,Content_1,Content_2,cast_x,cast_y,common,num_common,cast_similarity
5500,M3GAN,M3GAN,"[Allison Williams, Violet McGraw, Ronny Chieng...","[Allison Williams, Violet McGraw, Ronny Chieng...","{Cameron Randell, Clinton Randell, Jenna Davis...",26,1.0
5501,M3GAN,M3GAN,"[Allison Williams, Violet McGraw, Ronny Chieng...","[Allison Williams, Violet McGraw, Ronny Chieng...","{Cameron Randell, Clinton Randell, Jenna Davis...",26,1.0
5502,M3GAN,M3GAN,"[Allison Williams, Violet McGraw, Ronny Chieng...","[Allison Williams, Violet McGraw, Ronny Chieng...","{Cameron Randell, Clinton Randell, Jenna Davis...",26,1.0
5503,M3GAN,M3GAN,"[Allison Williams, Violet McGraw, Ronny Chieng...","[Allison Williams, Violet McGraw, Ronny Chieng...","{Cameron Randell, Clinton Randell, Jenna Davis...",26,1.0
5504,M3GAN,M3GAN,"[Allison Williams, Violet McGraw, Ronny Chieng...","[Allison Williams, Violet McGraw, Ronny Chieng...","{Cameron Randell, Clinton Randell, Jenna Davis...",26,1.0


In [32]:
G = nx.from_pandas_edgelist(network_df, source='Content_1', target='Content_2', edge_attr='cast_similarity')

In [35]:
net = Network(notebook=True, cdn_resources='remote')
net.from_nx(G)
net.show('example.html')

example.html
