In [1]:
import bs4
from bs4 import BeautifulSoup
import requests

import os

import pandas as pd
import numpy as np

In [2]:
# Base URL of an IMDb title page; the scraping functions append the title id
# (e.g. 'tt0120915') directly after it.
imdb_url = 'https://www.imdb.com/title/'
# Path suffixes appended after the title id to reach the title's sub-pages.
imdb_parental_guide = '/parentalguide'  # page scanned for certifications
imdb_release = '/releaseinfo?ref_=tt_dt_dt'  # page scanned for release dates
imdb_credits = '/fullcredits?ref_=tt_cl_sm'  # page scanned for cast and crew credits

In [3]:
# sheet_name=None loads every sheet of the workbook into a dict keyed by sheet
# name; index_col='code' makes each sheet's 'code' column its index.
xl = pd.read_excel('blank-check-data.xlsx', sheet_name=None, index_col='code')
# Rows of this sheet are later read for 'imdb_id', 'title', 'miniseries_code'
# and 'is_patreon'.
df_films = xl['films']
# Looked up by miniseries code to obtain the miniseries 'name'.
df_miniseries = xl['miniseries']

In [4]:
import datetime
import re

def get_certification_info_for_imdb_id(imdb_id):
    """Return the non-TV United States certificate from a title's parental guide.

    Downloads the parental guide page for ``imdb_id`` and scans the links
    inside the ``<tr id="certifications-list">`` row. Each link's text is
    expected to look like ``"<country>:<rating>"``.

    Args:
        imdb_id: IMDb title id that is appended to ``imdb_url``.

    Returns:
        The rating text of the first ``United States`` entry whose rating does
        not contain ``'TV'``, or ``None`` when the page has no certification
        row or no such entry.
    """
    parental_guide_url = imdb_url + imdb_id + imdb_parental_guide
    print(parental_guide_url)
    response = requests.get(parental_guide_url)
    soup = BeautifulSoup(response.text, "html.parser")

    certifications_row = soup.find('tr', id='certifications-list')
    if certifications_row is None:
        print('no certifications found')
        return None

    for link in certifications_row.find_all('a'):
        # Strip each part so stray whitespace around the link text does not
        # defeat the country comparison.
        parts = [part.strip() for part in link.text.split(':')]
        if len(parts) < 2:
            # A link without a "country:rating" separator carries no rating;
            # skip it instead of failing on parts[1].
            continue
        country, rating = parts[0], parts[1]
        if country == 'United States' and 'TV' not in rating:
            return rating
    return None

def get_release_date_from_info(release_date_info):
    """Parse the release date out of one row of the release dates table.

    Args:
        release_date_info: Table row element; its
            ``<td class="release-date-item__date">`` cell holds the date text.

    Returns:
        A ``datetime.datetime`` when the cell text matches ``'%d %B %Y'``
        (e.g. ``'19 May 1999'``), otherwise ``None``. ``None`` is also
        returned when the row has no date cell.
    """
    date_cell = release_date_info.find('td', class_='release-date-item__date')
    if date_cell is None:
        return None
    try:
        # Surrounding whitespace would make strptime reject an otherwise
        # valid date, so strip it first.
        return datetime.datetime.strptime(date_cell.text.strip(), '%d %B %Y')
    except ValueError:
        # Partial dates such as 'May 1999' or '1999' do not match the format.
        return None
    

def get_release_date_for_imdb_id(imdb_id):
    """Return a release date for a title, preferring the plain USA release.

    Downloads the release info page for ``imdb_id`` and walks the rows of the
    ``release-dates-table-test-only`` table.

    Args:
        imdb_id: IMDb title id that is appended to ``imdb_url``.

    Returns:
        The date of the first row whose country is ``'USA'`` and whose
        attributes cell is marked empty (whatever
        ``get_release_date_from_info`` returns for it). If no such row exists,
        the date of the last other row that could be parsed. ``None`` when
        the page has no release table or no usable row.
    """
    release_url = imdb_url + imdb_id + imdb_release
    print(release_url)
    response = requests.get(release_url)
    soup = BeautifulSoup(response.text, "html.parser")

    release_date_table = soup.find('table', class_='release-dates-table-test-only')
    if release_date_table is None:
        print('no release dates found')
        return None

    default_release = None
    for release_date_info in release_date_table.find_all('tr', class_='release-date-item'):
        country_cell = release_date_info.find('td', class_='release-date-item__country-name')
        # A row without a country cell can never be the USA release, but it
        # may still serve as a fallback date.
        country = country_cell.text.strip() if country_cell is not None else None
        attributes_empty = release_date_info.find(
            'td', class_='release-date-item__attributes--empty') is not None

        release_date = get_release_date_from_info(release_date_info)
        if attributes_empty and country == 'USA':
            return release_date
        if release_date is not None:
            # Only remember dates that parsed, so a later row with a partial
            # date cannot wipe out a usable fallback.
            default_release = release_date
    return default_release
            
def parse_box_office_amount(box_office_block):
    """Split one box office entry into its amount digits and currency text.

    The text of the entry's ``<h4>`` label and of its
    ``<span class="attribute">`` qualifier (when present) is removed, commas
    are dropped and the remainder is trimmed before being split.

    Args:
        box_office_block: Element holding a single box office figure.

    Returns:
        ``(amount, currency)`` as strings: ``amount`` keeps only the digits
        0-9 of the cleaned text, ``currency`` is the cleaned text with those
        digits removed.
    """
    label_tag = box_office_block.find('h4')
    qualifier_tag = box_office_block.find('span', class_='attribute')

    cleaned = box_office_block.text
    # Remove the label first, then the qualifier.
    for tag in (label_tag, qualifier_tag):
        if tag is not None:
            cleaned = cleaned.replace(tag.text, '')
    cleaned = cleaned.replace(',', '').strip()

    digits_only = re.sub("[^0-9]", "", cleaned)
    non_digits = re.sub("[0-9]", "", cleaned)
    return digits_only, non_digits

def get_info_for_imdb_id(imdb_id):
    """Scrape runtime, box office figures, certification and release date.

    Downloads the title's main page for the runtime (the ``<time>`` element
    in the ``<div>`` following the "Technical Specs" heading) and for the
    ``<div>`` entries following the "Box Office" heading. It then calls
    ``get_certification_info_for_imdb_id`` and ``get_release_date_for_imdb_id``
    for the remaining fields.

    Args:
        imdb_id: IMDb title id that is appended to ``imdb_url``.

    Returns:
        dict with the keys ``imdb_id``, ``certification``, ``release_date``,
        ``runtime``, and for each of ``budget``, ``opening_weekend``,
        ``gross_usa`` and ``gross_worldwide`` both the amount and a
        ``<name>_curr`` currency entry. Values that are not found on the page
        are ``None``.
    """
    main_url = imdb_url + imdb_id
    print(main_url)
    response = requests.get(main_url)
    soup = BeautifulSoup(response.text, "html.parser")

    runtime = None
    technical_specs_header = soup.find('h3', text='Technical Specs')
    # Pages without a "Technical Specs" heading simply have no runtime.
    if technical_specs_header is not None:
        runtime_block = technical_specs_header.find_next_sibling('div')
        time_tag = runtime_block.find('time') if runtime_block is not None else None
        if time_tag is not None:
            runtime = int(time_tag.text.replace('min', '').strip())

    # Label text found in a box office entry -> key used in the result.
    box_office_labels = (
        ('Budget:', 'budget'),
        ('Opening Weekend USA:', 'opening_weekend'),
        ('Gross USA:', 'gross_usa'),
        ('Cumulative Worldwide Gross:', 'gross_worldwide'),
    )
    box_office = {}
    for _label, key in box_office_labels:
        box_office[key] = None
        box_office[key + '_curr'] = None

    box_office_header = soup.find('h3', text='Box Office')
    if box_office_header is not None:
        box_office_block = box_office_header.find_next_sibling()
        # find_next_sibling() yields None after the last sibling, so stop on
        # that as well as on the first non-<div> element.
        while box_office_block is not None and box_office_block.name == 'div':
            block_text = box_office_block.text
            for label, key in box_office_labels:
                if label in block_text:
                    amount, currency = parse_box_office_amount(box_office_block)
                    box_office[key] = amount
                    box_office[key + '_curr'] = currency
            box_office_block = box_office_block.find_next_sibling()

    certification = get_certification_info_for_imdb_id(imdb_id)
    release_date = get_release_date_for_imdb_id(imdb_id)

    info = {
        'imdb_id': imdb_id,
        'certification': certification,
        'release_date': release_date,
        'runtime': runtime,
    }
    # Adds the box office keys in a fixed order; the last one is
    # 'gross_worldwide_curr' (previously misspelled 'gross_wordlwide_curr').
    info.update(box_office)
    return info

In [5]:
def process_movie_for_imdb(film_info):
    """Build the scraped record for one film row of the workbook.

    Args:
        film_info: Row providing ``imdb_id``, ``title``, ``miniseries_code``
            and ``is_patreon``.

    Returns:
        The dict from ``get_info_for_imdb_id`` extended with ``title``,
        ``miniseries_code``, ``miniseries_name`` (looked up in
        ``df_miniseries`` by code) and ``is_patreon``.
    """
    record = get_info_for_imdb_id(film_info['imdb_id'])
    record.update({
        'title': film_info['title'],
        'miniseries_code': film_info['miniseries_code'],
        'miniseries_name': df_miniseries.loc[film_info['miniseries_code'], 'name'],
        'is_patreon': film_info['is_patreon'],
    })
    return record

In [None]:
# Scrape IMDb for every film listed in the workbook (three page requests per
# film, so this cell is slow).
film_records = [
    process_movie_for_imdb(film_info)
    for _code, film_info in df_films.iterrows()
]
# Reuse the workbook's 'code' index so the film codes survive; without it the
# new frame gets a 0..n-1 index, which is what would be written back to the
# 'films' sheet under the 'code' label.
df_films = pd.DataFrame(film_records, index=df_films.index)

https://www.imdb.com/title/tt0120915
https://www.imdb.com/title/tt0120915/parentalguide
https://www.imdb.com/title/tt0120915/releaseinfo?ref_=tt_dt_dt
https://www.imdb.com/title/tt1872194
https://www.imdb.com/title/tt1872194/parentalguide
https://www.imdb.com/title/tt1872194/releaseinfo?ref_=tt_dt_dt
https://www.imdb.com/title/tt0121765
https://www.imdb.com/title/tt0121765/parentalguide
https://www.imdb.com/title/tt0121765/releaseinfo?ref_=tt_dt_dt
https://www.imdb.com/title/tt0109770
https://www.imdb.com/title/tt0109770/parentalguide
https://www.imdb.com/title/tt0109770/releaseinfo?ref_=tt_dt_dt
https://www.imdb.com/title/tt0120667
https://www.imdb.com/title/tt0120667/parentalguide
https://www.imdb.com/title/tt0120667/releaseinfo?ref_=tt_dt_dt
https://www.imdb.com/title/tt0486576
https://www.imdb.com/title/tt0486576/parentalguide
https://www.imdb.com/title/tt0486576/releaseinfo?ref_=tt_dt_dt
https://www.imdb.com/title/tt1502712
https://www.imdb.com/title/tt1502712/parentalguide
https:

In [None]:
# Preview the first rows of the scraped film table.
df_films.head()

In [None]:
def extract_imdb_id_from_link(elem):
    """Return the third '/'-separated piece of a link's ``href``.

    For an href such as ``'/name/nm0000184/?ref_=x'`` the pieces are
    ``['', 'name', 'nm0000184', '?ref_=x']``, so the result is the id.

    Args:
        elem: Link element (anything supporting ``elem['href']``).
    """
    href_pieces = elem['href'].split('/')
    return href_pieces[2]

def get_crew_credits(crew_table, table_type):
    """Collect one credit dict per named row of a crew table.

    Args:
        crew_table: Table element whose ``<tr>`` rows are scanned; rows
            without a ``<td class="name">`` cell are ignored.
        table_type: Value stored in each credit's ``info`` field.

    Returns:
        List of dicts with ``name``, ``person_imdb_id`` (taken from the first
        link in the row), ``info`` and ``is_cast`` (always ``False``).
    """
    crew_records = []
    for row in crew_table.find_all('tr'):
        name_cell = row.find('td', class_='name')
        if not name_cell:
            continue
        crew_records.append({
            'name': name_cell.text.strip(),
            'person_imdb_id': extract_imdb_id_from_link(row.find('a')),
            'info': table_type,
            'is_cast': False
        })
    return crew_records

def get_cast_credits(cast_table):
    """Collect one credit dict per performer row of the cast table.

    Args:
        cast_table: Table element whose ``<tr>`` rows are scanned; rows with
            fewer than two ``<td>`` cells are ignored.

    Returns:
        List of dicts with ``name`` and ``person_imdb_id`` (both from the
        second cell), ``info`` (text of the ``character`` cell with newlines
        removed) and ``is_cast`` (always ``True``).
    """
    cast_records = []
    for row in cast_table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) <= 1:
            continue
        performer_cell = cells[1]
        character_cell = row.find('td', class_='character')
        cast_records.append({
            'name': performer_cell.text.strip(),
            'person_imdb_id': extract_imdb_id_from_link(performer_cell.find('a')),
            'info': character_cell.text.replace('\n', '').strip(),
            'is_cast': True
        })
    return cast_records

def get_credit_block_by_header(data_header):
    """Parse the credits table that follows a section header.

    The header with ``id="cast"`` is handled by ``get_cast_credits``. Any
    other header goes to ``get_crew_credits``, labelled with the header text
    after newlines and non-breaking spaces are removed.

    Args:
        data_header: Section header element; its next ``<table>`` sibling
            holds the credits.

    Returns:
        List of credit dicts for that section.
    """
    credits_table = data_header.find_next_sibling('table')
    if data_header.attrs.get('id') == 'cast':
        return get_cast_credits(credits_table)

    section_title = data_header.text.replace('\n', '').replace('\xa0', '').strip()
    return get_crew_credits(credits_table, section_title)


def get_credits_for_imdb_id(imdb_id):
    """Scrape every cast and crew credit from a title's full credits page.

    Args:
        imdb_id: IMDb title id that is appended to ``imdb_url``.

    Returns:
        List of credit dicts, one per person per section, gathered from each
        ``<h4 class="dataHeaderWithBorder">`` section inside
        ``<div id="fullcredits_content">``. An empty list when that container
        is missing from the page.
    """
    credits_url = imdb_url + imdb_id + imdb_credits
    print(credits_url)
    response = requests.get(credits_url)
    soup = BeautifulSoup(response.text, "html.parser")

    fullcredits_content = soup.find('div', id='fullcredits_content')
    if fullcredits_content is None:
        # Return an empty list rather than failing on None.find_all, so the
        # caller's loop over the result keeps working.
        print('no credits found')
        return []

    film_credits = []
    for data_header in fullcredits_content.find_all('h4', class_='dataHeaderWithBorder'):
        # extend() appends in place instead of rebuilding the list per section.
        film_credits.extend(get_credit_block_by_header(data_header))
    return film_credits

In [None]:
# Gather the credits of every film into one flat list, stamping each credit
# with the fields of the film it belongs to.
credits = []
for idx, row in df_films.iterrows():
    film_credits = get_credits_for_imdb_id(row['imdb_id'])
    for credit in film_credits:
        credit.update({
            'film_imdb_id': row['imdb_id'],
            'film_title': row['title'],
            'miniseries_code': row['miniseries_code'],
            'miniseries_name': row['miniseries_name'],
            'is_patreon': row['is_patreon'],
        })
    credits.extend(film_credits)

In [None]:
# Turn the accumulated credit dicts into a table, one row per credit.
df_credits = pd.DataFrame(credits)
print(len(credits))  # number of credits collected across all films
df_credits.head()

In [None]:
# Write the three tables back into the workbook. The context manager saves and
# closes the file even if one of the to_excel calls fails, and replaces
# ExcelWriter.save(), which no longer exists in pandas 2.x.
# NOTE(review): this overwrites the input workbook; only these three sheets
# are written.
with pd.ExcelWriter('blank-check-data.xlsx') as writer:
    df_miniseries.to_excel(writer, sheet_name='miniseries', index_label='code')
    df_films.to_excel(writer, sheet_name='films', index_label='code')
    df_credits.to_excel(writer, sheet_name='credits', index_label='code')