In [1]:
import numpy as np
import pandas as pd
import requests 
from bs4 import BeautifulSoup as bs
import json

## Rough, Single Movie-Page exploration

In [2]:
# Loading Pirates Of Caribbean wikipidea page as poc
poc = requests.get('https://en.wikipedia.org/wiki/Pirates_of_the_Caribbean:_At_World%27s_End')
poc_content = bs(poc.content)

### The table class, infobox-vevent, stores all the movie information 

In [3]:
poc_rows = poc_content.select('table.infobox.vevent')[0].select('tr')

In [4]:
poc_rows[0].get_text(" ", strip=True).replace('\xa0', " ")

"Pirates of the Caribbean: At World's End"

In [5]:
poc_rows[4].find('th').get_text()

'Written by'

In [6]:
poc_rows[16]('sup')

[<sup class="reference" id="cite_ref-BoxOfficeMojo_2-0"><a href="#cite_note-BoxOfficeMojo-2">[2]</a></sup>]

In [7]:
#if poc_rows[4].select('td li'):
    #print('ok')

In [8]:
[text for text in poc_rows[6].find('td').stripped_strings]

['Johnny Depp',
 'Orlando Bloom',
 'Keira Knightley',
 'Stellan Skarsgård',
 'Bill Nighy',
 'Chow Yun-fat',
 'Geoffrey Rush',
 'Jack Davenport',
 'Kevin R. McNally',
 'Jonathan Pryce']

In [224]:
def get_infobox(info_rows):
    
    def remove_tag(tag, row):           # this function removes the content present inside the specified tag.
        for tag in row(tag):
            tag.decompose()
        
        if row.find(attrs={'colspan': 2}):
            row.decompose()
        else:
            pass
        
        return row
        
        
        
    
    def get_value(row):
        row = remove_tag(['span', 'sup'], row)
        
        if row.find('br'):
            return [text.replace('\xa0', ' ') for text in row.stripped_strings]
        
        elif row.find('li'):
            return [value.get_text(' ', strip=True).replace("\xa0", ' ') for value in row.select('li')]
        
        else:
            return row.get_text(' ', strip=True).replace('\xa0', ' ')
        
    
    movie_dict = {}
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_dict['Title'] = row.get_text().replace('\xa0', ' ')
        elif index == 1:
            pass
        else:
            if row.find('th'):
                key = row.find('th').get_text(" ", strip=True).replace('\xa0', ' ')
                value = get_value(row.find('td'))
                movie_dict[key] = value
            
    return movie_dict

In [225]:
[text for text in poc_rows[5].find('td').stripped_strings]

['Characters',
 'by',
 'Ted Elliott',
 'Terry Rossio',
 'Stuart Beattie',
 'Jay Wolpert',
 'Pirates of the Caribbean',
 'by',
 'Walt Disney']

In [226]:
get_infobox(poc_rows)

{'Title': "Pirates of the Caribbean:  At World's End",
 'Directed by': 'Gore Verbinski',
 'Produced by': 'Jerry Bruckheimer',
 'Written by': ['Ted Elliott', 'Terry Rossio'],
 'Based on': ['Characters',
  'by',
  'Ted Elliott',
  'Terry Rossio',
  'Stuart Beattie',
  'Jay Wolpert',
  'Pirates of the Caribbean',
  'by',
  'Walt Disney'],
 'Starring': ['Johnny Depp',
  'Orlando Bloom',
  'Keira Knightley',
  'Stellan Skarsgård',
  'Bill Nighy',
  'Chow Yun-fat',
  'Geoffrey Rush',
  'Jack Davenport',
  'Kevin R. McNally',
  'Jonathan Pryce'],
 'Music by': 'Hans Zimmer',
 'Cinematography': 'Dariusz Wolski',
 'Edited by': ['Craig Wood', 'Stephen Rivkin'],
 'Production companies': ['Walt Disney Pictures', 'Jerry Bruckheimer Films'],
 'Distributed by': 'Buena Vista Pictures',
 'Release date': ['May 19, 2007 ( Disneyland Resort )',
  'May 25, 2007 (United States)'],
 'Running time': '168 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$300 million',
 'Box office': '$

In [227]:
test_page = requests.get(movies_dict[movie_rows[325].get_text()])
test_page_content = bs(test_page.content)
test_page_rows = test_page_content.select('table.infobox.vevent')[0].select('tr')

In [228]:
get_infobox(test_page_rows)

{'Title': 'Up',
 'Directed by': 'Pete Docter',
 'Produced by': 'Jonas Rivera',
 'Screenplay by': ['Bob Peterson', 'Pete Docter'],
 'Story by': ['Pete Docter', 'Bob Peterson', 'Tom McCarthy'],
 'Starring': ['Ed Asner', 'Christopher Plummer', 'Jordan Nagai'],
 'Music by': 'Michael Giacchino',
 'Cinematography': ['Patrick Lin', 'Jean-Claudie Kalache'],
 'Edited by': 'Kevin Nolting',
 'Production company': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release date': ['May 29, 2009'],
 'Running time': '96 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$175 million',
 'Box office': '$735.1 million'}

In [76]:
main_page_url = 'https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films'
main_page = requests.get(main_page_url)
main_page_content = bs(main_page.content)
movie_rows = main_page_content.select('table.wikitable.sortable i a')

In [77]:
# movie_rows

In [78]:
def get_movie_dict():
    movies_dict = {}
    base_url = 'https://en.wikipedia.org/'
    for movie in movie_rows:
        movies_dict[movie.get_text()] = base_url + movie['href']
    
    return movies_dict

movies_dict = get_movie_dict()

In [79]:
def get_movies_infobox(movies_dict):
    movie_infoboxes = []
    for index, movie in enumerate(movies_dict):
        try:
            if index % 10 == 0:
                print(index)
            movie_url = movies_dict[movie]
            movie_page = requests.get(movie_url)
            movie_content = bs(movie_page.content)
            movie_inforows = movie_content.select('table.infobox.vevent')[0].select('tr')
            movie_infobox = get_infobox(movie_inforows)
            movie_infoboxes.append(movie_infobox)
   
        except Exception as e:
            print('Index:- ', index, 'Movie Name:- ', movie)
            print(e)
        
    return movie_infoboxes
    

In [80]:
# movies_dict = get_movie_dict()

In [81]:
movie_infoboxes = get_movies_infobox(movies_dict)

0
10
20
30
40
Index:-  43 Movie Name:-  Zorro the Avenger
'NoneType' object is not callable
Index:-  48 Movie Name:-  The Sign of Zorro
'NoneType' object is not callable
50
60
70
80
90
100
110
120
Index:-  124 Movie Name:-  True-Life Adventures
list index out of range
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
Index:-  425 Movie Name:-  Luca
list index out of range
430


In [82]:
len(movie_infoboxes)

429

### Saving Data in json

In [83]:
import json

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [84]:
save_data("disney_data_new.json", movie_infoboxes)

### Loading Data

In [2]:
import json
def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)
    
movies_infoboxes = load_data('disney_data_new.json')

## Subtasks

1. ~~Clean up References [1]~~
2. ~~Convert running time into an integer~~
3. ~~Convert dates into Datetime object~~
4. ~~Split up the long strings~~
5. ~~Convert Budget and Box Office to numbers.~~

### Converting time into integer

In [3]:
import re
def time_to_int(time):
    if time == 'N/A':
        return None
    
    elif type(time) == str:
        return int(re.search('[1-9]+', time).group())
    
    else:
        return int(re.search('[1-9]+', time[0]).group())
        
for movie in movies_infoboxes:
    movie['Running time'] = time_to_int(movie.get('Running time', 'N/A'))


In [4]:
# for index, movie in enumerate(movies_infoboxes):
#     print(movie.get('Budget', None), '   ', index)

In [5]:
# movies_infoboxes[392]

## Converting 'Budget' and 'Box office' values to int

In [6]:
def budget_office_to_int(budget):
    
    def get_value(x):
        if '$' in x:
            if 'million' in x.lower():
                return float(re.search('\$([0-9,.]+)', x).group(1).replace(',', '')) * 1000000
            elif 'billion' in x.lower():
                return float(re.search('\$([0-9,.]+)', x).group(1).replace(',', '')) * 1000000000
            elif 'thousand' in x.lower():
                return float(re.search('\$([0-9,.]+)', x).group(1).replace(',', '')) * 1000
            else:
                return float(re.search('\$([0-9,.]+)', x).group(1).replace(',', ''))
        else:
            return None
                
    if type(budget) == str:
        value = get_value(budget)
        return value
    
    elif type(budget) == list:
        for item in budget:
            value = get_value(item)
            if value != None:
                return value
        return None
    
    else:
        return None
        
for index, movie in enumerate(movies_infoboxes):
    try:
        if movie['Title'] == 'Ponyo':
            movie['Budget'] = 34000000
            movie['Box office'] = 203200000
        else:
            movie['Budget'] = budget_office_to_int(movie.get('Budget', 'N/A'))
            movie['Box office'] = budget_office_to_int(movie.get('Box office', 'N/A'))
        
    except Exception as e:
        print('Index:-  ', index, '   Exception:- ', e)

In [7]:
# for index, movie in enumerate(movies_infoboxes):
#     print('Index:-  ', index)
#     print('Movie Budget:-  ', movie['Budget'])
#     print('Movie Box Office:- ', movie['Box office'])

## Converting date to pandas datetime object

In [47]:
def date_to_pandas_datetime(date):

    if type(date) == str:
        return re.search('[0-9a-zA-Z\s,]+', date).group().strip()
    elif type(date) == list:
        return re.search('[0-9a-zA-Z\s,]+', date[0]).group().strip()
    else:
        return None

# for index, movie in enumerate(movies_infoboxes):
#     try:
#         movie['Release date'] = date_to_pandas_datetime(movie.get('Release date', None))
#     except Exception as e:
#         print('Index:-  ', index, '  Exception:- ', e)