In [1]:
import requests
from bs4 import BeautifulSoup as bs
import json
import numpy as np
import pandas as pd
import re

In [2]:
# Loading Pirates Of Caribbean wikipidea page, as a test movie
test_movie = requests.get('https://en.wikipedia.org/wiki/Pirates_of_the_Caribbean:_At_World%27s_End')
test_movie_content = bs(test_movie.content)

### Extracting the rows of the infobox-vevent table
This table contains all the information about the movie.

In [3]:
test_movie_rows = test_movie_content.select('table.infobox.vevent')[0].select('tr')
print("Movie Name:- ", test_movie_rows[0].get_text(" ", strip=True).replace('\xa0', " "))

Movie Name:-  Pirates of the Caribbean: At World's End


## get_infobox
get_infobox is a function that takes in the information rows (content of the infobox-vevent class) of a particular movie and returns it's contents in a dictionary format.

In [4]:
def get_infobox(info_rows):
    '''
    Input: 
    info_rows: rows extracted from the infobox-vevent table.
    
    Output:
    movie_info: Information about the movie, in a dictionary format, extracted from the infobox-vevent table.
    '''
    def remove_tag(tag, row):           # this function removes the content present inside the specified tag.
        for tag in row(tag):
            tag.decompose()
        
        if row.find(attrs={'colspan': 2}):
            row.decompose()
        else:
            pass
        
        return row
        
        
        
    
    def get_value(row):
        row = remove_tag(['span', 'sup'], row)
        
        if row.find('br'):
            return [text.replace('\xa0', ' ') for text in row.stripped_strings]
        
        elif row.find('li'):
            return [value.get_text(' ', strip=True).replace("\xa0", ' ') for value in row.select('li')]
        
        else:
            return row.get_text(' ', strip=True).replace('\xa0', ' ')
        
    
    movie_info = {}                                                   # Dictionary to store all the information of the movie
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['Title'] = row.get_text().replace('\xa0', ' ') # Name/Title of the movie
        elif index == 1:
            pass
        else:
            if row.find('th'):
                key = row.find('th').get_text(" ", strip=True).replace('\xa0', ' ')
                value = get_value(row.find('td'))
                movie_info[key] = value
            
    return movie_info

In [5]:
# Glance at the information collected from the infobox-vevent table.
get_infobox(test_movie_rows)

{'Title': "Pirates of the Caribbean:  At World's End",
 'Directed by': 'Gore Verbinski',
 'Produced by': 'Jerry Bruckheimer',
 'Written by': ['Ted Elliott', 'Terry Rossio'],
 'Based on': ['Characters',
  'by',
  'Ted Elliott',
  'Terry Rossio',
  'Stuart Beattie',
  'Jay Wolpert',
  'Pirates of the Caribbean',
  'by',
  'Walt Disney'],
 'Starring': ['Johnny Depp',
  'Orlando Bloom',
  'Keira Knightley',
  'Stellan Skarsgård',
  'Bill Nighy',
  'Chow Yun-fat',
  'Geoffrey Rush',
  'Jack Davenport',
  'Kevin R. McNally',
  'Jonathan Pryce'],
 'Music by': 'Hans Zimmer',
 'Cinematography': 'Dariusz Wolski',
 'Edited by': ['Craig Wood', 'Stephen Rivkin'],
 'Production companies': ['Walt Disney Pictures', 'Jerry Bruckheimer Films'],
 'Distributed by': 'Buena Vista Pictures',
 'Release date': ['May 19, 2007 ( Disneyland Resort )',
  'May 25, 2007 (United States)'],
 'Running time': '168 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$300 million',
 'Box office': '$

## get_movie_dict
This function returns all the movie names (having 'i' and 'a' tag in them) from the main_page_url page.

In [6]:
def get_movie_dict():
    main_page_url = 'https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films'
    main_page = requests.get(main_page_url)
    main_page_content = bs(main_page.content)
    movie_rows = main_page_content.select('table.wikitable.sortable i a')

    movies_dict = {}
    base_url = 'https://en.wikipedia.org/'
    for movie in movie_rows:
        movies_dict[movie.get_text()] = base_url + movie['href']
    
    return movies_dict

movies_dict = get_movie_dict()

## get_movies_infoboxes
This function takes in movies_dict as an argument, and returns the information of the movie contained in the infobox-vevent.

In [8]:
def get_movies_infobox(movies_dict):
    '''
    Input:
        movies_dict : Contains name and link of movies in the form of key value pair, where key refers to name of the movie,
        and it's corresponding value contains the url of the movie.
    
    Output:
        movie_infoboxes: Contains all the information of a movie present in the infobox-vevent class of the movie's page.
    '''
    movie_infoboxes = []
    for index, movie in enumerate(movies_dict):
        try:
            if index % 10 == 0:
                print(index)
            movie_url = movies_dict[movie]
            movie_page = requests.get(movie_url)
            movie_content = bs(movie_page.content)
            movie_inforows = movie_content.select('table.infobox.vevent')[0].select('tr')
            movie_infobox = get_infobox(movie_inforows)
            movie_infoboxes.append(movie_infobox)
   
        except Exception as e:
            print('Index:- ', index, 'Movie Name:- ', movie)
            print(e)
        
    return movie_infoboxes
    

In [9]:
movie_infoboxes = get_movies_infobox(movies_dict)

0
10
20
30
40
Index:-  43 Movie Name:-  Zorro the Avenger
'NoneType' object is not callable
Index:-  48 Movie Name:-  The Sign of Zorro
'NoneType' object is not callable
50
60
70
80
90
100
110
120
Index:-  124 Movie Name:-  True-Life Adventures
list index out of range
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
Index:-  425 Movie Name:-  Luca
list index out of range
430


In [10]:
len(movie_infoboxes)

429

## Saving the collected data to a json object.

In [11]:
import json

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [12]:
#save_data("disney_data_new.json", movie_infoboxes)

## Loading the stored data

In [13]:
import json
def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)
    
movies_infoboxes = load_data('disney_data_new.json')

# Data Pre-Processing
## Subtasks

1. ~~Clean up References [1]~~
2. ~~Convert running time into an integer~~
3. ~~Convert dates into Datetime object~~
4. ~~Split up the long strings~~
5. ~~Convert Budget and Box Office to numbers.~~

### Converting time into integer

In [14]:
import re
def time_to_int(time):
    if time == 'N/A':
        return None
    
    elif type(time) == str:
        return int(re.search('[1-9]+', time).group())
    
    else:
        return int(re.search('[1-9]+', time[0]).group())
        
for movie in movies_infoboxes:
    movie['Running time'] = time_to_int(movie.get('Running time', 'N/A'))


## Converting date to pandas datetime object

In [15]:
def budget_office_to_int(budget):
    
    def get_value(x):
        if '$' in x:
            if 'million' in x.lower():
                return float(re.search('\$([0-9,.]+)', x).group(1).replace(',', '')) * 1000000
            elif 'billion' in x.lower():
                return float(re.search('\$([0-9,.]+)', x).group(1).replace(',', '')) * 1000000000
            elif 'thousand' in x.lower():
                return float(re.search('\$([0-9,.]+)', x).group(1).replace(',', '')) * 1000
            else:
                return float(re.search('\$([0-9,.]+)', x).group(1).replace(',', ''))
        else:
            return None
                
    if type(budget) == str:
        value = get_value(budget)
        return value
    
    elif type(budget) == list:
        for item in budget:
            value = get_value(item)
            if value != None:
                return value
        return None
    
    else:
        return None
        
for index, movie in enumerate(movies_infoboxes):
    try:
        if movie['Title'] == 'Ponyo':
            movie['Budget'] = 34000000
            movie['Box office'] = 203200000
        else:
            movie['Budget'] = budget_office_to_int(movie.get('Budget', 'N/A'))
            movie['Box office'] = budget_office_to_int(movie.get('Box office', 'N/A'))
        
    except Exception as e:
        print('Index:-  ', index, '   Exception:- ', e)

## Converting 'Budget' and 'Box office' values to int

In [16]:
def date_to_pandas_datetime(date):

    if type(date) == str:
        return re.search('[0-9a-zA-Z\s,]+', date).group().strip()
    elif type(date) == list:
        return re.search('[0-9a-zA-Z\s,]+', date[0]).group().strip()
    else:
        return None

# for index, movie in enumerate(movies_infoboxes):
#     try:
#         movie['Release date'] = date_to_pandas_datetime(movie.get('Release date', None))
#     except Exception as e:
#         print('Index:-  ', index, '  Exception:- ', e)

In [19]:
movies_infoboxes[:2]

[{'Title': 'Academy Award Review of Walt Disney Cartoons',
  'Production company': 'Walt Disney Productions',
  'Release date': ['May 19, 1937'],
  'Running time': 41,
  'Country': 'United States',
  'Language': 'English',
  'Box office': 45.472,
  'Budget': None},
 {'Title': 'Snow White and the Seven Dwarfs',
  'Directed by': ['David Hand (supervising)',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Produced by': 'Walt Disney',
  'Written by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': ['Snow White', 'by The', 'Brothers Grimm'],
  'Starring': ['Adriana Caselotti',
   'Lucille La Verne',
   'Harry Stockwell',
   'Roy Atwell',
   'Pinto Colvig',
   'Otis Harlan',
   'Scotty Mattraw',
   'Billy Gilbert',
   'Eddie Collins',
   'Moroni Olsen',
   'Stuart Buchanan'],
  'Music by': ['Frank Churchill', 'Pa

In [18]:
# Saving our pre-processed data
save_data('processed_movie_infoboxes', movies_infoboxes)