Import libraries

In [1]:
from bs4 import BeautifulSoup as bs
import requests

TASK 1:
Retrieve the summary of a particular James Bond film presented in the information box

In [35]:
result = requests.get("https://en.wikipedia.org/wiki/Dr._No_(film)")

soup = bs(result.content)
contents = soup.prettify()

contents;

In [34]:
summary = soup.find(class_="infobox vevent")
summary_rows = summary.find_all("tr")

In [4]:
#Create empty dictionary for storing info retrieved from summary box
film_summary = {}

#retrieve items one by one in sections with multiple td under one th
def retrieve_content_value_1(data):
    if data.find("a"):
        return [item.get_text(" ",strip=True).replace("\xa0"," ").replace("\n","") for item in row.find_all("a")]
    else:
        return data.get_text(" ",strip=True).replace("\xa0"," ").replace("\n","")

for index, row in enumerate(summary_rows):
    if index == 0:
        film_summary["Film Title"] = row.find("th").get_text(" ",strip=True)
    #index position 1 is skipped because it's the image
    elif index == 1:
        continue
    else:
        content_key = row.find("th").get_text(" ",strip=True)
        content_value = retrieve_content_value_1(row.find("td"))
        film_summary[content_key] = content_value
        
film_summary

#there are two problems with the data that will be cleaned later on
#1. repeated information for Release date
#2. Countries only showing the reference link rather than the actual countries

{'Film Title': 'Dr. No',
 'Directed by': ['Terence Young'],
 'Screenplay by': ['Richard Maibaum', 'Johanna Harwood', 'Berkely Mather'],
 'Based on': ['Dr. No', 'Ian Fleming'],
 'Produced by': ['Harry Saltzman', 'Albert R. Broccoli'],
 'Starring': ['Sean Connery',
  'Ursula Andress',
  'Joseph Wiseman',
  'Jack Lord',
  'Anthony Dawson',
  'Zena Marshall',
  'John Kitzmiller',
  'Eunice Gayson',
  'Bernard Lee'],
 'Cinematography': ['Ted Moore'],
 'Edited by': ['Peter R. Hunt'],
 'Music by': ['Monty Norman'],
 'Production company': ['Eon Productions'],
 'Distributed by': ['United Artists'],
 'Release date': '5 October 1962 ( 1962-10-05 ) (United Kingdom)',
 'Running time': '109 minutes',
 'Countries': ['[1]', '[2]'],
 'Language': 'English',
 'Budget': '$1.1 million',
 'Box office': '$59.5 million'}

TASK 2: 
Retrieve summaries of all James Bond films presented in the information box

In [30]:
result = requests.get("https://en.wikipedia.org/wiki/List_of_James_Bond_films")

soup = bs(result.content)
contents = soup.prettify()

contents;

In [6]:
films = soup.select(".wikitable.plainrowheaders.sortable i")
films[0].a["href"]

'/wiki/Dr._No_(film)'

In [7]:
def retrieve_content_value_2(data):
    if data.find("th"):
        return [item.get_text(" ",strip=True).replace("\xa0"," ") for item in row.find_all("td")]
    elif data.find("li"):
        return [text.replace("\xa0"," ") for text in data.stripped_strings]
    else:
        return data.get_text(" ",strip=True).replace("\xa0"," ")

#for removing unnecessary HTML scripts
def html_tags_cleaning(soup):
    for tags in soup.find_all(["sup","span"]):
        tags.decompose()
    
def retrieve_info_box(url):
    
    result = requests.get(url)
    soup = bs(result.content)
    summary = soup.find(class_="infobox vevent")
    summary_rows = summary.find_all("tr")
    
    html_tags_cleaning(soup)
    
    film_summary = {}
    for index, row in enumerate(summary_rows):
        if index == 0:
            film_summary["Film Title"] = row.find("th").get_text(" ",strip=True)
        else:
            header = row.find("th")
            if header:
                content_key = row.find("th").get_text(" ",strip=True)
                content_value = retrieve_content_value_2(row.find("td"))
                film_summary[content_key] = content_value
            
    return film_summary

In [8]:
retrieve_info_box("https://en.wikipedia.org/wiki/Spectre_(2015_film)")

{'Film Title': 'Spectre',
 'Directed by': 'Sam Mendes',
 'Screenplay by': ['John Logan',
  'Neal Purvis',
  'Robert Wade',
  'Jez Butterworth'],
 'Story by': ['John Logan', 'Neal Purvis', 'Robert Wade'],
 'Based on': 'James Bond by Ian Fleming',
 'Produced by': ['Michael G. Wilson', 'Barbara Broccoli'],
 'Starring': ['Daniel Craig',
  'Christoph Waltz',
  'Léa Seydoux',
  'Ben Whishaw',
  'Naomie Harris',
  'Dave Bautista',
  'Monica Bellucci',
  'Ralph Fiennes'],
 'Cinematography': 'Hoyte van Hoytema',
 'Edited by': 'Lee Smith',
 'Music by': 'Thomas Newman',
 'Production companies': ['Eon Productions',
  'Metro-Goldwyn-Mayer',
  'Columbia Pictures'],
 'Distributed by': 'Sony Pictures Releasing',
 'Release date': ['26 October 2015',
  '(United Kingdom)',
  '6 November 2015',
  '(United States)'],
 'Running time': '148 minutes',
 'Countries': ['United Kingdom', 'United States'],
 'Language': 'English',
 'Budget': '$245–300 million',
 'Box office': '$880.7 million'}

In [9]:
#extract summaries of all James Bond films one by one
result = requests.get("https://en.wikipedia.org/wiki/List_of_James_Bond_films")
soup = bs(result.content)
films = soup.select(".wikitable.plainrowheaders.sortable i a")

wiki_url = "https://en.wikipedia.org/"

film_info_list = []

for index, film in enumerate(films):
    try:
        film_url = film["href"]
        full_url = wiki_url + film_url
        title = film["title"]
    
        film_info_list.append(retrieve_info_box(full_url))
        
    except Exception as exc:
        print(film.get_text())
        print(exc)

TASK 3:
Data cleaning

In [10]:
#running time of each film
print([film["Running time"] for film in film_info_list])

['109 minutes', '115 minutes', '110 minutes', '130 minutes', '117 minutes', '142 minutes', '120 minutes', '121 minutes', '125 minutes', '125 minutes', '126 minutes', '127 minutes', '131 minutes', '131 minutes', '131 minutes', '133 minutes', '130 minutes', '119 minutes', '125 minutes', '133 minutes', '144 minutes', '106 minutes', '143 minutes', '148 minutes', '163 minutes', '131 minutes', '134 minutes']


In [11]:
#convert film running time from str to int
def runtime_to_int(running_time):
    if running_time == "N/A":
        return None
    else:
        value = int(running_time.split(" ")[0])
        return value

for film in film_info_list:
    film["Running time (numeral)"] = runtime_to_int(film.get('Running time'))

In [12]:
print([film.get("Running time (numeral)") for film in film_info_list])

[109, 115, 110, 130, 117, 142, 120, 121, 125, 125, 126, 127, 131, 131, 131, 133, 130, 119, 125, 133, 144, 106, 143, 148, 163, 131, 134]


In [13]:
#testing runtime_to_int function
film_info_list[3]

{'Film Title': 'Thunderball',
 'Directed by': 'Terence Young',
 'Screenplay by': 'Richard Maibaum John Hopkins Original Screenplay: Jack Whittingham',
 'Story by': 'Kevin McClory Jack Whittingham Ian Fleming',
 'Based on': 'Thunderball by Ian Fleming',
 'Produced by': 'Kevin McClory',
 'Starring': 'Sean Connery Claudine Auger Adolfo Celi Luciana Paluzzi Rik Van Nutter',
 'Cinematography': 'Ted Moore',
 'Edited by': 'Peter Hunt Ernest Hosler',
 'Music by': 'John Barry',
 'Production company': 'Eon Productions',
 'Distributed by': 'United Artists',
 'Release date': ['9 December 1965',
  '(Tokyo, premiere)',
  '29 December 1965',
  '(United Kingdom)'],
 'Running time': '130 minutes',
 'Countries': 'United Kingdom United States',
 'Language': 'English',
 'Budget': '$9 million',
 'Box office': '$141.2 million',
 'Running time (numeral)': 130}

In [14]:
#budget of each film
print([film["Budget"] for film in film_info_list])

['$1.1 million', '$2 million', '$3 million', '$9 million', '$9.5 million', '$7 million', '$7.2 million', '$7 million', '$7 million', '$13.5 million', '$34 million', '$28 million', '$27.5 million', '$30 million', '$40 million', '$32 million', '$60 million', '$110 million', '$135 million', '$142 million', '$150 million', '$200–230 million', '$150–200 million', '$245–300 million', '$250–301 million', '$12 million', '$36 million']


In [15]:
#box office of each film
print([film.get("Box office", "N/A") for film in film_info_list])

['$59.5 million', '$79 million', '$125 million', '$141.2 million', '$111.6 million', '$82 million', '$116 million', '$161.8 million', '$97.6 million', '$185.4 million', '$210.3 million', '$195.3 million', '$187.5 million', '$152.4 million', '$191.2 million', '$156.1 million', '$352.1 million', '$333 million', '$361.8 million', '$431.9 million', '$616.5 million', '$589.6 million', '$1.109 billion', '$880.7 million', 'N/A', '$41.7 million', '$160 million']


In [16]:
#import regular expression for converting strings to integer
import re

In [17]:
#testing converting the Budget and Box office from str to int using re
num = r'\d+\.*\d*'
print(re.search(num, "$18.7 million").group())

18.7


In [18]:
#convert budget and box office from str to flt
def money_to_int(money):
    if money == "N/A":
        return None
    else:
        value = float(re.search(num, money).group())
        return value
print(money_to_int("$877 million"))

877.0


In [19]:
for film in film_info_list:
    film["Budget (numeral)"] = money_to_int(film.get('Budget'))
    film["Box office (numeral)"] = money_to_int(film.get('Box office', "N/A"))

In [20]:
#testing money_to_int function
film_info_list[4]

{'Film Title': 'You Only Live Twice',
 'Directed by': 'Lewis Gilbert',
 'Screenplay by': 'Roald Dahl',
 'Additional story material by': ['Harold Jack Bloom'],
 'Based on': 'You Only Live Twice by Ian Fleming',
 'Produced by': 'Harry Saltzman Albert R. Broccoli',
 'Starring': 'Sean Connery',
 'Cinematography': 'Freddie Young',
 'Edited by': 'Peter R. Hunt',
 'Music by': 'John Barry',
 'Production company': 'Eon Productions',
 'Distributed by': 'United Artists',
 'Release date': ['12 June 1967',
  '(London, premiere)',
  '13 June 1967',
  '(United Kingdom)'],
 'Running time': '117 minutes',
 'Countries': 'United Kingdom United States',
 'Languages': 'English Japanese',
 'Budget': '$9.5 million',
 'Box office': '$111.6 million',
 'Running time (numeral)': 117,
 'Budget (numeral)': 9.5,
 'Box office (numeral)': 111.6}

TASK 4: Convert data into csv

In [21]:
import pandas as pd

In [22]:
df = pd.DataFrame(film_info_list)

In [23]:
df.head(3)

Unnamed: 0,Film Title,Directed by,Screenplay by,Based on,Produced by,Starring,Cinematography,Edited by,Music by,Production company,...,Budget (numeral),Box office (numeral),Adaptation by,Country,Story by,Additional story material by,Languages,Additional dialogue by,Production companies,Written by
0,Dr. No,Terence Young,Richard Maibaum Johanna Harwood Berkely Mather,Dr. No by Ian Fleming,Harry Saltzman Albert R. Broccoli,Sean Connery Ursula Andress Joseph Wiseman Jac...,Ted Moore,Peter R. Hunt,Monty Norman,Eon Productions,...,1.1,59.5,,,,,,,,
1,From Russia with Love,Terence Young,Richard Maibaum,"From Russia, with Love by Ian Fleming",Harry Saltzman Albert R. Broccoli,Sean Connery Pedro Armendáriz Lotte Lenya Robe...,Ted Moore,Peter R. Hunt,John Barry,Eon Productions,...,2.0,79.0,[Johanna Harwood],United Kingdom,,,,,,
2,Goldfinger,Guy Hamilton,"[Richard Maibaum, Paul Dehn]",Goldfinger by Ian Fleming,"[Harry Saltzman, Albert R. Broccoli]","[Sean Connery, Honor Blackman, Gert Fröbe, Shi...",Ted Moore,Peter R. Hunt,John Barry,Eon Productions,...,3.0,125.0,,,,,,,,


In [24]:
df.to_csv("bond_film.csv")

TASK 5: Simple analysis of the films

Top 3 longest James Bond films

In [25]:
film_length = df.sort_values(["Running time (numeral)"], ascending=False)
film_length.head(3)

Unnamed: 0,Film Title,Directed by,Screenplay by,Based on,Produced by,Starring,Cinematography,Edited by,Music by,Production company,...,Budget (numeral),Box office (numeral),Adaptation by,Country,Story by,Additional story material by,Languages,Additional dialogue by,Production companies,Written by
24,No Time to Die,Cary Joji Fukunaga,"[Neal Purvis, Robert Wade, Cary Joji Fukunaga,...",James Bond by Ian Fleming,"[Michael G. Wilson, Barbara Broccoli]","[Daniel Craig, Rami Malek, Léa Seydoux, Lashan...",Linus Sandgren,"[Elliot Graham, Tom Cross]",Hans Zimmer,,...,250.0,,,,"[Neal Purvis, Robert Wade, Cary Joji Fukunaga]",,,,"[Metro-Goldwyn-Mayer, Eon Productions]",
23,Spectre,Sam Mendes,"[John Logan, Neal Purvis, Robert Wade, Jez But...",James Bond by Ian Fleming,"[Michael G. Wilson, Barbara Broccoli]","[Daniel Craig, Christoph Waltz, Léa Seydoux, B...",Hoyte van Hoytema,Lee Smith,Thomas Newman,,...,245.0,880.7,,,"[John Logan, Neal Purvis, Robert Wade]",,,,"[Eon Productions, Metro-Goldwyn-Mayer, Columbi...",
20,Casino Royale,Martin Campbell,"[Neal Purvis, Robert Wade, Paul Haggis]",Casino Royale by Ian Fleming,"[Michael G. Wilson, Barbara Broccoli]","[Daniel Craig, Eva Green, Mads Mikkelsen, Jeff...",Phil Méheux,Stuart Baird,David Arnold,,...,150.0,616.5,,,,,,,"[Eon Productions, Metro-Goldwyn-Mayer Pictures...",


In [26]:
df['Profit'] = df['Box office (numeral)'] - df['Budget (numeral)']

Top 3 grossing James Bond films

In [27]:
profit = df.sort_values(['Profit'], ascending=False)
profit.head(3)

Unnamed: 0,Film Title,Directed by,Screenplay by,Based on,Produced by,Starring,Cinematography,Edited by,Music by,Production company,...,Box office (numeral),Adaptation by,Country,Story by,Additional story material by,Languages,Additional dialogue by,Production companies,Written by,Profit
23,Spectre,Sam Mendes,"[John Logan, Neal Purvis, Robert Wade, Jez But...",James Bond by Ian Fleming,"[Michael G. Wilson, Barbara Broccoli]","[Daniel Craig, Christoph Waltz, Léa Seydoux, B...",Hoyte van Hoytema,Lee Smith,Thomas Newman,,...,880.7,,,"[John Logan, Neal Purvis, Robert Wade]",,,,"[Eon Productions, Metro-Goldwyn-Mayer, Columbi...",,635.7
20,Casino Royale,Martin Campbell,"[Neal Purvis, Robert Wade, Paul Haggis]",Casino Royale by Ian Fleming,"[Michael G. Wilson, Barbara Broccoli]","[Daniel Craig, Eva Green, Mads Mikkelsen, Jeff...",Phil Méheux,Stuart Baird,David Arnold,,...,616.5,,,,,,,"[Eon Productions, Metro-Goldwyn-Mayer Pictures...",,466.5
21,Quantum of Solace,Marc Forster,,James Bond by Ian Fleming,"[Michael G. Wilson, Barbara Broccoli]","[Daniel Craig, Olga Kurylenko, Mathieu Amalric...",Roberto Schaefer,"[Matt Chesse, Richard Pearson]",David Arnold,,...,589.6,,,,,,,Eon Productions Metro-Goldwyn-Mayer Pictures C...,"[Paul Haggis, Neal Purvis, Robert Wade]",389.6
