## Importing the libraries

In [97]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

import warnings
warnings.simplefilter('ignore')

pd.set_option('display.max_colwidth', -1)

## Movie 1 - Interstellar 

In [2]:
url_1 = 'https://en.wikipedia.org/wiki/Interstellar_(film)' # url of the movie

In [3]:
response_1 = requests.get(url_1) # requesting the url to scrape the data
print(response_1.status_code) # printing the status code

200


In [4]:
html_content_1 = response_1.content # getting the html content
html_content_1

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Interstellar (film) - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"685c7577-09f5-4b4d-8d13-d3efe86ea656","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Interstellar_(film)","wgTitle":"Interstellar (film)","wgCurRevisionId":1059895034,"wgRevisionId":1059895034,"wgArticleId":6009939,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 maint: bot: original URL status unknown","Articles with short description","Short description is different from Wikidata","G

In [5]:
parsed_html_1 = BeautifulSoup(html_content_1, 'lxml') # parsing the html content using lxml parser
parsed_html_1

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Interstellar (film) - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"685c7577-09f5-4b4d-8d13-d3efe86ea656","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Interstellar_(film)","wgTitle":"Interstellar (film)","wgCurRevisionId":1059895034,"wgRevisionId":1059895034,"wgArticleId":6009939,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 maint: bot: original URL status unknown","Articles with short description","Short description is different from Wikidata","Good art

### Title 

In [6]:
title_tags_1 = parsed_html_1.find('title') # title tags of the page
title_tags_1

<title>Interstellar (film) - Wikipedia</title>

In [7]:
title_1 = title_tags_1.string # getting the string content
title_1

'Interstellar (film) - Wikipedia'

In [8]:
# we can get only the title 
# using str split()

final_title_1 = title_1.split()[0] # getting the first element in the list from the splitted string
final_title_1

'Interstellar'

### Director

In [9]:
all_td_tags_1 = parsed_html_1.find_all('td', 'infobox-data')
all_td_tags_1[:5]

[<td class="infobox-data"><a href="/wiki/Christopher_Nolan" title="Christopher Nolan">Christopher Nolan</a></td>,
 <td class="infobox-data"><div class="plainlist">
 <ul><li><a href="/wiki/Jonathan_Nolan" title="Jonathan Nolan">Jonathan Nolan</a></li>
 <li>Christopher Nolan</li></ul>
 </div></td>,
 <td class="infobox-data"><div class="plainlist">
 <ul><li><a href="/wiki/Emma_Thomas" title="Emma Thomas">Emma Thomas</a></li>
 <li>Christopher Nolan</li>
 <li><a href="/wiki/Lynda_Obst" title="Lynda Obst">Lynda Obst</a></li></ul>
 </div></td>,
 <td class="infobox-data"><div class="plainlist">
 <ul><li><a href="/wiki/Matthew_McConaughey" title="Matthew McConaughey">Matthew McConaughey</a></li>
 <li><a href="/wiki/Anne_Hathaway" title="Anne Hathaway">Anne Hathaway</a></li>
 <li><a href="/wiki/Jessica_Chastain" title="Jessica Chastain">Jessica Chastain</a></li>
 <li><a href="/wiki/Bill_Irwin" title="Bill Irwin">Bill Irwin</a></li>
 <li><a href="/wiki/Ellen_Burstyn" title="Ellen Burstyn">Ellen B

In [10]:
# we can see that director name is present at index pos 0

final_dir_name_1 = all_td_tags_1[0].string
final_dir_name_1

'Christopher Nolan'

### Cast

In [11]:
# from all_td_tags_1 we can see that
# cast list is present at index pos 3

all_cast_1 = all_td_tags_1[3]
all_cast_1

<td class="infobox-data"><div class="plainlist">
<ul><li><a href="/wiki/Matthew_McConaughey" title="Matthew McConaughey">Matthew McConaughey</a></li>
<li><a href="/wiki/Anne_Hathaway" title="Anne Hathaway">Anne Hathaway</a></li>
<li><a href="/wiki/Jessica_Chastain" title="Jessica Chastain">Jessica Chastain</a></li>
<li><a href="/wiki/Bill_Irwin" title="Bill Irwin">Bill Irwin</a></li>
<li><a href="/wiki/Ellen_Burstyn" title="Ellen Burstyn">Ellen Burstyn</a></li>
<li><a href="/wiki/Michael_Caine" title="Michael Caine">Michael Caine</a></li></ul>
</div></td>

In [12]:
final_cast_1 = [] # empty list
for li_tag in all_cast_1.find_all('li'): # finding all the li tags 
    final_cast_1.append(li_tag.text) # getting the text from the tag
    
print(final_cast_1)

['Matthew McConaughey', 'Anne Hathaway', 'Jessica Chastain', 'Bill Irwin', 'Ellen Burstyn', 'Michael Caine']


### Year of release

In [13]:
# from all_td_tags_1 we can see find that
# release dates are present at index pos 9

release_1 = all_td_tags_1[-6] # using negetive index
release_1

<td class="infobox-data"><div class="plainlist">
<ul><li>October 26, 2014<span style="display:none"> (<span class="bday dtstart published updated">2014-10-26</span>)</span> (<a class="mw-redirect" href="/wiki/TCL_Chinese_Theatre" title="TCL Chinese Theatre">TCL Chinese Theatre</a>)</li>
<li>November 5, 2014<span style="display:none"> (<span class="bday dtstart published updated">2014-11-05</span>)</span> (United States)</li>
<li>November 7, 2014<span style="display:none"> (<span class="bday dtstart published updated">2014-11-07</span>)</span> (United Kingdom)</li></ul>
</div></td>

In [14]:
# since there are multiple dates
# choosing the release date of US

release_dates_1 = release_1.text.split() # getting the text from paresed html and splitting the string 
release_dates_1

['October',
 '26,',
 '2014',
 '(2014-10-26)',
 '(TCL',
 'Chinese',
 'Theatre)',
 'November',
 '5,',
 '2014',
 '(2014-11-05)',
 '(United',
 'States)',
 'November',
 '7,',
 '2014',
 '(2014-11-07)',
 '(United',
 'Kingdom)']

In [15]:
final_release_year_1 = int(release_dates_1[9]) # getting the year present at index pos 9
final_release_year_1

2014

### Gross Earnings

In [16]:
# from all_td_tags we can see that
# gross earnings is present at the last index pos

gross_earnings_1 = all_td_tags_1[-1].text
gross_earnings_1

'$701.8 million[3]'

In [17]:
# we can split to get the numeric part

final_gross_earnings_1 = float(gross_earnings_1.split()[0][1:]) # starting from index 1 to exclude the $ symbol
final_gross_earnings_1

701.8

In [18]:
movie_1 = {'movie_name' : final_title_1, 'director_name' : final_dir_name_1, 'cast' : final_cast_1,
          'year_of_release' : final_release_year_1, 'total_earnings' : final_gross_earnings_1}
movie_1

{'cast': ['Matthew McConaughey',
  'Anne Hathaway',
  'Jessica Chastain',
  'Bill Irwin',
  'Ellen Burstyn',
  'Michael Caine'],
 'director_name': 'Christopher Nolan',
 'movie_name': 'Interstellar',
 'total_earnings': 701.8,
 'year_of_release': 2014}

## Movie 2 - Kill Bill Volume 1 

In [19]:
url_2 = 'https://en.wikipedia.org/wiki/Kill_Bill:_Volume_1' # url of the movie

In [20]:
response_2 = requests.get(url_2) # requesting the url to scrape the data
print(response_2.status_code) # printing the status code

200


In [21]:
html_content_2 = response_2.content # getting the html content
html_content_2

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Kill Bill: Volume 1 - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"c3f44417-532f-4fbe-89d9-c44f4a039907","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Kill_Bill:_Volume_1","wgTitle":"Kill Bill: Volume 1","wgCurRevisionId":1060713946,"wgRevisionId":1060713946,"wgArticleId":301314,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pages with non-numeric formatnum arguments","Articles with short description","Short description matches Wikidata","Use American 

In [22]:
parsed_html_2 = BeautifulSoup(html_content_2, 'lxml') # parsing the html content using lxml parser
parsed_html_2

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Kill Bill: Volume 1 - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"c3f44417-532f-4fbe-89d9-c44f4a039907","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Kill_Bill:_Volume_1","wgTitle":"Kill Bill: Volume 1","wgCurRevisionId":1060713946,"wgRevisionId":1060713946,"wgArticleId":301314,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pages with non-numeric formatnum arguments","Articles with short description","Short description matches Wikidata","Use American English

### Title 

In [23]:
title_tags_2 = parsed_html_2.find('title') # title tags of the page
title_tags_2

<title>Kill Bill: Volume 1 - Wikipedia</title>

In [24]:
title_2 = title_tags_2.string # getting the string content
title_2

'Kill Bill: Volume 1 - Wikipedia'

In [25]:
# we can get only the title 
# using str split()

final_title_2 = title_2.split(' -')[0] # getting the first element in the list from the splitted string
final_title_2

'Kill Bill: Volume 1'

### Director

In [26]:
all_td_tags_2 = parsed_html_2.find_all('td', 'infobox-data')
all_td_tags_2[:5]

[<td class="infobox-data"><a href="/wiki/Quentin_Tarantino" title="Quentin Tarantino">Quentin Tarantino</a></td>,
 <td class="infobox-data">Quentin Tarantino</td>,
 <td class="infobox-data"><div><a href="/wiki/The_Bride_(Kill_Bill)" title="The Bride (Kill Bill)">The Bride</a><br/>by <div class="plainlist" style="display: inline"><ul style="display: inline"><li style="display: inline">Quentin Tarantino</li><li><a href="/wiki/Uma_Thurman" title="Uma Thurman">Uma Thurman</a></li></ul></div></div></td>,
 <td class="infobox-data"><a href="/wiki/Lawrence_Bender" title="Lawrence Bender">Lawrence Bender</a></td>,
 <td class="infobox-data"><div class="plainlist">
 <ul><li>Uma Thurman</li>
 <li><a href="/wiki/Lucy_Liu" title="Lucy Liu">Lucy Liu</a></li>
 <li><a href="/wiki/Vivica_A._Fox" title="Vivica A. Fox">Vivica A. Fox</a></li>
 <li><a href="/wiki/Michael_Madsen" title="Michael Madsen">Michael Madsen</a></li>
 <li><a href="/wiki/Daryl_Hannah" title="Daryl Hannah">Daryl Hannah</a></li>
 <li><

In [27]:
# we can see that director name is present at index pos 0

final_dir_name_2 = all_td_tags_2[0].string
final_dir_name_2

'Quentin Tarantino'

### Cast

In [28]:
# from all_td_tags_1 we can see that
# cast list is present at index pos 4

all_cast_2 = all_td_tags_2[4]
all_cast_2

<td class="infobox-data"><div class="plainlist">
<ul><li>Uma Thurman</li>
<li><a href="/wiki/Lucy_Liu" title="Lucy Liu">Lucy Liu</a></li>
<li><a href="/wiki/Vivica_A._Fox" title="Vivica A. Fox">Vivica A. Fox</a></li>
<li><a href="/wiki/Michael_Madsen" title="Michael Madsen">Michael Madsen</a></li>
<li><a href="/wiki/Daryl_Hannah" title="Daryl Hannah">Daryl Hannah</a></li>
<li><a href="/wiki/David_Carradine" title="David Carradine">David Carradine</a></li>
<li><a href="/wiki/Sonny_Chiba" title="Sonny Chiba">Sonny Chiba</a></li>
<li><a href="/wiki/Julie_Dreyfus" title="Julie Dreyfus">Julie Dreyfus</a></li>
<li><a href="/wiki/Chiaki_Kuriyama" title="Chiaki Kuriyama">Chiaki Kuriyama</a></li>
<li><a href="/wiki/Gordon_Liu" title="Gordon Liu">Gordon Liu</a></li>
<li><a href="/wiki/Michael_Parks" title="Michael Parks">Michael Parks</a></li></ul>
</div></td>

In [29]:
final_cast_2 = [] # empty list
for li_tag in all_cast_2.find_all('li'): # finding all the li tags 
    final_cast_2.append(li_tag.text) # getting the text from the tag
print(final_cast_2)

['Uma Thurman', 'Lucy Liu', 'Vivica A. Fox', 'Michael Madsen', 'Daryl Hannah', 'David Carradine', 'Sonny Chiba', 'Julie Dreyfus', 'Chiaki Kuriyama', 'Gordon Liu', 'Michael Parks']


### Year of Release

In [30]:
# from all_td_tags_1 we can see find that
# release dates are present at index pos 9

release_2 = all_td_tags_2[-6] # using negetive index
release_2

<td class="infobox-data"><div class="plainlist">
<ul><li>October 10, 2003<span style="display:none"> (<span class="bday dtstart published updated">2003-10-10</span>)</span></li></ul>
</div></td>

In [31]:
# since there are multiple dates
# choosing the release date of US

release_dates_2 = release_2.text.split() # getting the text from paresed html and splitting the string 
release_dates_2

['October', '10,', '2003', '(2003-10-10)']

In [32]:
final_release_year_2 = int(release_dates_2[2]) # getting the year present at index pos 2
final_release_year_2

2003

### Gross Earnings

In [33]:
# from all_td_tags we can see that
# gross earnings is present at the last index pos

gross_earnings_2 = all_td_tags_2[-1].text
gross_earnings_2

'$180.9 million[2]'

In [34]:
# we can split to get the numeric part

final_gross_earnings_2 = float(gross_earnings_2.split()[0][1:]) # starting from index 1 to exclude the $ symbol
final_gross_earnings_2

180.9

In [35]:
movie_2 = {'movie_name' : final_title_2, 'director_name' : final_dir_name_2, 'cast' : final_cast_2,
          'year_of_release' : final_release_year_2, 'total_earnings' : final_gross_earnings_2}
movie_2

{'cast': ['Uma Thurman',
  'Lucy Liu',
  'Vivica A. Fox',
  'Michael Madsen',
  'Daryl Hannah',
  'David Carradine',
  'Sonny Chiba',
  'Julie Dreyfus',
  'Chiaki Kuriyama',
  'Gordon Liu',
  'Michael Parks'],
 'director_name': 'Quentin Tarantino',
 'movie_name': 'Kill Bill: Volume 1',
 'total_earnings': 180.9,
 'year_of_release': 2003}

## Movie 3 - Arrival

In [36]:
url_3 = 'https://en.wikipedia.org/wiki/Arrival_(film)' # url of the movie

In [37]:
response_3 = requests.get(url_3) # requesting the url to scrape the data
print(response_3.status_code) # printing the status code

200


In [38]:
html_content_3 = response_3.content # getting the html content
html_content_3

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Arrival (film) - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"36e32352-8ae3-42c6-b43d-146852c6daa2","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Arrival_(film)","wgTitle":"Arrival (film)","wgCurRevisionId":1058934083,"wgRevisionId":1058934083,"wgArticleId":43991244,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 French-language sources (fr)","CS1 Russian-language sources (ru)","All articles lacking reliable references","Articles lacking reliable re

In [39]:
parsed_html_3 = BeautifulSoup(html_content_3, 'lxml') # parsing the html content using lxml parser
parsed_html_3

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Arrival (film) - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"36e32352-8ae3-42c6-b43d-146852c6daa2","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Arrival_(film)","wgTitle":"Arrival (film)","wgCurRevisionId":1058934083,"wgRevisionId":1058934083,"wgArticleId":43991244,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 French-language sources (fr)","CS1 Russian-language sources (ru)","All articles lacking reliable references","Articles lacking reliable reference

### Title

In [40]:
title_tags_3 = parsed_html_3.find('title') # title tags of the page
title_tags_3

<title>Arrival (film) - Wikipedia</title>

In [41]:
title_3 = title_tags_3.string # getting the string content
title_3

'Arrival (film) - Wikipedia'

In [42]:
# we can get only the title 
# using str split()

final_title_3 = title_3.split()[0] # getting the first element in the list from the splitted string
final_title_3

'Arrival'

### Director

In [43]:
all_td_tags_3 = parsed_html_3.find_all('td', 'infobox-data')
all_td_tags_3[:5]

[<td class="infobox-data"><a href="/wiki/Denis_Villeneuve" title="Denis Villeneuve">Denis Villeneuve</a></td>,
 <td class="infobox-data"><a href="/wiki/Eric_Heisserer" title="Eric Heisserer">Eric Heisserer</a></td>,
 <td class="infobox-data">"<a href="/wiki/Story_of_Your_Life" title="Story of Your Life">Story of Your Life</a>"<br/>by <a href="/wiki/Ted_Chiang" title="Ted Chiang">Ted Chiang</a></td>,
 <td class="infobox-data"><div class="plainlist">
 <ul><li><a href="/wiki/Shawn_Levy" title="Shawn Levy">Shawn Levy</a></li>
 <li><a href="/wiki/Dan_Levine" title="Dan Levine">Dan Levine</a></li>
 <li><a href="/wiki/Aaron_Ryder" title="Aaron Ryder">Aaron Ryder</a></li>
 <li><a href="/wiki/David_Linde" title="David Linde">David Linde</a></li></ul>
 </div></td>,
 <td class="infobox-data"><div class="plainlist">
 <ul><li><a href="/wiki/Amy_Adams" title="Amy Adams">Amy Adams</a></li>
 <li><a href="/wiki/Jeremy_Renner" title="Jeremy Renner">Jeremy Renner</a></li>
 <li><a href="/wiki/Forest_Whita

In [44]:
# we can see that director name is present at index pos 0

final_dir_name_3 = all_td_tags_3[0].string
final_dir_name_3

'Denis Villeneuve'

### Cast

In [45]:
# from all_td_tags_1 we can see that
# cast list is present at index pos 4

all_cast_3 = all_td_tags_3[4]
all_cast_3

<td class="infobox-data"><div class="plainlist">
<ul><li><a href="/wiki/Amy_Adams" title="Amy Adams">Amy Adams</a></li>
<li><a href="/wiki/Jeremy_Renner" title="Jeremy Renner">Jeremy Renner</a></li>
<li><a href="/wiki/Forest_Whitaker" title="Forest Whitaker">Forest Whitaker</a></li>
<li><a href="/wiki/Michael_Stuhlbarg" title="Michael Stuhlbarg">Michael Stuhlbarg</a></li>
<li><a href="/wiki/Tzi_Ma" title="Tzi Ma">Tzi Ma</a></li></ul>
</div></td>

In [46]:
final_cast_3 = [] # empty list

for li_tag in all_cast_3.find_all('li'): # finding all the li tags 
    final_cast_3.append(li_tag.text) # getting the text from the tag
print(final_cast_3)

['Amy Adams', 'Jeremy Renner', 'Forest Whitaker', 'Michael Stuhlbarg', 'Tzi Ma']


### Year of Release

In [47]:
# from all_td_tags_1 we can see find that
# release dates are present at index pos 9

release_3 = all_td_tags_3[-6] # using negetive index
release_3

<td class="infobox-data"><div class="plainlist">
<ul><li>September 1, 2016<span style="display:none"> (<span class="bday dtstart published updated">2016-09-01</span>)</span> (<a href="/wiki/73rd_Venice_International_Film_Festival" title="73rd Venice International Film Festival">Venice</a>)</li>
<li>November 11, 2016<span style="display:none"> (<span class="bday dtstart published updated">2016-11-11</span>)</span> (United States)</li></ul>
</div></td>

In [48]:
# since there are multiple dates
# choosing the release date of US

release_dates_3 = release_3.text.split() # getting the text from paresed html and splitting the string 
release_dates_3

['September',
 '1,',
 '2016',
 '(2016-09-01)',
 '(Venice)',
 'November',
 '11,',
 '2016',
 '(2016-11-11)',
 '(United',
 'States)']

In [49]:
final_release_year_3 = int(release_dates_3[-4]) # getting the year present at index pos 7
final_release_year_3

2016

### Gross Earnings

In [50]:
# from all_td_tags we can see that
# gross earnings is present at the last index pos

gross_earnings_3 = all_td_tags_3[-1].text
gross_earnings_3

'$203.4 million[4]'

In [51]:
# we can split to get the numeric part

final_gross_earnings_3 = float(gross_earnings_3.split()[0][1:]) # starting from index 1 to exclude the $ symbol
final_gross_earnings_3

203.4

In [52]:
movie_3 = {'movie_name' : final_title_3, 'director_name' : final_dir_name_3, 'cast' : final_cast_3,
          'year_of_release' : final_release_year_3, 'total_earnings' : final_gross_earnings_3}
movie_3

{'cast': ['Amy Adams',
  'Jeremy Renner',
  'Forest Whitaker',
  'Michael Stuhlbarg',
  'Tzi Ma'],
 'director_name': 'Denis Villeneuve',
 'movie_name': 'Arrival',
 'total_earnings': 203.4,
 'year_of_release': 2016}

## Movie 4 - Schindler's List

In [53]:
url_4 = 'https://en.wikipedia.org/wiki/Schindler%27s_List' # url of the movie

In [54]:
response_4 = requests.get(url_4) # requesting the url to scrape the data
print(response_4.status_code) # printing the status code

200


In [55]:
html_content_4 = response_4.content # getting the html content
html_content_4

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Schindler\'s List - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"4067fb89-3187-45bb-8694-23de643ebdb2","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Schindler\'s_List","wgTitle":"Schindler\'s List","wgCurRevisionId":1060658876,"wgRevisionId":1060658876,"wgArticleId":65834,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description matches Wikidata","Use American English from March 2021","All Wikipedia articles wri

In [56]:
parsed_html_4 = BeautifulSoup(html_content_4, 'lxml') # parsing the html content using lxml parser
parsed_html_4

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Schindler's List - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"4067fb89-3187-45bb-8694-23de643ebdb2","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Schindler's_List","wgTitle":"Schindler's List","wgCurRevisionId":1060658876,"wgRevisionId":1060658876,"wgArticleId":65834,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description matches Wikidata","Use American English from March 2021","All Wikipedia articles written in Am

### Title

In [57]:
title_tags_4 = parsed_html_4.find('title') # title tags of the page
title_tags_4

<title>Schindler's List - Wikipedia</title>

In [58]:
title_4 = title_tags_4.string # getting the string content
title_4

"Schindler's List - Wikipedia"

In [59]:
# we can get only the title 
# using str split()

final_title_4 = title_4.split(' -')[0] # getting the first element in the list from the splitted string
final_title_4

"Schindler's List"

### Director

In [60]:
all_td_tags_4 = parsed_html_4.find_all('td', 'infobox-data')
all_td_tags_4[:5]

[<td class="infobox-data"><a href="/wiki/Steven_Spielberg" title="Steven Spielberg">Steven Spielberg</a></td>,
 <td class="infobox-data"><a href="/wiki/Steven_Zaillian" title="Steven Zaillian">Steven Zaillian</a></td>,
 <td class="infobox-data"><i><a href="/wiki/Schindler%27s_Ark" title="Schindler's Ark">Schindler's Ark</a></i><br/>by <a href="/wiki/Thomas_Keneally" title="Thomas Keneally">Thomas Keneally</a></td>,
 <td class="infobox-data"><div class="plainlist">
 <ul><li>Steven Spielberg</li>
 <li><a href="/wiki/Gerald_R._Molen" title="Gerald R. Molen">Gerald R. Molen</a></li>
 <li><a href="/wiki/Branko_Lustig" title="Branko Lustig">Branko Lustig</a></li></ul>
 </div></td>,
 <td class="infobox-data"><div class="plainlist">
 <ul><li><a href="/wiki/Liam_Neeson" title="Liam Neeson">Liam Neeson</a></li>
 <li><a href="/wiki/Ben_Kingsley" title="Ben Kingsley">Ben Kingsley</a></li>
 <li><a href="/wiki/Ralph_Fiennes" title="Ralph Fiennes">Ralph Fiennes</a></li>
 <li><a href="/wiki/Caroline_G

In [61]:
# we can see that director name is present at index pos 0

final_dir_name_4 = all_td_tags_4[0].string
final_dir_name_4

'Steven Spielberg'

### Cast

In [62]:
# from all_td_tags_1 we can see that
# cast list is present at index pos 4

all_cast_4 = all_td_tags_4[4]
all_cast_4

<td class="infobox-data"><div class="plainlist">
<ul><li><a href="/wiki/Liam_Neeson" title="Liam Neeson">Liam Neeson</a></li>
<li><a href="/wiki/Ben_Kingsley" title="Ben Kingsley">Ben Kingsley</a></li>
<li><a href="/wiki/Ralph_Fiennes" title="Ralph Fiennes">Ralph Fiennes</a></li>
<li><a href="/wiki/Caroline_Goodall" title="Caroline Goodall">Caroline Goodall</a></li>
<li><a href="/wiki/Jonathan_Sagall" title="Jonathan Sagall">Jonathan Sagall</a></li>
<li><a href="/wiki/Embeth_Davidtz" title="Embeth Davidtz">Embeth Davidtz</a></li></ul>
</div></td>

In [63]:
final_cast_4 = [] # empty list
for li_tag in all_cast_4.find_all('li'): # finding all the li tags 
    final_cast_4.append(li_tag.text) # getting the text from the tag
    
print(final_cast_4)

['Liam Neeson', 'Ben Kingsley', 'Ralph Fiennes', 'Caroline Goodall', 'Jonathan Sagall', 'Embeth Davidtz']


### Year of Release

In [64]:
# from all_td_tags_1 we can see find that
# release dates are present at index pos 10

release_4 = all_td_tags_4[10] # using negetive index
release_4

<td class="infobox-data"><div class="plainlist">
<ul><li>November 30, 1993<span style="display:none"> (<span class="bday dtstart published updated">1993-11-30</span>)</span> (<a href="/wiki/Washington,_D.C." title="Washington, D.C.">Washington, D.C.</a>)</li>
<li>December 15, 1993<span style="display:none"> (<span class="bday dtstart published updated">1993-12-15</span>)</span> (United States)</li></ul>
</div></td>

In [65]:
# since there are multiple dates
# choosing the release date of US

release_dates_4 = release_4.text.split() # getting the text from paresed html and splitting the string 
release_dates_4

['November',
 '30,',
 '1993',
 '(1993-11-30)',
 '(Washington,',
 'D.C.)',
 'December',
 '15,',
 '1993',
 '(1993-12-15)',
 '(United',
 'States)']

In [66]:
final_release_year_4 = int(release_dates_4[-4]) # getting the year present at index pos 8
final_release_year_4

1993

### Gross Earnings

In [67]:
# from all_td_tags we can see that
# gross earnings is present at the last index pos

gross_earnings_4 = all_td_tags_4[-1].text
gross_earnings_4

'$322.2\xa0million[3]'

In [68]:
# we can split to get the numeric part

final_gross_earnings_4 = float(gross_earnings_4.split()[0][1:]) # starting from index 1 to exclude the $ symbol
final_gross_earnings_4

322.2

In [69]:
movie_4 = {'movie_name' : final_title_4, 'director_name' : final_dir_name_4, 'cast' : final_cast_4,
          'year_of_release' : final_release_year_4, 'total_earnings' : final_gross_earnings_4}
movie_4

{'cast': ['Liam Neeson',
  'Ben Kingsley',
  'Ralph Fiennes',
  'Caroline Goodall',
  'Jonathan Sagall',
  'Embeth Davidtz'],
 'director_name': 'Steven Spielberg',
 'movie_name': "Schindler's List",
 'total_earnings': 322.2,
 'year_of_release': 1993}

### Movie 5 - Gladiator

In [70]:
url_5 = 'https://en.wikipedia.org/wiki/Gladiator_(2000_film)' # url of the movie

In [71]:
response_5 = requests.get(url_5) # requesting the url to scrape the data
print(response_5.status_code) # printing the status code

200


In [72]:
html_content_5 = response_5.content # getting the html content
html_content_5

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Gladiator (2000 film) - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"cb485683-2220-4e2b-8bb2-db5fe46dd0aa","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Gladiator_(2000_film)","wgTitle":"Gladiator (2000 film)","wgCurRevisionId":1060696449,"wgRevisionId":1060696449,"wgArticleId":3616797,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Webarchive template wayback links","All articles with dead external links","Articles with dead external links from August 

In [73]:
parsed_html_5 = BeautifulSoup(html_content_5, 'lxml') # parsing the html content using lxml parser
parsed_html_5

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Gladiator (2000 film) - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"cb485683-2220-4e2b-8bb2-db5fe46dd0aa","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Gladiator_(2000_film)","wgTitle":"Gladiator (2000 film)","wgCurRevisionId":1060696449,"wgRevisionId":1060696449,"wgArticleId":3616797,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Webarchive template wayback links","All articles with dead external links","Articles with dead external links from August 2021","

### Title

In [74]:
title_tags_5 = parsed_html_5.find('title') # title tags of the page
title_tags_5

<title>Gladiator (2000 film) - Wikipedia</title>

In [75]:
title_5 = title_tags_5.string # getting the string content
title_5

'Gladiator (2000 film) - Wikipedia'

In [76]:
# we can get only the title 
# using str split()

final_title_5 = title_5.split()[0] # getting the first element in the list from the splitted string
final_title_5

'Gladiator'

### Director

In [77]:
all_td_tags_5 = parsed_html_5.find_all('td', 'infobox-data')
all_td_tags_5[:5]

[<td class="infobox-data"><a href="/wiki/Ridley_Scott" title="Ridley Scott">Ridley Scott</a></td>,
 <td class="infobox-data"><div class="plainlist">
 <ul><li><a href="/wiki/David_Franzoni" title="David Franzoni">David Franzoni</a></li>
 <li><a href="/wiki/John_Logan_(writer)" title="John Logan (writer)">John Logan</a></li>
 <li><a href="/wiki/William_Nicholson_(writer)" title="William Nicholson (writer)">William Nicholson</a></li></ul>
 </div></td>,
 <td class="infobox-data">David Franzoni</td>,
 <td class="infobox-data"><div class="plainlist">
 <ul><li><a href="/wiki/Douglas_Wick" title="Douglas Wick">Douglas Wick</a></li>
 <li>David Franzoni</li>
 <li><a href="/wiki/Branko_Lustig" title="Branko Lustig">Branko Lustig</a></li></ul>
 </div></td>,
 <td class="infobox-data"><div class="plainlist">
 <ul><li><a href="/wiki/Russell_Crowe" title="Russell Crowe">Russell Crowe</a></li>
 <li><a href="/wiki/Joaquin_Phoenix" title="Joaquin Phoenix">Joaquin Phoenix</a></li>
 <li><a href="/wiki/Conn

In [78]:
# we can see that director name is present at index pos 0

final_dir_name_5 = all_td_tags_5[0].string
final_dir_name_5

'Ridley Scott'

### Cast

In [79]:
# from all_td_tags_1 we can see that
# cast list is present at index pos 4

all_cast_5 = all_td_tags_5[4]
all_cast_5

<td class="infobox-data"><div class="plainlist">
<ul><li><a href="/wiki/Russell_Crowe" title="Russell Crowe">Russell Crowe</a></li>
<li><a href="/wiki/Joaquin_Phoenix" title="Joaquin Phoenix">Joaquin Phoenix</a></li>
<li><a href="/wiki/Connie_Nielsen" title="Connie Nielsen">Connie Nielsen</a></li>
<li><a href="/wiki/Oliver_Reed" title="Oliver Reed">Oliver Reed</a></li>
<li><a href="/wiki/Derek_Jacobi" title="Derek Jacobi">Derek Jacobi</a></li>
<li><a href="/wiki/Djimon_Hounsou" title="Djimon Hounsou">Djimon Hounsou</a></li>
<li><a href="/wiki/Richard_Harris" title="Richard Harris">Richard Harris</a></li>
<li><a href="/wiki/Tommy_Flanagan_(actor)" title="Tommy Flanagan (actor)">Tommy Flanagan</a></li></ul>
</div></td>

In [80]:
final_cast_5 = [] # empty list

for li_tag in all_cast_5.find_all('li'): # finding all the li tags 
    final_cast_5.append(li_tag.text) # getting the text from the tag
    
print(final_cast_5)

['Russell Crowe', 'Joaquin Phoenix', 'Connie Nielsen', 'Oliver Reed', 'Derek Jacobi', 'Djimon Hounsou', 'Richard Harris', 'Tommy Flanagan']


### Release Year

In [81]:
# from all_td_tags_1 we can see find that
# release dates are present at index pos 9

release_5 = all_td_tags_5[-6] # using negetive index
release_5

<td class="infobox-data"><div class="plainlist">
<ul><li>May 1, 2000<span style="display:none"> (<span class="bday dtstart published updated">2000-05-01</span>)</span> (<a href="/wiki/Los_Angeles" title="Los Angeles">Los Angeles</a>)</li>
<li>May 4, 2000<span style="display:none"> (<span class="bday dtstart published updated">2000-05-04</span>)</span> (Australia)</li>
<li>May 5, 2000<span style="display:none"> (<span class="bday dtstart published updated">2000-05-05</span>)</span> (United States)</li>
<li>May 12, 2000<span style="display:none"> (<span class="bday dtstart published updated">2000-05-12</span>)</span> (United Kingdom)</li></ul>
</div></td>

In [82]:
# since there are multiple dates
# choosing the release date of US

release_dates_5 = release_5.text.split() # getting the text from paresed html and splitting the string 
release_dates_5

['May',
 '1,',
 '2000',
 '(2000-05-01)',
 '(Los',
 'Angeles)',
 'May',
 '4,',
 '2000',
 '(2000-05-04)',
 '(Australia)',
 'May',
 '5,',
 '2000',
 '(2000-05-05)',
 '(United',
 'States)',
 'May',
 '12,',
 '2000',
 '(2000-05-12)',
 '(United',
 'Kingdom)']

In [83]:
final_release_year_5 = int(release_dates_5[13]) # getting the year present at index pos 13
final_release_year_5

2000

### Gross Earnings

In [84]:
# from all_td_tags we can see that
# gross earnings is present at the last index pos

gross_earnings_5 = all_td_tags_5[-1].text
gross_earnings_5

'$460.5 million[5]'

In [85]:
# we can split to get the numeric part

final_gross_earnings_5 = float(gross_earnings_5.split()[0][1:]) # starting from index 1 to exclude the $ symbol
final_gross_earnings_5

460.5

In [86]:
movie_5 = {'movie_name' : final_title_5, 'director_name' : final_dir_name_5, 'cast' : final_cast_5,
          'year_of_release' : final_release_year_5, 'total_earnings' : final_gross_earnings_5}
movie_5

{'cast': ['Russell Crowe',
  'Joaquin Phoenix',
  'Connie Nielsen',
  'Oliver Reed',
  'Derek Jacobi',
  'Djimon Hounsou',
  'Richard Harris',
  'Tommy Flanagan'],
 'director_name': 'Ridley Scott',
 'movie_name': 'Gladiator',
 'total_earnings': 460.5,
 'year_of_release': 2000}

## Combing all the data

In [94]:
movies_df = pd.DataFrame([movie_1, movie_2, movie_3, movie_4, movie_5])
movies_df

Unnamed: 0,movie_name,director_name,cast,year_of_release,total_earnings
0,Interstellar,Christopher Nolan,"[Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Michael Caine]",2014,701.8
1,Kill Bill: Volume 1,Quentin Tarantino,"[Uma Thurman, Lucy Liu, Vivica A. Fox, Michael Madsen, Daryl Hannah, David Carradine, Sonny Chiba, Julie Dreyfus, Chiaki Kuriyama, Gordon Liu, Michael Parks]",2003,180.9
2,Arrival,Denis Villeneuve,"[Amy Adams, Jeremy Renner, Forest Whitaker, Michael Stuhlbarg, Tzi Ma]",2016,203.4
3,Schindler's List,Steven Spielberg,"[Liam Neeson, Ben Kingsley, Ralph Fiennes, Caroline Goodall, Jonathan Sagall, Embeth Davidtz]",1993,322.2
4,Gladiator,Ridley Scott,"[Russell Crowe, Joaquin Phoenix, Connie Nielsen, Oliver Reed, Derek Jacobi, Djimon Hounsou, Richard Harris, Tommy Flanagan]",2000,460.5


### Column Description

* movie_name - Name of the Movie
* director_name	- Name of the Director
* cast - List of the cast
* year_of_release - Year of the release for the movie
* total_earnings - Total earnings in US million dollars

In [88]:
movies_df.to_csv('movies_data.csv') # moving the dataframe to csv file