# Set-up

In [1]:
# load packages
import requests
from bs4 import BeautifulSoup

In [2]:
# Rotten Tomatoes editorial guide page that every later cell scrapes
base_site = (
    "https://editorial.rottentomatoes.com"
    "/guide/140-essential-action-movies-to-watch-now"
)

In [3]:
# Fetch the guide page. Without a timeout, requests.get() can wait forever on a
# stalled connection and hang the whole notebook, so cap the wait at 30 seconds.
response = requests.get(base_site, timeout=30)

# Stop here with a clear HTTPError on a 4xx/5xx answer instead of letting the
# parsing cells below fail later on an error page.
response.raise_for_status()

# Show the HTTP status code as the cell output (200 means the page was served)
response.status_code

200

In [4]:
# Keep the raw response body; the same `html` value is handed to both parsers
# below so their results can be compared on identical input
html = response.content

## Choosing a parser

### html.parser

In [5]:
# Build the Beautiful Soup tree, first trying the 'html.parser' backend
soup = BeautifulSoup(html, features='html.parser')

In [6]:
# Write the prettified tree to disk so the parse result can be inspected by hand
with open('Rotten_tomatoes_page_2_HTML_Parser.html', mode='wb') as file:
    pretty_bytes = soup.prettify('utf-8')
    file.write(pretty_bytes)

In [7]:
# Inspecting the exported file shows that the <html> element is closed near the beginning -- the page was parsed incorrectly!
# Let's try another parser.

### lxml

In [8]:
# Re-parse the same HTML into a BeautifulSoup object, this time with 'lxml'
soup = BeautifulSoup(html, features='lxml')

In [9]:
# Write the lxml parse result to its own file for side-by-side comparison
with open('Rotten_tomatoes_page_2_LXML_Parser.html', mode='wb') as file:
    pretty_bytes = soup.prettify('utf-8')
    file.write(pretty_bytes)

In [10]:
# At first glance, the exported file looks correctly parsed

### A word of caution

In [11]:
# Beautiful Soup ranks the lxml parser as the best one.

# If a parser is not explicitly stated in the Beautiful Soup constructor,
# the best one available on the current machine is chosen.

# This means that the same piece of code can give different results on different computers.

# Obtaining the element containing all the data

In [12]:
# Collect the container <div> of every movie entry on the page
divs = soup.find_all('div', class_='col-sm-18 col-full-xs countdown-item-content')

# Extracting the title and year of each movie

In [13]:
# One <h2> per movie container; it holds the title, year and both scores
headings = [movie_div.find(name='h2') for movie_div in divs]

In [14]:
# Peek at the first heading to see which tags carry the title, year and scores
headings[0]

<h2><a href="https://www.rottentomatoes.com/m/1018009-running_scared">Running Scared</a> <span class="subtle start-year">(1986)</span> <span class="icon tiny rotten" title="Rotten"></span> <span class="tMeterScore">59%</span><span class="span-icon-audience"><img class="icon-audience" height="16" src="https://images.fandango.com/cms/assets/cf921970-d359-11ea-a15f-bdf29fa24277--spilled.png" width="16"/></span><span class="tMeterScore">58%</span></h2>

# Title

In [15]:
# The title is the text of the link inside each heading
movie_names = [h2.find(name='a').string for h2 in headings]

# Year

In [16]:
# The year is the text of the "subtle start-year" span, e.g. "(1986)"
movie_years = [
    h2.find('span', attrs={'class': 'subtle start-year'}).string
    for h2 in headings
]

# Removing the brackets

In [17]:
# Peel the surrounding parentheses off every year: "(1986)" -> "1986"
movie_years = [raw_year.strip('()') for raw_year in movie_years]

In [18]:
# Turn the year strings into integers
movie_years = list(map(int, movie_years))

# Score "TOMATOMETER"

In [19]:
# Tomatometer score: text of the first "tMeterScore" span in each heading
movie_st = [
    h2.find('span', attrs={'class': 'tMeterScore'}).string
    for h2 in headings
]

# Score "AUDIENCE"

In [20]:
# Audience score: the second "tMeterScore" span inside each heading.
# Index it directly rather than chaining find_next(): find_next() keeps walking
# the document past the end of the <h2>, so a heading without an audience score
# would silently pick up a span that belongs to a later movie. Indexing raises
# an IndexError for such a heading instead of producing misaligned data.
movie_sa = [
    heading.find_all('span', class_='tMeterScore')[1].string
    for heading in headings
]

# Directors

In [21]:
# Grab the "info director" block of every movie container
directors = [
    movie_div.find("div", attrs={'class': 'info director'})
    for movie_div in divs
]

In [22]:
# Keep the text of the first link in each director block
directors_list = [block.find(name='a').string for block in directors]

# Starring

In [23]:
# Grab the "info cast" block of every movie container, then show the first one
info_cast = [
    movie_div.find("div", attrs={"class": "info cast"})
    for movie_div in divs
]
info_cast[0]

<div class="info cast">
<span class="descriptor">Starring:</span> <a class="" href="//www.rottentomatoes.com/celebrity/gregory_hines">Gregory Hines</a>, <a class="" href="//www.rottentomatoes.com/celebrity/billy_crystal">Billy Crystal</a>, <a class="" href="//www.rottentomatoes.com/celebrity/jimmy_smits">Jimmy Smits</a>, <a class="" href="//www.rottentomatoes.com/celebrity/steven_bauer">Steven Bauer</a></div>

In [24]:
# Worked example on the first movie: every actor is a separate <a> tag
cast_links = info_cast[0].find_all(name='a')
cast_links

[<a class="" href="//www.rottentomatoes.com/celebrity/gregory_hines">Gregory Hines</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/billy_crystal">Billy Crystal</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/jimmy_smits">Jimmy Smits</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/steven_bauer">Steven Bauer</a>]

In [25]:
# Reduce each link to its text, i.e. the actor's name
cast_names = [anchor.string for anchor in cast_links]
cast_names

['Gregory Hines', 'Billy Crystal', 'Jimmy Smits', 'Steven Bauer']

In [26]:
# Collapse the names into one comma-separated string for this movie
cast = str.join(", ", cast_names)
cast

'Gregory Hines, Billy Crystal, Jimmy Smits, Steven Bauer'

# Using a for loop

In [27]:
# Repeat the single-movie steps for every "info cast" block,
# collecting one comma-separated string of actor names per movie
cast = []

for c in info_cast:
    cast_links = c.find_all(name='a')
    cast_names = [anchor.string for anchor in cast_links]
    cast.append(str.join(", ", cast_names))

cast

['Gregory Hines, Billy Crystal, Jimmy Smits, Steven Bauer',
 'Christian Bale, Emily Watson, Taye Diggs, Angus Macfadyen',
 'Jet Li, Tony Leung Chiu Wai, Maggie Cheung Man-yuk, Donnie Yen',
 'Patrick Swayze, Kelly Lynch, Sam Elliott, Ben Gazzara',
 'Denzel Washington, Chris Pine, Rosario Dawson, Kevin Dunn',
 'Richard Roundtree, Moses Gunn, Christopher St. John, Charles Cioffi',
 'Kim Ok-bin, Shin Ha-kyun, Sung-joon, Kim Seo-hyung',
 'Christopher Lambert, Sean Connery, Roxanne Hart, Clancy Brown',
 'Bruce Willis, Bonnie Bedelia, William Atherton, Reginald VelJohnson',
 'Nicolas Cage, Diane Kruger, Justin Bartha, Sean Bean',
 'Tony Jaa, Johnny Nguyen, Nathan Jones, Petchtai Wongkamlao',
 'Matilda Lutz, Kevin Janssens, Vincent Colombe, Guillaume Bouchède',
 'Carlos Gallardo, Consuelo Gómez, Reinol Martinez, Peter Marquardt',
 'Feng Hsu, Chun Shih, Pai Ying, Roy Chiao',
 'Tom Cruise, Kelly McGillis, Anthony Edwards, Val Kilmer',
 'Nicolas Cage, John Cusack, John Malkovich, Steve Buscemi',


# Nested list comprehension

In [28]:
# Same result as the loop version, written as a single nested comprehension
cast = [
    ", ".join(anchor.string for anchor in cast_div.find_all('a'))
    for cast_div in info_cast
]
cast

['Gregory Hines, Billy Crystal, Jimmy Smits, Steven Bauer',
 'Christian Bale, Emily Watson, Taye Diggs, Angus Macfadyen',
 'Jet Li, Tony Leung Chiu Wai, Maggie Cheung Man-yuk, Donnie Yen',
 'Patrick Swayze, Kelly Lynch, Sam Elliott, Ben Gazzara',
 'Denzel Washington, Chris Pine, Rosario Dawson, Kevin Dunn',
 'Richard Roundtree, Moses Gunn, Christopher St. John, Charles Cioffi',
 'Kim Ok-bin, Shin Ha-kyun, Sung-joon, Kim Seo-hyung',
 'Christopher Lambert, Sean Connery, Roxanne Hart, Clancy Brown',
 'Bruce Willis, Bonnie Bedelia, William Atherton, Reginald VelJohnson',
 'Nicolas Cage, Diane Kruger, Justin Bartha, Sean Bean',
 'Tony Jaa, Johnny Nguyen, Nathan Jones, Petchtai Wongkamlao',
 'Matilda Lutz, Kevin Janssens, Vincent Colombe, Guillaume Bouchède',
 'Carlos Gallardo, Consuelo Gómez, Reinol Martinez, Peter Marquardt',
 'Feng Hsu, Chun Shih, Pai Ying, Roy Chiao',
 'Tom Cruise, Kelly McGillis, Anthony Edwards, Val Kilmer',
 'Nicolas Cage, John Cusack, John Malkovich, Steve Buscemi',


# Representing the data in structured form

In [29]:
import pandas as pd

# Creating Data Frame

In [30]:
# Assemble one row per movie from the scraped lists;
# the dict's insertion order determines the column order
movies_info = pd.DataFrame({
    "Movie Title": movie_names,
    "Year": movie_years,
    "Score TOMATOMETER": movie_st,
    "Score AUDIENCE": movie_sa,
    "Directors": directors_list,
    "Starring": cast,
})

movies_info

Unnamed: 0,Movie Title,Year,Score TOMATOMETER,Score AUDIENCE,Directors,Starring
0,Running Scared,1986,59%,58%,Peter Hyams,"Gregory Hines, Billy Crystal, Jimmy Smits, Ste..."
1,Equilibrium,2002,40%,81%,Kurt Wimmer,"Christian Bale, Emily Watson, Taye Diggs, Angu..."
2,Hero,2002,94%,87%,Zhang Yimou,"Jet Li, Tony Leung Chiu Wai, Maggie Cheung Man..."
3,Road House,1989,39%,66%,Rowdy Herrington,"Patrick Swayze, Kelly Lynch, Sam Elliott, Ben ..."
4,Unstoppable,2010,87%,72%,Tony Scott,"Denzel Washington, Chris Pine, Rosario Dawson,..."
...,...,...,...,...,...,...
135,Hard-Boiled,1992,94%,92%,John Woo,"Chow Yun-Fat, Bowie Lam, Philip Chan, Tony Leu..."
136,The Matrix,1999,88%,85%,Andy Wachowski,"Keanu Reeves, Laurence Fishburne, Carrie-Anne ..."
137,Terminator 2: Judgment Day,1991,93%,95%,James Cameron,"Arnold Schwarzenegger, Linda Hamilton, Edward ..."
138,Die Hard,1988,94%,94%,John McTiernan,"Bruce Willis, Alan Rickman, Bonnie Bedelia, Re..."


In [31]:
# movies_info.to_csv('movies_dataset.csv',index=False)

In [32]:
# Load the scraped table back from disk so the analysis below works from the
# saved snapshot. The export cell above is commented out, so on a fresh
# checkout the CSV may not exist yet; in that case write it once from the
# DataFrame built above and read it back, which keeps Restart & Run All working.
try:
    all_data = pd.read_csv("movies_dataset.csv")
except FileNotFoundError:
    movies_info.to_csv("movies_dataset.csv", index=False)
    all_data = pd.read_csv("movies_dataset.csv")

all_data.head()

Unnamed: 0,Movie Title,Year,Score TOMATOMETER,Score AUDIENCE,Directors,Starring
0,Running Scared,1986,59%,58%,Peter Hyams,"Gregory Hines, Billy Crystal, Jimmy Smits, Ste..."
1,Equilibrium,2002,40%,81%,Kurt Wimmer,"Christian Bale, Emily Watson, Taye Diggs, Angu..."
2,Hero,2002,94%,87%,Zhang Yimou,"Jet Li, Tony Leung Chiu Wai, Maggie Cheung Man..."
3,Road House,1989,39%,66%,Rowdy Herrington,"Patrick Swayze, Kelly Lynch, Sam Elliott, Ben ..."
4,Unstoppable,2010,87%,72%,Tony Scott,"Denzel Washington, Chris Pine, Rosario Dawson,..."


In [33]:
# Count the missing values in each column
all_data.isna().sum()

Movie Title          0
Year                 0
Score TOMATOMETER    0
Score AUDIENCE       0
Directors            0
Starring             0
dtype: int64

## Which are the top 10 movies by Score TOMATOMETER?

In [34]:
# Convert "59%"-style strings to integers.
# Going through .astype(str) first keeps this cell re-runnable: on a second run
# the column already holds integers, and calling .strip() on them directly
# would raise AttributeError. The vectorized .str accessor also replaces the
# Python-level loop over the column.
all_data['Score TOMATOMETER'] = (
    all_data['Score TOMATOMETER']
    .astype(str)
    .str.strip('%')
    .astype('int')
)

In [35]:
# Display the converted column to confirm it is now numeric
all_data['Score TOMATOMETER']

0      59
1      40
2      94
3      39
4      87
       ..
135    94
136    88
137    93
138    94
139    97
Name: Score TOMATOMETER, Length: 140, dtype: int32

In [36]:
# Ten rows with the highest Tomatometer score
all_data.nlargest(n=10, columns='Score TOMATOMETER')

Unnamed: 0,Movie Title,Year,Score TOMATOMETER,Score AUDIENCE,Directors,Starring
111,Fist of Legend,1994,100,92%,Gordon Chan,"Jet Li, Chin Siu Ho, Billy Chow, Yasuaki Kurata"
118,The Terminator,1984,100,89%,James Cameron,"Arnold Schwarzenegger, Linda Hamilton, Michael..."
78,Goldfinger,1964,99,89%,Guy Hamilton,"Sean Connery, Gert Frobe, Honor Blackman, Shir..."
36,Bullitt,1968,98,85%,Peter Yates,"Steve McQueen, Robert Vaughn, Jacqueline Bisse..."
112,The Killer,1989,98,92%,John Woo,"Chow Yun-Fat, Danny Lee, Sally Yeh, Kenneth Tsang"
134,Aliens,1986,98,94%,James Cameron,"Sigourney Weaver, Carrie Henn, Michael Biehn, ..."
13,A Touch of Zen,1971,97,83%,King Hu,"Feng Hsu, Chun Shih, Pai Ying, Roy Chiao"
125,"Crouching Tiger, Hidden Dragon",2000,97,86%,Ang Lee,"Chow Yun-Fat, Michelle Yeoh, Zhang Ziyi, Chen ..."
132,Mission: Impossible -- Fallout,2018,97,88%,Christopher McQuarrie,"Tom Cruise, Henry Cavill, Ving Rhames, Simon Pegg"
139,Mad Max: Fury Road,2015,97,86%,George Miller,"Tom Hardy, Charlize Theron, Nicholas Hoult, Hu..."


## Which are the top 10 movies by Score AUDIENCE?

In [37]:
# Convert "58%"-style strings to integers.
# Going through .astype(str) first keeps this cell re-runnable: on a second run
# the column already holds integers, and calling .strip() on them directly
# would raise AttributeError. The vectorized .str accessor also replaces the
# Python-level loop over the column.
all_data['Score AUDIENCE'] = (
    all_data['Score AUDIENCE']
    .astype(str)
    .str.strip('%')
    .astype('int')
)

In [38]:
# Ten rows with the highest audience score
all_data.nlargest(n=10, columns='Score AUDIENCE')

Unnamed: 0,Movie Title,Year,Score TOMATOMETER,Score AUDIENCE,Directors,Starring
133,Raiders of the Lost Ark,1981,96,96,Steven Spielberg,"Harrison Ford, Karen Allen, Paul Freeman, Wolf..."
121,The Professional,1994,74,95,Luc Besson,"Jean Reno, Gary Oldman, Natalie Portman, Danny..."
137,Terminator 2: Judgment Day,1991,93,95,James Cameron,"Arnold Schwarzenegger, Linda Hamilton, Edward ..."
72,Indiana Jones and the Last Crusade,1989,88,94,Steven Spielberg,"Harrison Ford, Sean Connery, Denholm Elliott, ..."
83,Oldboy,2003,82,94,Park Chan-wook,"Choi Min-sik, Yoo Ji-tae, Kang Hye-jung, Ji Da..."
102,The Dark Knight,2008,94,94,Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M..."
130,Heat,1995,88,94,Michael Mann,"Al Pacino, Robert De Niro, Val Kilmer, Jon Voight"
134,Aliens,1986,98,94,James Cameron,"Sigourney Weaver, Carrie Henn, Michael Biehn, ..."
138,Die Hard,1988,94,94,John McTiernan,"Bruce Willis, Alan Rickman, Bonnie Bedelia, Re..."
98,The Bourne Identity,2002,84,93,Doug Liman,"Matt Damon, Franka Potente, Chris Cooper, Cliv..."


## Which movie has the lowest Score TOMATOMETER?

In [39]:
# Single row with the lowest Tomatometer score
all_data.nsmallest(n=1, columns='Score TOMATOMETER')

Unnamed: 0,Movie Title,Year,Score TOMATOMETER,Score AUDIENCE,Directors,Starring
3,Road House,1989,39,66,Rowdy Herrington,"Patrick Swayze, Kelly Lynch, Sam Elliott, Ben ..."


## Which movie has the lowest Score AUDIENCE?

In [40]:
# Single row with the lowest audience score
all_data.nsmallest(n=1, columns='Score AUDIENCE')

Unnamed: 0,Movie Title,Year,Score TOMATOMETER,Score AUDIENCE,Directors,Starring
28,Code of Silence,1985,70,51,Andrew Davis,"Chuck Norris, Henry Silva, Bert Remsen, Mike G..."


## In which year were the most movies on the list released?

In [41]:
# Number of listed movies per release year
all_data['Year'].value_counts()

2017    7
2014    7
1986    6
1995    5
2010    5
1994    5
2004    5
1992    5
1997    5
2008    5
2003    4
2015    4
1993    4
2002    4
1985    4
1987    4
2011    4
1989    4
1971    4
1988    3
2012    3
2018    3
2000    3
1981    3
1990    3
1996    3
2006    3
2007    3
1998    3
1999    2
2005    2
1991    2
1984    2
1964    1
1973    1
2019    1
1972    1
1976    1
1968    1
1978    1
2001    1
2016    1
1979    1
1982    1
Name: Year, dtype: int64

In [42]:
# Most frequent release year(s); mode() returns every value tied for first place
all_data['Year'].mode()

0    2014
1    2017
dtype: int64

In [43]:
# Number of listed movies per director
all_data['Directors'].value_counts()

John Carpenter        4
John Woo              4
James Cameron         4
Doug Liman            3
Steven Spielberg      3
                     ..
Walter Hill           1
Joss Whedon           1
John Frankenheimer    1
Timur Bekmambetov     1
Andy Wachowski        1
Name: Directors, Length: 100, dtype: int64

## What is the average Score TOMATOMETER?

In [44]:
# Summary statistics for the Tomatometer score; the `mean` row is the average
all_data['Score TOMATOMETER'].describe()

count    140.000000
mean      81.307143
std       14.279278
min       39.000000
25%       70.750000
50%       86.000000
75%       93.000000
max      100.000000
Name: Score TOMATOMETER, dtype: float64

## What is the average Score AUDIENCE?

In [45]:
# Summary statistics for the audience score; the `mean` row is the average
all_data['Score AUDIENCE'].describe()

count    140.000000
mean      80.942857
std        9.966687
min       51.000000
25%       76.000000
50%       83.000000
75%       88.250000
max       96.000000
Name: Score AUDIENCE, dtype: float64