# Set-up

In [98]:
# load packages
import requests
from bs4 import BeautifulSoup

In [99]:
# Define the URL of the site
base_site = "https://editorial.rottentomatoes.com/guide/best-movies-2022/"

In [100]:
# sending a request to the webpage
response = requests.get(base_site)
response.status_code

200

In [101]:
# get the HTML from the webpage
html = response.content

## Choosing a parser

### html.parser

In [102]:
# convert the HTML to a Beautiful Soup object
soup = BeautifulSoup(html, 'html.parser')

In [103]:
# Exporting the HTML to a file
with open('Rotten_tomatoes_page_2_HTML_Parser.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))

In [104]:
# When inspecting the file we see that HTML element is closed at the begining -- it parsed incorrectly!
# Let's check another parser

### lxml

In [105]:
# convert the HTML to a BeatifulSoup object
soup = BeautifulSoup(html, 'lxml')

In [106]:
# Exporting the HTML to a file
with open('Rotten_tomatoes_page_2_LXML_Parser.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))

In [107]:
# By first accounts of inspecting the file everything seems fine

### A word of caution

In [108]:
# Beautiful Soup ranks the lxml parser as the best one.

# If a parser is not explicitly stated in the Beautiful Soup constructor,
# the best one available on the current machine is chosen.

# This means that the same piece of code can give different results on different computers.

### Finding an element containing all the Movie related data

In [109]:
# Find all div tags on the webpage containing the information we want to scrape
divs = soup.find_all('div', {"class" : "col-sm-18 col-full-xs countdown-item-content"})
divs

[<div class="col-sm-18 col-full-xs countdown-item-content">
 <div class="row countdown-item-title-bar">
 <div class="col-sm-20 col-full-xs" style="height: 100%;">
 <div class="article_movie_title" style="float: left;">
 <div><h2><a href="https://www.rottentomatoes.com/m/sundown_2022">Sundown</a> <span class="subtle start-year">(2021)</span> <span class="icon tiny certified" title="Certified Fresh"></span> <span class="tMeterScore">74%</span></h2></div>
 </div>
 </div>
 <div class="col-sm-4 col-full-xs" style="height: 100%;">
 <div class="countdown-index">#29</div>
 </div>
 </div>
 <div class="row countdown-item-details">
 <div class="col-sm-24">
 <div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>78219% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placement="top" dat

#### Extracting the title, year and score of each movie

In [110]:
divs[0].find('h2')

<h2><a href="https://www.rottentomatoes.com/m/sundown_2022">Sundown</a> <span class="subtle start-year">(2021)</span> <span class="icon tiny certified" title="Certified Fresh"></span> <span class="tMeterScore">74%</span></h2>

In [111]:
# Extracting all Headings

headings = [dv.find('h2') for dv in divs]
headings

[<h2><a href="https://www.rottentomatoes.com/m/sundown_2022">Sundown</a> <span class="subtle start-year">(2021)</span> <span class="icon tiny certified" title="Certified Fresh"></span> <span class="tMeterScore">74%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/dog_2022">Dog</a> <span class="subtle start-year">(2022)</span> <span class="icon tiny certified" title="Certified Fresh"></span> <span class="tMeterScore">76%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/scream_2022">Scream</a> <span class="subtle start-year">(2022)</span> <span class="icon tiny certified" title="Certified Fresh"></span> <span class="tMeterScore">76%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/minamata">Minamata</a> <span class="subtle start-year">(2020)</span> <span class="icon tiny certified" title="Certified Fresh"></span> <span class="tMeterScore">78%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/see_for_me">See for Me</a> <span class="subtle start-y

In [112]:
[heading.text for heading in headings]

['Sundown (2021)  74%',
 'Dog (2022)  76%',
 'Scream (2022)  76%',
 'Minamata (2020)  78%',
 'See for Me (2021)  79%',
 'Fresh (2022)  81%',
 'Here Before (2021)  86%',
 'Cyrano (2021)  85%',
 'The Batman (2022)  85%',
 'I Want You Back (2022)  86%',
 'After Yang (2021)  88%',
 'What Do We See When We Look at the Sky? (2021)  88%',
 'Strawberry Mansion (2021)  88%',
 'Cow (2021)  91%',
 'Kimi (2022)  92%',
 'Compartment No. 6 (2021)  92%',
 'Catch the Fair One (2021)  93%',
 'The Pink Cloud (2021)  93%',
 'Lucy and Desi (2022)  94%',
 'The Duke (2020)  94%',
 'Ali & Ava (2021)  94%',
 'The Fallout (2021)  94%',
 'Belle (2021)  94%',
 'Lingui, The Sacred Bonds (2021)  96%',
 'Come From Away (2021)  98%',
 'Who We Are: A Chronicle of Racism in America (2021)  98%',
 'Hellbender (2021)  98%',
 'Poly Styrene: I Am a Cliché (2021)  98%',
 'Writing with Fire (2021)  100%']

In [113]:
headings[0]

<h2><a href="https://www.rottentomatoes.com/m/sundown_2022">Sundown</a> <span class="subtle start-year">(2021)</span> <span class="icon tiny certified" title="Certified Fresh"></span> <span class="tMeterScore">74%</span></h2>

In [114]:
# Extracting all Movie Title
[hd.find('a') for hd in headings]

[<a href="https://www.rottentomatoes.com/m/sundown_2022">Sundown</a>,
 <a href="https://www.rottentomatoes.com/m/dog_2022">Dog</a>,
 <a href="https://www.rottentomatoes.com/m/scream_2022">Scream</a>,
 <a href="https://www.rottentomatoes.com/m/minamata">Minamata</a>,
 <a href="https://www.rottentomatoes.com/m/see_for_me">See for Me</a>,
 <a href="https://www.rottentomatoes.com/m/fresh_2022">Fresh</a>,
 <a href="https://www.rottentomatoes.com/m/here_before">Here Before</a>,
 <a href="https://www.rottentomatoes.com/m/cyrano">Cyrano</a>,
 <a href="https://www.rottentomatoes.com/m/the_batman">The Batman</a>,
 <a href="https://www.rottentomatoes.com/m/i_want_you_back">I Want You Back</a>,
 <a href="https://www.rottentomatoes.com/m/after_yang">After Yang</a>,
 <a href="https://www.rottentomatoes.com/m/what_do_we_see_when_we_look_at_the_sky">What Do We See When We Look at the Sky?</a>,
 <a href="https://www.rottentomatoes.com/m/strawberry_mansion">Strawberry Mansion</a>,
 <a href="https://www.

In [115]:
movie_names = [hd.find('a').string for hd in headings]
movie_names

['Sundown',
 'Dog',
 'Scream',
 'Minamata',
 'See for Me',
 'Fresh',
 'Here Before',
 'Cyrano',
 'The Batman',
 'I Want You Back',
 'After Yang',
 'What Do We See When We Look at the Sky?',
 'Strawberry Mansion',
 'Cow',
 'Kimi',
 'Compartment No. 6',
 'Catch the Fair One',
 'The Pink Cloud',
 'Lucy and Desi',
 'The Duke',
 'Ali & Ava',
 'The Fallout',
 'Belle',
 'Lingui, The Sacred Bonds',
 'Come From Away',
 'Who We Are: A Chronicle of Racism in America',
 'Hellbender',
 'Poly Styrene: I Am a Cliché',
 'Writing with Fire']

In [116]:
# Year

movie_years = [hd.find('span', {"class" : "subtle start-year"}).string for hd in headings]
movie_years

['(2021)',
 '(2022)',
 '(2022)',
 '(2020)',
 '(2021)',
 '(2022)',
 '(2021)',
 '(2021)',
 '(2022)',
 '(2022)',
 '(2021)',
 '(2021)',
 '(2021)',
 '(2021)',
 '(2022)',
 '(2021)',
 '(2021)',
 '(2021)',
 '(2022)',
 '(2020)',
 '(2021)',
 '(2021)',
 '(2021)',
 '(2021)',
 '(2021)',
 '(2021)',
 '(2021)',
 '(2021)',
 '(2021)']

In [117]:
# Updating years with stripped brackets
years = [year.strip('()') for year in movie_years]
years

['2021',
 '2022',
 '2022',
 '2020',
 '2021',
 '2022',
 '2021',
 '2021',
 '2022',
 '2022',
 '2021',
 '2021',
 '2021',
 '2021',
 '2022',
 '2021',
 '2021',
 '2021',
 '2022',
 '2020',
 '2021',
 '2021',
 '2021',
 '2021',
 '2021',
 '2021',
 '2021',
 '2021',
 '2021']

In [118]:
# Converting all the strings to integers
years = [int(year) for year in years]
years

[2021,
 2022,
 2022,
 2020,
 2021,
 2022,
 2021,
 2021,
 2022,
 2022,
 2021,
 2021,
 2021,
 2021,
 2022,
 2021,
 2021,
 2021,
 2022,
 2020,
 2021,
 2021,
 2021,
 2021,
 2021,
 2021,
 2021,
 2021,
 2021]

In [119]:
[hd.find('span', {"class" : "tMeterScore"}) for hd in headings]

[<span class="tMeterScore">74%</span>,
 <span class="tMeterScore">76%</span>,
 <span class="tMeterScore">76%</span>,
 <span class="tMeterScore">78%</span>,
 <span class="tMeterScore">79%</span>,
 <span class="tMeterScore">81%</span>,
 <span class="tMeterScore">86%</span>,
 <span class="tMeterScore">85%</span>,
 <span class="tMeterScore">85%</span>,
 <span class="tMeterScore">86%</span>,
 <span class="tMeterScore">88%</span>,
 <span class="tMeterScore">88%</span>,
 <span class="tMeterScore">88%</span>,
 <span class="tMeterScore">91%</span>,
 <span class="tMeterScore">92%</span>,
 <span class="tMeterScore">92%</span>,
 <span class="tMeterScore">93%</span>,
 <span class="tMeterScore">93%</span>,
 <span class="tMeterScore">94%</span>,
 <span class="tMeterScore">94%</span>,
 <span class="tMeterScore">94%</span>,
 <span class="tMeterScore">94%</span>,
 <span class="tMeterScore">94%</span>,
 <span class="tMeterScore">96%</span>,
 <span class="tMeterScore">98%</span>,
 <span class="tMeterScore

In [120]:
# Score

movie_score = [hd.find('span', {"class" : "tMeterScore"}).string for hd in headings]
movie_score

['74%',
 '76%',
 '76%',
 '78%',
 '79%',
 '81%',
 '86%',
 '85%',
 '85%',
 '86%',
 '88%',
 '88%',
 '88%',
 '91%',
 '92%',
 '92%',
 '93%',
 '93%',
 '94%',
 '94%',
 '94%',
 '94%',
 '94%',
 '96%',
 '98%',
 '98%',
 '98%',
 '98%',
 '100%']

In [121]:
# Updating score with stripped %
movie_scores = [score.strip('%') for score in movie_score]
movie_scores

['74',
 '76',
 '76',
 '78',
 '79',
 '81',
 '86',
 '85',
 '85',
 '86',
 '88',
 '88',
 '88',
 '91',
 '92',
 '92',
 '93',
 '93',
 '94',
 '94',
 '94',
 '94',
 '94',
 '96',
 '98',
 '98',
 '98',
 '98',
 '100']

In [122]:
# Converting all the strings to integers
movie_scores = [int(ms) for ms in movie_scores]
movie_scores

[74,
 76,
 76,
 78,
 79,
 81,
 86,
 85,
 85,
 86,
 88,
 88,
 88,
 91,
 92,
 92,
 93,
 93,
 94,
 94,
 94,
 94,
 94,
 96,
 98,
 98,
 98,
 98,
 100]

#### Presenting Movie Data in Structured Format

In [123]:
import pandas as pd
df_movies = pd.DataFrame()

In [124]:
# Populating the dataframe

df_movies["Movie Title"] = movie_names
df_movies["Year"] = years
df_movies["Score"] = movie_scores


# Let's see how it looks
df_movies

Unnamed: 0,Movie Title,Year,Score
0,Sundown,2021,74
1,Dog,2022,76
2,Scream,2022,76
3,Minamata,2020,78
4,See for Me,2021,79
5,Fresh,2022,81
6,Here Before,2021,86
7,Cyrano,2021,85
8,The Batman,2022,85
9,I Want You Back,2022,86


#### Exporting Data to File

In [125]:
# Write data to excel file
df_movies.to_excel("movies_info_excel.xlsx", index = False, header = True)

In [126]:
# write data to CSV file
df_movies.to_csv("movies_info_csv.csv", index = False, header = True)