# Set-up

In [None]:
# load packages
import requests
from bs4 import BeautifulSoup

In [None]:
# Define the URL of the site
# (the Rotten Tomatoes editorial guide page that every later cell scrapes)
base_site = "https://editorial.rottentomatoes.com/guide/best-netflix-shows-and-movies-to-binge-watch-now/"

In [None]:
# Send a GET request to the webpage.
# requests waits indefinitely by default, so an explicit timeout (in seconds)
# keeps the notebook from hanging if the server never answers.
response = requests.get(base_site, timeout=30)

# Display the HTTP status code of the response (200 means success)
response.status_code

In [None]:
# get the HTML from the webpage
# (response.content is the raw response body as bytes; the parser decodes it later)
html = response.content

## Choosing a parser

### lxml

In [None]:
# Parse the downloaded HTML into a BeautifulSoup object, using lxml as the parser
soup = BeautifulSoup(markup=html, features="lxml")

In [None]:
# Save a pretty-printed copy of the parsed HTML to disk.
# prettify() with an encoding returns bytes, hence the binary write mode.
with open('Rotten_tomatoes_Netflix_LXML_Parser.html', mode='wb') as html_file:
    pretty_bytes = soup.prettify(encoding='utf-8')
    html_file.write(pretty_bytes)

## Finding an element containing all the data

In [None]:
# Collect every div whose class attribute is exactly the string below;
# these divs contain the information we want to scrape
divs = soup.find_all("div", class_="col-sm-18 col-full-xs countdown-item-content")
divs

# Extracting the title, year and score of each movie

In [None]:
# The title, year and score of each movie are contained in the 'h2' tags

In [None]:
# For instance, let's explore the first div:
# find() returns the first 'h2' tag nested inside it
divs[0].find("h2")

In [None]:
# Take the first 'h2' tag of every scraped div (one heading per div)
headings = [entry.find(name="h2") for entry in divs]
headings

In [None]:
# Look at the full text of every heading (all nested strings concatenated)
[h2.get_text() for h2 in headings]

In [None]:
# Let's inspect one heading to see if there is a way to distinguish the info
# (title, year and score all live inside this single 'h2' tag)
headings[0]

## Title

In [None]:
# Show the first link ('a' tag) found inside each heading
[h2.find(name='a') for h2 in headings]

In [None]:
# Obtain the movie titles from the heading links.
# get_text() is used instead of .string so the titles are plain Python strings:
# .string yields NavigableString objects, which keep a reference to the whole
# parse tree and would be stored as such in the dataframe later on.
movie_names = [heading.find('a').get_text() for heading in headings]
movie_names

## Year

In [None]:
# Show, for each heading, the span that carries the 'start-year' class
[h2.find("span", attrs={"class": "start-year"}) for h2 in headings]

In [None]:
# Pull the string out of each 'start-year' span
years = [h2.find("span", attrs={"class": "start-year"}).string for h2 in headings]
years

In [None]:
# Inspect the first extracted year string before cleaning it
years[0]

### Removing the brackets

In [None]:
# Update years with the surrounding brackets stripped.
# .string can be None when a span does not hold exactly one piece of text,
# so None is passed through untouched (the same way the scores are handled)
# instead of raising an AttributeError on .strip().
years = [None if year is None else year.strip('()') for year in years]
years

In [None]:
# Convert every year string to an integer.
# A missing year (None) is kept as None, mirroring the score conversion,
# because int(None) would raise a TypeError.
years = [None if year is None else int(year) for year in years]
years

## Score

In [None]:
# Show, for each heading, the span that carries the 'tMeterScore' class
[h2.find("span", attrs={"class": "tMeterScore"}) for h2 in headings]

In [None]:
# Pull the string out of each 'tMeterScore' span
scores = [h2.find("span", attrs={"class": "tMeterScore"}).string for h2 in headings]
scores

In [None]:
# Drop the '%' sign from every score; entries that are None stay None
scores = [score.strip('%') if score is not None else None for score in scores]
scores

In [None]:
# Turn every score string into an integer; entries that are None stay None
scores = [int(score) if score is not None else None for score in scores]
scores

# Extracting the rest of the information

In [None]:
# Data left to scrape:
# - Synopsis (inside a div with class 'synopsis')  --> homework
# - Cast (inside a div with class 'cast')

# All of the above are inside the original divs we scraped

## Cast info

In [None]:
# For every scraped div, grab the first nested div carrying the 'cast' class
cast_info = [entry.find("div", attrs={"class": "cast"}) for entry in divs]
cast_info

In [None]:
# Inspect the cast div of the first entry
cast_info[0]

In [None]:
# Each cast member's name is the string of a link
# There are multiple cast members for a movie

In [None]:
# Start with a single movie: the first cast div

# Gather every link ('a' tag) inside it - one link per cast member
cast_links = cast_info[0].find_all(name='a')
cast_links

In [None]:
# The name of each cast member is the string of its link
cast_names = [anchor.string for anchor in cast_links]
cast_names

In [None]:
# Stitch all names together as one string

# str.join() concatenates the items of a list, placing the string it is
# called on between them; here the separator is a comma followed by a space

cast = ", ".join(cast_names)
cast

In [None]:
# Now repeat the above operations for every movie

# We can either use a for loop (clearer), or
# use a nested list comprehension (more concise)

### Using a for loop

In [None]:
# Start with an empty list that will hold one cast string per movie
cast = []

# Repeat the single-movie steps for every cast div
for c in cast_info:
    cast_links = c.find_all(name='a')
    cast_names = [anchor.string for anchor in cast_links]

    # Joining is optional: it turns the list of names into one string
    movie_cast = ", ".join(cast_names)
    cast.append(movie_cast)

cast

### Nested list comprehension

In [None]:
# The same result in a single statement with a nested list comprehension:
# the inner list holds the names of one movie, the outer one walks all movies.
# It is more compact but harder to read than the for loop version.

cast = [
    ", ".join([anchor.string for anchor in cast_div.find_all("a")])
    for cast_div in cast_info
]
cast

## Synopsis

In [None]:
# Each synopsis sits in a 'div' tag whose classes are 'info synopsis';
# searching for the single class 'synopsis' is enough to match it
synopsis = [entry.find('div', attrs={'class': 'synopsis'}) for entry in divs]
synopsis

In [None]:
# Inspecting the element (the synopsis div of the first entry)
synopsis[0]

In [None]:
# The text is the second child
# (.contents is the list of the tag's direct children, so index 1 is the second one)
synopsis[0].contents[1]

In [None]:
# Extract the synopsis text: the second child of every synopsis div.
# str() turns the BeautifulSoup string object into a plain Python string;
# otherwise each value keeps a reference to the whole parse tree and is
# stored in the dataframe as a BeautifulSoup object.
synopsis_text = [str(syn.contents[1]) for syn in synopsis]
synopsis_text

# Representing the data in structured form

In [None]:
# We will take advantage of pandas and its dataframe for data storage

In [None]:
# load the pandas package
import pandas as pd

## Creating a Data Frame

In [None]:
# Create an empty dataframe; its columns are added in the next step
movies_info = pd.DataFrame()
movies_info  # Still empty, need to fill it with the info we gathered

## Populating the dataframe

In [None]:
# Fill the dataframe, one column per list gathered above.
# The dictionary order is the order of the columns in the dataframe.
column_data = {
    "Movie Title": movie_names,
    "Year": years,
    "Score": scores,
    "Synopsis": synopsis_text,
    "Cast": cast,
}

for column_name, column_values in column_data.items():
    movies_info[column_name] = column_values

# Display the result
movies_info

In [None]:
# Setting the maximum column width to None removes the limit, so pandas
# displays the full text of every cell instead of abbreviating it
pd.options.display.max_colwidth = None
movies_info

## Exporting the data to CSV (comma-separated values) file

In [None]:
# Save the dataframe as a CSV file: column names are written as the first
# row, the row index is left out
movies_info.to_csv(
    path_or_buf="movies_info.csv",
    index=False,
    header=True,
)

In [None]:
# Index is set to False so that the index (0,1,2...) of each movie is not saved to the file (the index is purely internal)
# The header is set to True, so that the names of the columns are saved