# 🌟 Exercise 3 : Scrape Dynamic Content from Rotten Tomatoes
Task:

-    Use Selenium to navigate to the Rotten Tomatoes Certified Fresh Movies page.
-    Extract the HTML content after it’s fully loaded.
-    Use BeautifulSoup to parse and extract the movie titles, scores, and release dates.

Instructions

-    Set up Selenium WebDriver and navigate to the Rotten Tomatoes page.
-    Extract the HTML content using driver.page_source.
-    Parse the HTML with BeautifulSoup.
-    Find and extract the desired movie information.
-    Print the extracted data.


In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta

In [2]:
# setup seleium webrdriver
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")  # Open browser in maximized mode
driver = webdriver.Chrome(options=options)

# navigate to rotten tomatoes webp
url = "https://www.rottentomatoes.com/browse/movies_at_home/critics:certified_fresh"
driver.get(url)

# wait for the content to load
wait = WebDriverWait(driver, 30)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".flex-container")))

# connection confirmation
print("accessed rotten tomatoes and content loaded")


# scroll to load more content
last_height = driver.execute_script("return document.body.scrollHeight")
scroll_count = 0
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # # need time to load
    # time.sleep(4)

    # wait for all elements to load
    try:
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".flex-container")))
    except TimeoutException:
        print("No new content loaded.")
        break

    # if the height didnt change break out
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
    scroll_count += 1

# scroll confirmation
print(f"Scrolling completed. Scrolled {scroll_count} times.")



# extract the html content after scrolling
html_content = driver.page_source

# use beautifulsoup to parse the html
soup = BeautifulSoup(html_content, 'html.parser')

# find all movie containers
movie_containers = soup.select(".flex-container")

# extract movie information
for container in movie_containers:
    # extract title
    title = container.select_one(".p--small").get_text(strip=True)
    
    # extract release date
    release_date = container.select_one(".smaller").get_text(strip=True)
    
    # extract score
    score_element = container.select_one("score-pairs-deprecated")
    
    # Initialize default values for scores
    critics_score = "N/A"
    public_score = "N/A"

    # Extract score element directly from the container
    score_element = container.select_one("score-pairs-deprecated")
    
    if score_element:
        # Extract critics score from rt-text with slot="criticsScore"
        critics_score_element = score_element.select_one('rt-text[slot="criticsScore"]') # slot!!!!!!!!!!!
        if critics_score_element:
            critics_score = critics_score_element.get_text(strip=True)

        # Extract audience score from rt-text with slot="audienceScore"
        audience_score_element = score_element.select_one('rt-text[slot="audienceScore"]')
        if audience_score_element:
            public_score = audience_score_element.get_text(strip=True)
        

    
    print(f"Title: {title}")
    print(f"Release Date: {release_date}")
    # print(score_element)
    print(f"Critics score: {critics_score}, public score {public_score}")
    print("---")


# close the browser
driver.quit()
print("Browser closed successfully.")


accessed rotten tomatoes and content loaded
Scrolling completed. Scrolled 0 times.
Title: Chasing Chasing Amy
Release Date: Streaming Dec 17, 2024
Critics score: 94%, public score N/A
---
Title: Carry-On
Release Date: Streaming Dec 13, 2024
Critics score: 87%, public score 59%
---
Title: Heretic
Release Date: Streaming Dec 10, 2024
Critics score: 91%, public score 76%
---
Title: Juror #2
Release Date: Streaming Dec 3, 2024
Critics score: 94%, public score 91%
---
Title: Conclave
Release Date: Streaming Nov 26, 2024
Critics score: 93%, public score 86%
---
Title: Maria
Release Date: Streaming Dec 11, 2024
Critics score: 74%, public score 70%
---
Title: The Substance
Release Date: Streaming Oct 31, 2024
Critics score: 90%, public score 75%
---
Title: 28 Days Later
Release Date: Streaming May 10, 2014
Critics score: 87%, public score 85%
---
Title: Joy
Release Date: Streaming Nov 22, 2024
Critics score: 90%, public score 89%
---
Title: Emilia Pérez
Release Date: Streaming Nov 13, 2024
Cri

---

# 🌟 Exercise 4 : Scrape and Categorize News Articles from a JavaScript-Enabled News Site
Task:

-    Visit this [website](https://www.bbc.com/innovation/technology).
-    Scrape news article titles and their publication dates.
-    Categorize articles based on their publication month.

Instructions:

-    Use Selenium to navigate to a specific news section on the website.
-    Extract and parse the HTML content that is dynamically loaded via JavaScript.
-    Using BeautifulSoup, extract news article titles and publication dates.
-    Categorize articles by their publication month (e.g., ‘January’, ‘February’, etc.).
-    Print the categorized lists of articles.


In [3]:
# setup Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")  # Open browser in maximized mode
driver = webdriver.Chrome(options=options)

# navigate to BBC Technology page
url = "https://www.bbc.com/innovation/technology"
driver.get(url)

# wait for the "main-content" element to load
wait = WebDriverWait(driver, 30)

try:
    # wait for the element with ID "main-content" to be present
    wait.until(EC.presence_of_element_located((By.ID, "main-content")))
    print("Successfully accessed 'main-content'.")

    # get the html content of "main-content"
    main_content_html = driver.find_element(By.ID, "main-content").get_attribute('innerHTML')
    print("Retrieved inner HTML of 'main-content'.")

    # parse html content with beautifulsoup
    soup = BeautifulSoup(main_content_html, 'html.parser')
    print("Parsed HTML content with BeautifulSoup.")

    # find ALL articles containerss within the main content area
    articles = soup.find_all('div', attrs={"data-testid": "card-text-wrapper"})

    # make lists to hold titles and last updated dates
    titles = []
    last_updated_dates = []

    
    # I have not failed. I've just found 10,000 ways that won't work
    if articles:
        print(f"retrieved {len(articles)} articles:")
        for article in articles:
            # extract headline
            headline = article.find('h2', attrs={"data-testid": "card-headline"})
            title = headline.get_text(strip=True) if headline else "No Title"
            titles.append(title)

            # extract last updated date
            last_updated = article.find('span', attrs={"data-testid": "card-metadata-lastupdated"})
            last_updated_text = last_updated.get_text(strip=True) if last_updated else "article of now"
            last_updated_dates.append(last_updated_text)

        # make a df with the extracted data
        df_articles = pd.DataFrame({"title": titles, "last updated": last_updated_dates})
        print("data extracted into dataframe: `df_articles`")
        # print(df_articles)  # print results
    else:
        print("NOPE, no articles found.")

except Exception as e:
    print(f"an error occurred: {e}")


# shut the driver
driver.quit()

Successfully accessed 'main-content'.
Retrieved inner HTML of 'main-content'.
Parsed HTML content with BeautifulSoup.
retrieved 41 articles:
data extracted into dataframe: `df_articles`


In [4]:
# now we need to categorize by date
df_articles['date'] = None
current_time = datetime.now()

# Iterate over the 'last updated' column
for index, last_updated in enumerate(df_articles['last updated']):
    # take the last four characters
    year_str = last_updated.strip()[-4:]

    try:
        # try to convert the last four characters to an integer
        year = int(year_str)

        # if successful, check if it's greater than 2000
        if year > 2000:
            # parse the full date string in the format "DD MMM YYYY"
            full_date = datetime.strptime(last_updated.strip(), "%d %b %Y")
            df_articles.at[index, 'date'] = full_date  # Assign to 'date' column
        else:
            df_articles.at[index, 'date'] = "delta"  # Assign "delta" if year is not greater than 2000

    except ValueError:
        if last_updated == 'article of now':  # Check against the variable
            df_articles.at[index, 'date'] = current_time  # Use assignment operator
        else:
            df_articles.at[index, 'date'] = 'delta'  # Use assignment operator

print('I have to make a delta time function then group by month')
print('the delta time funciton comes after a .strip(), to check if its hours or days')
print('this exercise is teking way too long and gonna leave like that for the moment')
display(df_articles)

I have to make a delta time function then group by month
the delta time funciton comes after a .strip(), to check if its hours or days
this exercise is teking way too long and gonna leave like that for the moment


Unnamed: 0,title,last updated,date
0,Why Final Fantasy director almost rejected his...,article of now,2024-12-16 23:46:33.839800
1,Telegram pushes extremist groups to users - study,5 hrs ago,delta
2,Social media given 'last chance' to tackle ill...,10 hrs ago,delta
3,Bitcoin hits new record high of more than $106...,11 hrs ago,delta
4,Hospitals to use AI to cut patient visits,16 hrs ago,delta
5,Why we become better friends as we age,1 day ago,delta
6,Old hospital gowns made into octopus IV drip h...,2 days ago,delta
7,What we've learned since the biggest oil spill...,article of now,2024-12-16 23:46:33.839800
8,What we know about mysterious drones over New ...,1 day ago,delta
9,Why 'digital twins' could speed up drug discovery,4 days ago,delta


# 🌟 Exercise 5 : Scrape and Analyze Weather Data from a JavaScript-Enabled Weather Website
Task:

-    Visit this [website](https://www.accuweather.com/en/us/los-angeles-ca/90012/weather-forecast/348108).
-    Scrape weather forecast data including temperature, condition, and humidity.
-    Analyze the data to find the average temperature and most common weather condition.

Instructions:

-    Use Selenium to navigate to the weather forecast page of a specific city.
-    Extract and parse the HTML content, focusing on dynamically loaded weather data.
-    Using BeautifulSoup, extract relevant weather information like temperature, condition (sunny, cloudy, etc.), and humidity.
-    Calculate the average temperature and identify the most common weather condition.
-    Print the analysis results.


In [5]:
# setup Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")  # Open browser in maximized mode
driver = webdriver.Chrome(options=options)

# navigate to BBC Technology page
url = "https://www.accuweather.com/en/us/attica/30607/weather-forecast/2139413"
driver.get(url)

# wait for the specific weather card element to load
wait = WebDriverWait(driver, 30)

# try:
#     # Wait for the element with the specified class to be present
#     wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.template-root div.two-column-page-content div.page-column-1 div.page-content.content-module a.cur-con-weather-card.is-desktop.lbar-panel.content-module")))
#     print("Successfully accessed the weather card body.")

#     # Get the HTML content of the specified element
#     weather_card_body_html = driver.find_element(By.CSS_SELECTOR, "div.template-root div.two-column-page-content div.page-column-1 div.page-content.content-module a.cur-con-weather-card.is-desktop.lbar-panel.content-module").get_attribute('outerHTML')
#     print("Retrieved outer HTML of the weather card body.")

#     # Parse HTML content with BeautifulSoup
#     soup = BeautifulSoup(weather_card_body_html, 'html.parser')
#     print("Parsed HTML content with BeautifulSoup.")

# except Exception as e:
#     print(f"An error occurred: {e}")


try:
    # get current weather
    # wait for the current weather card element to be present
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.template-root div.two-column-page-content div.page-column-1 div.page-content.content-module a.cur-con-weather-card.is-desktop.lbar-panel.content-module")))
    print("Successfully accessed the current weather card body.")

    # get the html content of the current weather card
    weather_card_body_html = driver.find_element(By.CSS_SELECTOR, "div.template-root div.two-column-page-content div.page-column-1 div.page-content.content-module a.cur-con-weather-card.is-desktop.lbar-panel.content-module").get_attribute('outerHTML')
    print("Retrieved outer html of the current weather card body.")

    # parse html content with beautifulsoup for current weather
    soup_current = BeautifulSoup(weather_card_body_html, 'html.parser')
    print("Parsed html content with beautifulsoup for current weather.")


    # get hourly weather
    # wait for the hourly weather list element to be present
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.hourly-list-wrapper.content-module div.hourly-list.has-arrow-right div.hourly-list__list-wrapper div.hourly-list__list")))
    print("Successfully accessed the hourly weather list.")

    # get the html content of the hourly weather list
    hourly_list_html = driver.find_element(By.CSS_SELECTOR, "div.hourly-list-wrapper.content-module div.hourly-list.has-arrow-right div.hourly-list__list-wrapper div.hourly-list__list").get_attribute('outerHTML')
    print("Retrieved outer html of the hourly weather list.")

    # parse html content with beautifulsoup for hourly weather
    soup_hourly = BeautifulSoup(hourly_list_html, 'html.parser')
    print("Parsed html content with beautifulsoup for hourly weather.")

except Exception as e:
    print(f"An error occurred: {e}")


# shut the driver
driver.quit()

Successfully accessed the current weather card body.
Retrieved outer html of the current weather card body.
Parsed html content with beautifulsoup for current weather.
Successfully accessed the hourly weather list.
Retrieved outer html of the hourly weather list.
Parsed html content with beautifulsoup for hourly weather.


After we made one beatiful soup for each element we needed, we get the required data.

First we get current conditions:

In [6]:
# extracting temp
temp_div = soup_current.select_one('div.temp-container > div.temp')
if temp_div:
    # extract temperature value and unit
    temperature = temp_div.get_text(strip=True)
    print(f"Temperature: {temperature}")
else:
    print("Temperature element not found.")

# extract weather condition
condition_span = soup_current.select_one('span.phrase')
if condition_span:
    # Extract weather condition text
    weather_condition = condition_span.get_text(strip=True)
    print(f"Weather Condition: {weather_condition}")
else:
    print("Weather condition element not found.")

Temperature: 13°C
Weather Condition: Mostly cloudy


Then qe get the average temperature:

In [7]:
# print(soup_hourly.prettify())

In [8]:
# initialize list to store the hourly temps
temperature_list = []

# find all span elements with the specified class in soup_hourly
hourly_time_elements = soup_hourly.find_all('span', class_='hourly-list__list__item-time')

# sweep on the hourly time elements
for time_element in hourly_time_elements:
    temperature_list.append(time_element.text.strip())  # Use .strip() to clean up any extra whitespace

# transform string to integer
temperature_list = [int(x) for x in temperature_list]

# calculate avg temp
avg_temp = sum(temperature_list)/len(temperature_list)
print(f'the avgerage temperature is {avg_temp}')

the avgerage temperature is 12.5


Finally we get the most common condition (only icons of conditions):

In [9]:
from statistics import mode

# initialze list to store the icon urls
icon_list = []

# find all img elements based on the specified selector in soup_hourly
hourly_icon_elements = soup_hourly.select('a.hourly-list__list__item > img:nth-child(2)')

# sweep on the horly icons
for icon_element in hourly_icon_elements:
    icon_src = icon_element['src']  # to get the src attribute of the img tag
    icon_list.append(icon_src)  # append url to list

# check list
print(icon_list)
print(f'the most common weather condition is given by the icon: {mode(icon_list)}')


['https://www.awxcdn.com/adc-assets/images/weathericons/6.svg', 'https://www.awxcdn.com/adc-assets/images/weathericons/38.svg', 'https://www.awxcdn.com/adc-assets/images/weathericons/7.svg', 'https://www.awxcdn.com/adc-assets/images/weathericons/7.svg', 'https://www.awxcdn.com/adc-assets/images/weathericons/7.svg', 'https://www.awxcdn.com/adc-assets/images/weathericons/7.svg', 'https://www.awxcdn.com/adc-assets/images/weathericons/7.svg', 'https://www.awxcdn.com/adc-assets/images/weathericons/7.svg', 'https://www.awxcdn.com/adc-assets/images/weathericons/7.svg', 'https://www.awxcdn.com/adc-assets/images/weathericons/7.svg', 'https://www.awxcdn.com/adc-assets/images/weathericons/11.svg', 'https://www.awxcdn.com/adc-assets/images/weathericons/11.svg']
the most common weather condition is given by the icon: https://www.awxcdn.com/adc-assets/images/weathericons/7.svg
