## NASA Satellite Images:

Suppose we want to build a Computer vision dataset that involves satellite images.

Your tasks are the following:
- Collect satellite images from https://earthobservatory.nasa.gov/images
 - Make sure to render the whole page using Selenium and then use BeautifulSoup to scrape the data.
 - Create a repo and name it Images. Save the crawled images based on their titles.
 - Create a dictionary where the keys are the images/titles and the values are the images’ descriptions.

In [21]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from time import sleep

from bs4 import BeautifulSoup
import json

In [2]:
# Open a Chrome window. The driver binary comes from
# ChromeDriverManager().install() and is wrapped in a Service object.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service)

# Navigate to the Earth Observatory image listing.
driver.get("https://earthobservatory.nasa.gov/images")

In [7]:
# Maximum number of times to press the "Explore More" button.
Number_of_clicks = 4

for _ in range(Number_of_clicks):
    # find_elements returns an empty list instead of raising when nothing
    # matches, so a missing button (e.g. nothing left to load) ends the loop
    # cleanly instead of crashing the cell.
    explore_more_buttons = driver.find_elements(By.XPATH, '//*[@class="explore-more"]')
    if not explore_more_buttons:
        break
    explore_more_buttons[0].click()
    sleep(2)  # Add a delay of 2 seconds to allow the content to load

In [8]:
# Snapshot the browser's current HTML (taken after the "Explore More" clicks
# above) so it can be parsed outside of Selenium.
page_source = driver.page_source

In [14]:
# Parse the captured page source using the "html.parser" backend.
web_data = BeautifulSoup(page_source, "html.parser")
# Bare expression at the end of the cell: the notebook displays the parsed document.
web_data

<html lang="en"><!--<![endif]--><head>
<!-- Google tag (gtag.js) -->
<script async="" id="www-widgetapi-script" src="https://www.youtube.com/s/player/bbe1b497/www-widgetapi.vflset/www-widgetapi.js" type="text/javascript"></script><script src="https://www.youtube.com/iframe_api"></script><script async="" src="https://www.google-analytics.com/analytics.js"></script><script async="" src="https://ssl.google-analytics.com/ga.js" type="text/javascript"></script><script async="" src="https://www.googletagmanager.com/gtag/js?id=G-KEN9L99MMC"></script>
<script>
      window.dataLayer = window.dataLayer || [];
      function gtag(){dataLayer.push(arguments);}
      gtag('js', new Date());

      gtag('config', 'G-KEN9L99MMC');
    </script>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>Images</title>
<!-- NASA required metadata -->
<meta content="610" name="orgcode"/>
<meta content="Robe

In [17]:
# Result dictionary, filled by the loop in the next cell: title -> image record.
Nasa_images = {}
# Every <div class="thumbnail-image"> in the parsed page.
image_links = web_data.find_all("div", class_="thumbnail-image")
# Every <div class="caption"> in the parsed page.
captions = web_data.find_all("div", class_="caption")


In [19]:
# Walk the thumbnails and captions in lockstep (zip stops at the shorter list)
# and file each image's link and description under its title.
for thumb_div, caption_div in zip(image_links, captions):
    record = {
        "Image": thumb_div.a.img["src"],
        "Description": caption_div.p.text,
    }
    # The title is the text of the link inside the caption's <h4> heading.
    Nasa_images[caption_div.h4.a.text] = record


In [23]:
# Save the image information dictionary as a JSON file
with open('Nasa_images_data.json', 'w') as f:
    json.dump(Nasa_images, f, indent = 4)

### IMDB:

In [None]:
import requests
import os

target = 10001
# 1-based "start" offsets of the result pages: 1, 51, 101, ...
page = list(range(1, target, 50))
categories = ["female", "male"]

IMDB_OUTPUT_PATH = 'IMDB_data.json'
REQUEST_TIMEOUT_SECONDS = 30


def split_role_and_movie(info_text):
    """Split a "role | movie" string into a stripped ``(role, movie)`` pair.

    Only the first "|" separates the two parts, so additional "|" characters
    stay in the movie part instead of raising ``ValueError`` on unpacking.
    When there is no "|" the whole text is the role and the movie is "".
    """
    role, _, movie = info_text.partition("|")
    return role.strip(), movie.strip()


def extract_celebrities(soup, found):
    """Add every celebrity listed on one parsed result page to *found* in place.

    *found* maps a celebrity name to a dict with the keys "Image", "Role",
    "Movie" and "History". For a name that is already present the first
    image link is kept and the other fields are overwritten.
    """
    names = soup.find_all("h3", class_="lister-item-header")
    # NOTE(review): this collects every <img> on the page and pairs them with
    # the headers by position; it presumably relies on the page containing no
    # other images before the result list - confirm against the live markup.
    images = soup.find_all("img")
    infos = soup.find_all("div", class_="lister-item-content")

    for header, image_tag, info in zip(names, images, infos):
        name = header.a.text.strip()
        entry = found.setdefault(name, {"Image": image_tag["src"]})

        # Role and movie share one "role | movie" paragraph.
        info_element = info.find("p", class_="text-muted text-small")
        info_text = info_element.text.strip() if info_element else ""
        entry["Role"], entry["Movie"] = split_role_and_movie(info_text)

        # The second paragraph of the item, when present, is stored as history.
        info_paragraphs = info.find_all("p")
        entry["History"] = (
            info_paragraphs[1].text.strip() if len(info_paragraphs) > 1 else ""
        )


def scrape_celebrities(categories, pages):
    """Scrape all *pages* for each gender in *categories*.

    Returns ``{category: {name: info}}``. A category only appears in the
    result when at least one celebrity was found for it. A page whose request
    fails is reported and skipped so the data gathered so far is not lost.
    """
    celebrities = {}
    for category in categories:
        found = {}
        for start in pages:
            url = f"https://www.imdb.com/search/name/?gender={category}&start={start}&ref_=rlm"
            try:
                # A timeout keeps one stalled connection from hanging the run.
                html_text = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS).text
            except requests.RequestException as error:
                print(f"Skipping {url}: {error}")
                continue
            extract_celebrities(BeautifulSoup(html_text, "html.parser"), found)
        if found:
            celebrities[category] = found
    return celebrities


def merge_celebrities(existing, new):
    """Return a new dict holding *existing* updated per category with *new*.

    Neither argument is modified; entries in *new* replace entries with the
    same name in *existing*.
    """
    merged = {category: dict(people) for category, people in existing.items()}
    for category, people in new.items():
        merged.setdefault(category, {}).update(people)
    return merged


def save_celebrities(celebrities, path=IMDB_OUTPUT_PATH):
    """Write *celebrities* to *path* as a single valid JSON document.

    If *path* already holds a JSON object, the new data is merged into it and
    the file is rewritten. Appending a second ``json.dump`` to the file would
    produce two concatenated documents that ``json.load`` cannot read back.
    """
    try:
        with open(path) as f:
            existing = json.load(f)
    except FileNotFoundError:
        existing = {}
    except json.JSONDecodeError:
        print(f"{path} does not contain valid JSON; it will be overwritten")
        existing = {}
    if not isinstance(existing, dict):
        existing = {}

    with open(path, 'w') as f:
        json.dump(merge_celebrities(existing, celebrities), f)


# True both in a notebook cell and when the file is run as a script, so the
# scrape still runs there and `celebrities` stays available afterwards, while
# the helpers above can be imported without triggering any network traffic.
if __name__ == "__main__":
    celebrities = scrape_celebrities(categories, page)
    # Save the celebrities dictionary as a JSON file
    save_celebrities(celebrities)