# Web Scraping - Global Forest Watch data
- We create the output folder inside the "Downloads" folder in case it does not exist
- This script iterates over the dataset repository
- Gets the link headers "xxxx.tif"
- Since they don't contain the full structure of the URL, we build the download link from the base URL and the link header
- Pointing to the created download folder, we download them one by one

In [1]:
import requests
from bs4 import BeautifulSoup
import os
import re

In [10]:
# Fetch the Global Forest Watch file listing and collect its anchor tags.
# URL of the listing to scrape
url = "https://storage.googleapis.com/earthenginepartners-hansen/GFC-2022-v1.10/treecover2000.txt"
# Send a GET request to the URL; the timeout keeps the cell from hanging on a stalled connection
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the listing
response.raise_for_status()
# Parse the response body as HTML
soup = BeautifulSoup(response.text, 'html.parser')
# Find all <a> elements in the page
# NOTE(review): the URL ends in ".txt"; if the body is plain text rather than HTML,
# this list will be empty — confirm against the actual response
links = soup.find_all('a')

In [5]:
# Create a folder inside "Downloads" to store downloaded files
def set_output_folder(output_folder_name):
    """Ensure ``~/Downloads/<output_folder_name>`` exists and return its path.

    Args:
        output_folder_name: Name of the sub-folder to create inside the
            current user's "Downloads" directory.

    Returns:
        The path of the folder, as a string.
    """
    output_admin_folder = "Downloads"
    download_folder = os.path.join(os.path.expanduser('~'), output_admin_folder, output_folder_name)
    if os.path.isdir(download_folder):
        print("Using the existing folder: " + output_folder_name)
    else:
        # exist_ok guards against the folder appearing between the check and the call
        os.makedirs(download_folder, exist_ok=True)
        print("Created the folder: " + output_folder_name)
    # Return only the path: the result is passed straight to os.path.join / os.listdir,
    # which reject a (path, None) tuple
    return download_folder

# Collect the treecover*.tif file names from the scraped links
def make_raster_link_list(links):
    """Return the href of every link that names a ``treecover*.tif`` file.

    Each element of ``links`` must offer ``.get('href')``. The number of
    matching names is printed before the list is returned.
    """
    hrefs = (anchor.get('href') for anchor in links)
    web_file_list = [
        name
        for name in hrefs
        if name and name.endswith('.tif') and name.startswith('treecover')
    ]
    print(len(web_file_list))
    return web_file_list

def download_file(file_url, file_location, retries=3, timeout=60):
    """Download ``file_url`` to ``file_location``, retrying on failure.

    The response is streamed into a temporary ``.part`` file next to the
    destination and only moved into place after a complete, successful
    download, so a failed attempt never leaves an empty or truncated file
    at ``file_location``.

    Args:
        file_url: URL of the file to fetch.
        file_location: Path the downloaded file is written to.
        retries: Maximum number of attempts; must be at least 1.
        timeout: Seconds to wait on the server before an attempt is abandoned.

    Raises:
        ValueError: If ``retries`` is smaller than 1.
    """
    if retries < 1:
        raise ValueError("retries must be at least 1")

    partial_location = f"{file_location}.part"
    last_error = None
    for attempt in range(1, retries + 1):
        try:
            # stream=True avoids holding a whole raster in memory
            with requests.get(file_url, stream=True, timeout=timeout) as response:
                # Treat HTTP errors (404, 500, ...) as failures rather than saving the error body
                response.raise_for_status()
                with open(partial_location, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
            # Publish the file only once it is complete
            os.replace(partial_location, file_location)
            return
        except (requests.RequestException, OSError) as e:
            last_error = e
            # Drop whatever was written by the failed attempt
            if os.path.exists(partial_location):
                os.remove(partial_location)
            if attempt < retries:
                print(f"Download failed for {file_location}. Retrying...")

    print(f"Failed to download {file_location} after {retries} attempts. Error: {str(last_error)}")
    print("the next files were not downloaded")
    print(file_location)

In [None]:
from urllib.parse import urljoin

output_folder_name = "global_forest_watch_data"
# Create the folder
download_folder = set_output_folder(output_folder_name)
# Accept either a bare path or a (path, ...) tuple from set_output_folder
if isinstance(download_folder, tuple):
    download_folder = download_folder[0]

if not links:
    # No <a> elements were found, so treat the response as plain text and
    # extract the .tif/.tiff URLs with a regular expression:
    #   https?  -> "http" or "https"
    #   [^\s]+  -> one or more non-whitespace characters
    #   tiff?   -> "tif" or "tiff" (an alternation starting with "tif" would cut ".tiff" short)
    # The result gets its own name so that `links` keeps holding link elements.
    tif_urls = re.findall(r'(https?://[^\s]+\.tiff?)', response.text)
    for tif_url in tif_urls:
        filename = os.path.basename(tif_url)
        file_location = os.path.join(download_folder, filename)  # location of the download
        print("Downloading: {}".format(filename))
        # download_file performs the request itself; fetching here as well
        # would transfer every file twice
        download_file(tif_url, file_location)
else:
    # Download each treecover*.tif file into the created folder
    for link in links:
        href = link.get('href')  # the name of the file "kjdfnfosdjf.tif"
        if href and href.endswith('.tif') and href.startswith('treecover'):
            file_location = os.path.join(download_folder, href)  # location of the download
            # urljoin resolves href against the listing URL, so a trailing
            # file name in `url` is replaced rather than appended to
            file_url = urljoin(url, href)
            print("Downloading: ", file_location)
            download_file(file_url, file_location)

print("Download completed!")

In [None]:
from urllib.parse import urljoin

output_folder_name = "global_forest_watch_data"
# Create the folder
download_folder = set_output_folder(output_folder_name)
# Accept either a bare path or a (path, ...) tuple from set_output_folder
if isinstance(download_folder, tuple):
    download_folder = download_folder[0]
# Download each treecover*.tif file into the created folder
for link in links:
    href = link.get('href')  # the name of the file "kjdfnfosdjf.tif"
    if href and href.endswith('.tif') and href.startswith('treecover'):
        file_name = os.path.join(download_folder, href)
        # urljoin resolves href against the listing URL, so a trailing
        # file name in `url` is replaced rather than appended to
        file_url = urljoin(url, href)
        print("Downloading: ", file_name)
        download_file(file_url, file_name)

print("Download completed!")

In [17]:
"""Check for missing files"""
web_file_list = make_raster_link_list(links)
# List all files in the download folder
local_file_list = os.listdir(download_folder)

# Compare the names case-insensitively, because the local and remote
# spellings can differ in letter case.
downloaded_files_set = {name.lower() for name in local_file_list}
website_files_set = {name.lower() for name in web_file_list}

# Keep the remote spelling of every missing name: these names are used to
# build download URLs, and a lower-cased URL may not resolve on the server.
missing_files = {
    name for name in web_file_list if name.lower() not in downloaded_files_set
}

if missing_files:
    print("The following files are missing:")
    for file in missing_files:
        print(file)

In [16]:
"""Download missing files"""
from urllib.parse import urljoin

# enumerate keeps the progress counter running across iterations
# (re-initialising it inside the loop would always report "1 out of N")
for layer_count, missing_file in enumerate(missing_files, start=1):
    # urljoin resolves the file name against the listing URL, so a trailing
    # file name in `url` is replaced rather than appended to
    file_url = urljoin(url, missing_file)
    file_name = os.path.join(download_folder, missing_file)
    print("Downloading: ", file_name, "{} out of {}".format(layer_count, len(missing_files)))
    download_file(file_url, file_name)

print("Missing files downloaded successfully!")

The following files are missing:
treecover2010_20n_120e.tif
treecover2010_20s_010e.tif
treecover2010_20n_110w.tif
treecover2010_20n_110e.tif
treecover2010_20n_160w.tif
Missing files downloaded successfully!
