# Process: Scrape evr2est Web Page for JP2 Files

Organization: Esri

Author: Alberto Nieto (anieto@esri.com)

Date: 9/18/2018

Goal: Use the webpage at https://evr2est.us/mission/hurricaneflorence to download all the JP2 files for Hurricane Florence imagery.

## Pseudocode

- Set reference paths for webpage and target directory

- Build a list of the 'view scene page' URLs for all images

- Iterate on 'view scene page' list:

    - Open URL
    - Download JPEG 2000 (.jp2) file
    - Save in target directory

## Set needed modules

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import arcgis
import sys
import os

## Set helper functions

In [None]:
def get_list_of_ev2est_urls(root_html):
    """Collect the absolute URL of every scene page linked from a mission page.

    Args:
        root_html: URL of the mission page to scrape (despite its name, this
            is a URL that gets requested, not HTML text).

    Returns:
        list of str: "https://evr2est.us" prefixed to the href of each
        anchor whose title is "Scene Page", in the order the parser
        returns them.
    """
    # Fetch the page that was actually passed in. The earlier version read
    # the notebook-global ``webpage_url`` here and ignored its argument.
    response = requests.get(root_html)
    soup = BeautifulSoup(response.content, 'html.parser')
    scene_links = soup.find_all('a', title="Scene Page")
    return [r"https://evr2est.us" + link['href'] for link in scene_links]

In [None]:
def download_jp2000_file(suburl, target_dir):
    """Download the .jp2 file linked from a scene page into ``target_dir``.

    The scene page is fetched and parsed, the first anchor whose href
    contains ".jp2" is taken as the image link, and the image is saved under
    the last path segment of that link. Nothing is downloaded when a file
    with that name is already present in ``target_dir``.

    Args:
        suburl: URL of the scene page.
        target_dir: Directory to save the image in; it is not created here.

    Returns:
        str: Path of the image file inside ``target_dir``.

    Raises:
        TypeError: If the scene page has no link containing ".jp2".
    """
    subhtml = requests.get(suburl)
    subsoup = BeautifulSoup(subhtml.content, 'html.parser')
    jp2tag = subsoup.find("a", href=lambda href: href and ".jp2" in href)
    if jp2tag is None:
        # Same exception type the unguarded ``jp2tag['href']`` used to raise,
        # so callers catching TypeError keep working, but with a message
        # that says what went wrong.
        raise TypeError("No .jp2 link found on scene page %s" % suburl)

    jp2_url = jp2tag['href']
    filename = jp2_url.split('/')[-1]
    filepath = os.path.join(target_dir, filename)

    if os.path.isfile(filepath):
        print("File already present in target directory. Skipping...")
        return filepath

    # Stream to a temporary name and rename only once the transfer is
    # complete: an interrupted download then never leaves a truncated file
    # that the "already present" check above would skip on the next run.
    # NOTE(review): the HTTP status is not checked, so an error page would be
    # saved as the image. Adding raise_for_status() would also require the
    # caller to handle requests.RequestException.
    partial_path = filepath + ".part"
    with requests.get(jp2_url, allow_redirects=True, stream=True) as response:
        with open(partial_path, 'wb') as outfile:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                outfile.write(chunk)
    os.replace(partial_path, filepath)

    return filepath

In [None]:
def main_iteration(webpage_url, target_dir):
    """Download the image behind every scene page linked from a mission page.

    Progress is printed for each scene page. A scene page that fails is
    reported and skipped so the remaining downloads still run.

    Args:
        webpage_url: URL of the mission page listing the scene pages.
        target_dir: Directory the image files are saved in.
    """
    print("Getting list of image urls...")
    urls_list = get_list_of_ev2est_urls(webpage_url)
    print("List gathered. Starting iteration...")
    total = len(urls_list)
    for position, url in enumerate(urls_list, start=1):
        print("Downloading {0} of {1}...".format(position, total))
        try:
            download_jp2000_file(url, target_dir)
        except (TypeError, requests.RequestException) as err:
            # TypeError: the scene page had no .jp2 link.
            # RequestException: a network failure; without this a single bad
            # request would abort the whole batch.
            print('>>>> Error opening %s: %s' % (url, err))

## Run Main

In [None]:
# Mission page whose "Scene Page" links are scraped.
webpage_url = r"https://evr2est.us/mission/hurricaneflorence"

In [None]:
# Local directory the downloaded .jp2 files are written to.
target_dir = r"D:\3_Sandbox_Projects\1809_HurricaneTransportationAssessment\Hurricane Florence Damage Assessment\Data\evr2est"

In [None]:
# Run the full scrape: gather the scene-page URLs, then download each image.
main_iteration(webpage_url, target_dir)

## Prototyping section - the cells below were used for testing.

View this only if you need to see the discrete steps for this sequence executed and tested. 

In [None]:
# Fetch the mission page.
html = requests.get(webpage_url)

In [None]:
# Parse the mission page; the bare name shows the parsed document as the
# cell output.
soup = BeautifulSoup(html.content, 'html.parser')
soup

In [None]:
# Collect every anchor whose title is "Scene Page".
tablerecords = list(soup.find_all('a', title="Scene Page"))
tablerecords

In [None]:
# Prefix each scene-page href with the site root to get a full URL.
urls = [r"https://evr2est.us" + anchor['href'] for anchor in tablerecords]
urls

In [None]:
# Inspect the href of the first scene-page anchor.
tablerecords[0]['href']

In [None]:
# Build the full URL of the first scene page.
sublink_url = r"https://evr2est.us" + tablerecords[0]['href']
sublink_url

In [None]:
# Fetch the first scene page and show its raw response body.
subhtml = requests.get(sublink_url)
subhtml.content

In [None]:
# Parse the scene page.
subsoup = BeautifulSoup(subhtml.content, 'html.parser')
subsoup

In [None]:
# List every anchor on the scene page to locate the image link.
recs = list(subsoup.find_all('a'))
recs

In [None]:
# Find an anchor whose href contains ".jp2"; anchors without an href are
# skipped by the `href and ...` guard.
jp2tag = subsoup.find("a", href=lambda href: href and ".jp2" in href)
jp2tag

In [None]:
# Inspect the image download URL.
jp2tag['href']

In [None]:
# The file name is the last "/"-separated segment of the download URL.
filename = jp2tag['href'].split('/')[-1]
filename

In [None]:
# Show the destination directory.
target_dir

In [None]:
# Download the image and save it under its own name in target_dir. The
# context manager closes the file handle as soon as the bytes are written,
# instead of leaving it open until garbage collection.
r = requests.get(jp2tag['href'], allow_redirects=True)
with open(target_dir + "//" + filename, 'wb') as jp2_file:
    jp2_file.write(r.content)