## Usage:

### Dates
Fill in the start and end date in **YYYY-MM-DD** format.

### User_IDs
**If you are not reading from a file:**  
Provide a list of user IDs in `user_ids`.  
If you want data from all users, leave the list empty (e.g. `user_ids = []`).  

**If you are reading from a file:**   
See the cell under the **Warning** heading below for more information.

### Protocol_Nums
Provide a list of integers (1-4), one for each GLOBE protocol you want to retrieve.  
Here's how each number corresponds to a protocol:
- 1 -> Mosquito Habitat Mapper
- 2 -> Land Cover
- 3 -> Trees
- 4 -> Cloud

### Picture Options
If you want to download pictures, set **download_pictures** to True.

If downloading pictures is enabled, setting **sort_user** to True makes every photo name start with the user ID, which facilitates sorting by user. If **sort_user** is set to False, every photo name starts with the site name instead.



In [3]:
# Date range to query, both written as YYYY-MM-DD
start_date = "2020-06-01"
end_date = "2020-07-15"

# User IDs to fetch; leave the list empty if you want all records
user_ids = [
    "66638672",
    "67152678",
]

# Protocols to fetch: 1 for mosquito, 2 for land cover, 3 for trees, 4 for cloud
protocol_nums = [1, 2]

# Picture options
download_pictures = True
sort_user = True

## Warning
Run the following cell only if you want to load the user IDs from a TXT or CSV file. It replaces the `user_ids` list with the IDs read from that file.

In [2]:
import pandas as pd

# File that holds the user IDs and the delimiter used inside it.
file_name = "SEES IDs.txt"
separator = ","

# header=None: the first line of the file is treated as data, not as column names.
ids = pd.read_csv(file_name, sep=separator, header=None)

# Decide by the real extension (case-insensitive). A plain substring test
# would also match names such as "csv_ids.txt".
if file_name.lower().endswith(".csv"):
    # CSV files: take the IDs from the first row.
    user_ids = ids.iloc[0].tolist()
else:
    # Any other file: take the IDs from the first column.
    user_ids = ids[0].tolist()
# Bare expression so the notebook displays the loaded IDs.
user_ids

[67207102,
 67152678,
 67178603,
 67135171,
 67436475,
 67153225,
 67255924,
 67160414,
 67164796,
 67544809,
 67213936,
 67236271,
 67160927,
 67297451,
 7205631,
 67162901,
 67176457,
 67159651,
 67238425,
 67151343,
 67155491,
 67185765,
 67151201,
 67145371,
 67151408,
 67563196,
 67238083,
 67152840,
 67177452,
 67175949,
 67153760,
 67181924,
 67131941,
 58819763,
 67152169,
 67150611,
 67174075,
 67218609,
 67405621,
 67227647,
 67257314,
 67162066,
 67152454,
 67120922,
 67174444,
 67463027,
 66638672,
 67124833,
 67150801,
 67176902,
 67217642,
 67151727,
 67224590,
 67163783,
 67464407,
 67223459,
 67203043,
 67166055,
 67176607,
 67275996,
 67305043,
 67153082,
 67108841,
 67056683,
 67153484,
 67347925,
 67275738,
 67152311,
 67378028,
 67221466,
 67152983,
 67175052,
 67151585,
 6715296,
 67156349,
 67160615,
 67179522,
 67224666,
 67165912,
 67151859,
 67172186,
 67181788,
 67055901,
 67216624,
 66899558,
 67107291,
 67150463,
 67133071,
 67224292,
 67153594,
 67199689,
 

In [4]:
import requests
import json
import os
import re
import pandas as pd
# Maps each protocol number used in `protocol_nums` to its protocol identifier.
protocols = {
    1: "mosquito_habitat_mapper",
    2: "land_covers",
    3: "tree_heights",
    4: "sky_conditions",
}

def photo_download(data, protocol, *, sort_by_user=None):
    """Download every photo referenced in ``data`` and return a manifest.

    Images are saved into the ``downloaded_images`` folder, which is created
    when it does not exist. Entries whose photo URL is missing, is not an
    ``https://`` link, has an unexpected format, or fails to download are
    skipped.

    Args:
        data: Dict with a ``"features"`` list; each feature carries a
            ``"properties"`` dict holding the measurement fields.
        protocol: Protocol identifier such as ``"land_covers"``. Property
            keys are prefixed with this value with the underscores removed.
        sort_by_user: When true, file names start with the user ID; when
            false, with the site name. ``None`` (the default) uses the
            notebook-level ``sort_user`` setting.

    Returns:
        pandas.DataFrame with one row per saved image and the columns
        ``image_name``, ``origin``, ``link``, ``attribution`` and
        ``license``. It is empty when nothing was downloaded.
    """
    # Derived from the argument so the function does not rely on loop globals.
    protocol_name = protocol.replace("_", "")
    # Directions for land covers and sky conditions
    directions = ["Downward", "East", "North", "South", "West", "Upward"]
    # The photo id is the path segment that follows the YYYY/MM/DD/ part.
    photo_id_pattern = re.compile(r'(?<=\d\d\d\d\/\d\d\/\d\d\/).*(?=\/)')

    os.makedirs("downloaded_images", exist_ok=True)

    def get_picture(picture_url, file_name):
        """Fetch ``picture_url`` and store it as downloaded_images/file_name."""
        # A timeout keeps one stalled download from hanging the whole run.
        downloaded_obj = requests.get(picture_url, allow_redirects=True, timeout=60)
        # Do not save an HTTP error page under a .jpg name.
        downloaded_obj.raise_for_status()
        with open(os.path.join("downloaded_images", file_name), "wb") as file:
            file.write(downloaded_obj.content)

    def download_picture(picture_url, name):
        """Download one photo and return its manifest row, or None if skipped."""
        # Skip empty values and placeholders that are not real links.
        if not picture_url or "https://" not in picture_url:
            return None
        match = photo_id_pattern.search(picture_url)
        if match is None:
            print(f"skipping photo with unexpected URL format: {picture_url}")
            return None
        file_name = f"{name}-{match.group(0)}-{protocol_name}.jpg".replace(":", "-")
        try:
            get_picture(picture_url, file_name)
        except requests.RequestException as error:
            print(f"skipping photo that could not be downloaded: {picture_url} ({error})")
            return None
        return {
            "image_name": file_name,
            "origin": "GLOBE",
            "link": picture_url,
            "attribution": "GLOBE",
            "license": "Creative Commons - share adapt attribute",
        }

    rows = []
    for feature in data["features"]:
        properties = feature["properties"]
        # Looked up lazily so the global is only needed when there is work to do.
        use_user_prefix = sort_user if sort_by_user is None else sort_by_user

        name = f"{properties[f'{protocol_name}MeasuredAt']}"
        if use_user_prefix:
            name = f"{properties[f'{protocol_name}Userid']}-{name}"
        else:
            name = f"{properties['siteName']}-{name}"

        if protocol in ("land_covers", "sky_conditions"):
            # One photo URL per direction.
            photo_urls = [
                properties.get(f"{protocol_name}{direction}PhotoUrl")
                for direction in directions
            ]
        else:
            if protocol == "mosquito_habitat_mapper":
                joined = properties.get(f"{protocol_name}WaterSourcePhotoUrls")
            else:
                joined = properties.get(f"{protocol_name}TreePhotoUrls")
            # Several URLs are stored in one ";"-separated string.
            photo_urls = (joined or "").split(";")

        for photo_url in photo_urls:
            row = download_picture(photo_url, name)
            if row is not None:
                rows.append(row)

    # Built in one step; DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(rows)
     

def get_geojson(protocol):
    """Fetch GLOBE measurements for ``protocol`` and save them as a JSON file.

    Reads the notebook-level ``start_date``, ``end_date`` and ``user_ids``
    settings. One request is made per user ID, or a single request for all
    users when ``user_ids`` is empty. Only features whose data source is
    "GLOBE Observer App" are kept. For each kept feature the geometry
    coordinates are overwritten with the measurement longitude and latitude
    from its properties. The result is written into the ``geojson output``
    folder, which is created if needed.

    Args:
        protocol: Protocol identifier such as ``"land_covers"``. Property
            keys are prefixed with this value with the underscores removed.

    Returns:
        The combined and filtered response as a dict with a ``"features"`` list.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    # Derived from the argument so the function does not rely on loop globals.
    protocol_name = protocol.replace("_", "")

    if user_ids:
        urls = [
            f"https://api.globe.gov/search/v1/measurement/protocol/measureddate/userid/?protocols={protocol}&startdate={start_date}&enddate={end_date}&userid={user_id}&geojson=TRUE&sample=FALSE"
            for user_id in user_ids
        ]
    else:
        urls = [
            f"https://api.globe.gov/search/v1/measurement/protocol/measureddate/?protocols={protocol}&startdate={start_date}&enddate={end_date}&geojson=TRUE&sample=FALSE"
        ]

    # The first response is the container; later responses only add features.
    data = None
    for url in urls:
        response = requests.get(url, timeout=120)
        # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
        response.raise_for_status()
        payload = response.json()
        if data is None:
            data = payload
        else:
            data["features"].extend(payload["features"])

    data["features"] = [
        feature
        for feature in data["features"]
        if feature["properties"].get(f"{protocol_name}DataSource") == "GLOBE Observer App"
    ]

    for feature in data["features"]:
        properties = feature["properties"]
        try:
            # Read everything first so a missing key cannot leave the
            # coordinates half-updated.
            longitude = properties[f"{protocol_name}MeasurementLongitude"]
            latitude = properties[f"{protocol_name}MeasurementLatitude"]
            coordinates = feature["geometry"]["coordinates"]
        except KeyError:
            print("skipping")  # some entries lack the measurement coordinates
            continue
        coordinates[0] = longitude
        coordinates[1] = latitude

    # write to file
    os.makedirs("geojson output", exist_ok=True)
    file_name = f"{protocol}-{user_ids}_measuredAt.json"
    if len(file_name) > 200:
        # A long ID list would exceed the file-name length limit of common
        # filesystems, so fall back to the number of users.
        file_name = f"{protocol}-{len(user_ids)}_users_measuredAt.json"
    with open(f"geojson output/{file_name}", "w") as fp:
        json.dump(data, fp)
    return data

# Fetch every requested protocol and, if enabled, download its photos while
# building one combined manifest of all downloaded images.
manifest = pd.DataFrame()
for protocol_num in protocol_nums:
    # protocol_num and protocol_name stay module-level names because code
    # defined elsewhere in the notebook may read them.
    protocol = protocols[protocol_num]
    protocol_name = protocol.replace('_', '')
    data = get_geojson(protocol)
    if download_pictures:
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        manifest = pd.concat([manifest, photo_download(data, protocol)], ignore_index=True)
        manifest.index.name = "subject_id"
        # Rewritten after each protocol so the file always reflects progress so far.
        manifest.to_csv("downloaded_images/manifest4zooniverse.csv")

The following cell zips the downloaded_images folder into downloaded_pictures.zip and then deletes the downloaded_images folder.

In [5]:
import shutil

# Pack the image folder into downloaded_pictures.zip ...
shutil.make_archive(
    base_name="downloaded_pictures",
    format="zip",
    root_dir="downloaded_images",
)
# ... then remove the folder that was just archived.
shutil.rmtree(path="downloaded_images")