In [8]:
import requests
import pandas as pd
import numpy as np
import re
import os
from PIL import Image, ImageDraw, ImageFont
from io import BytesIO

# Directory (relative path) into which the downloaded photos are saved.
out_directory = "OutImages"

In [9]:
# Create the photo output directory if it is missing. exist_ok=True makes the
# cell safe to re-run and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(out_directory, exist_ok=True)

In [3]:
# Load the land cover table, strip spaces out of every column name and repair
# the misspelled "meaureLongitude" header; the trailing expression displays it.
df = (
    pd.read_csv("FrontiersData.csv")
    .rename(columns=lambda header: header.replace(" ", ""))
    .rename(columns={"meaureLongitude": "measureLongitude"})
)
df

Unnamed: 0,landCoverId,dataSource,siteName,site_id,measureLatitude,measureLongitude,measureElevation,SiteLatitude,SiteLongitude,SiteElevation,...,MCD12Q1_006_LC_Type4,MCD12Q1_006_LC_Type4_label,MCD12Q1_006_LC_Type5,MCD12Q1_006_LC_Type5_label,MCD12Q1_006_LW,MCD12Q1_006_LW_label,MCD12Q1_006_QC,MCD12Q1_006_QC_bitmask,MCD12Q1_006_QC_Name,MCD12Q1_006_QC_Name_Description
0,26205,GLOBE Observer App,32SNK584133,164098,39.8681,9.6829,5.7,39.867732,9.682845,7.1,...,6,Annual Broadleaf Vegetation,6,Grass,2,Land,0,0b00000000,0b00000000,Classified land (Has a classification label an...
1,27886,GLOBE Observer App,11SLT956789,47468,34.1466,-118.1323,250.5,34.145788,-118.132419,248.2,...,8,Urban and Built-up Lands,9,Urban and Built-up Lands,2,Land,0,0b00000000,0b00000000,Classified land (Has a classification label an...
2,27897,GLOBE Observer App,18SUH671867,171620,38.7190,-76.5276,1.8,38.718968,-76.528719,1.5,...,4,Deciduous Broadleaf Vegetation,4,Deciduous Broadleaf Trees,2,Land,0,0b00000000,0b00000000,Classified land (Has a classification label an...
3,27179,GLOBE Observer App,18NWH130983,165981,2.6992,-74.8826,1248.4,2.698788,-74.883044,1251.2,...,2,Evergreen Broadleaf Vegetation,2,Evergreen Broadleaf Trees,2,Land,0,0b00000000,0b00000000,Classified land (Has a classification label an...
4,25962,GLOBE Observer App,11SMT309761,139902,34.1239,-117.7493,467.1,34.123473,-117.749336,467.6,...,8,Urban and Built-up Lands,9,Urban and Built-up Lands,2,Land,9,0b00001001,0b00001001,Forest type changed (Climate-based change to f...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3347,27839,GLOBE Observer App,34UEA466932,170969,50.4892,21.6577,150.6,50.488925,21.656918,153.7,...,4,Deciduous Broadleaf Vegetation,4,Deciduous Broadleaf Trees,2,Land,0,0b00000000,0b00000000,Classified land (Has a classification label an...
3348,27873,GLOBE Observer App,34UEA481984,171364,50.5357,21.6796,158.9,50.535565,21.678732,158.8,...,5,Annual Broadleaf Vegetation,8,Broadleaf Croplands,2,Land,0,0b00000000,0b00000000,Classified land (Has a classification label an...
3349,28021,GLOBE Observer App,34UEA471929,172992,50.4868,21.6645,147.2,50.486187,21.663928,147.0,...,6,Annual Broadleaf Vegetation,6,Grass,2,Land,0,0b00000000,0b00000000,Classified land (Has a classification label an...
3350,28031,GLOBE Observer App,34UEA461926,172988,50.4840,21.6509,147.0,50.483569,21.649796,147.0,...,5,Annual Broadleaf Vegetation,8,Broadleaf Croplands,2,Land,0,0b00000000,0b00000000,Classified land (Has a classification label an...


In [19]:
import re
# Number assigned to each photo direction (up=1 ... west=6).
direction_map = dict(
    zip(("up", "down", "north", "east", "south", "west"), range(1, 7))
)
# Columns whose name contains "sc_" (surface conditions).
surface_conditions = [name for name in df.columns if "sc_" in name]
# Columns whose name contains "URL" (one photo link per direction).
url_entries = [name for name in df.columns if "URL" in name]

def get_photos(df, timeout=60):
    """Downloads the photos of a Landcover DataFrame and records their file names and resolutions.

    The file names follow this convention:
    GOLC_[measureLatitude]_[measureLongitude]_[siteName]_[SiteElevation]m_[yyyymmdd]_[landCoverId]_{directionNumber}
    [direction]_[MCD12Q1_006_LW_label]_[muc_code]_IGBP[MCD12Q1_006_LC_Type1][MCD12Q1_006_LC_Type1_label]_
    {surfaceConditions=True}_[locationMethod][accuracy_m]m_[width]x[height].jpg

    Most of the fields in square or curly brackets are fields in the DataFrame.
    surfaceConditions=True: All SurfaceCondition entries that are true for that particular row in the dataset.
    directionNumber: An integer representing the direction (Up: 1, Down: 2, North: 3, East: 4, South: 5, West: 6)
    direction: The direction the photo was taken (North, South, East, West, Up, Down)

    The module-level names `url_entries`, `surface_conditions`, `direction_map` and
    `out_directory` are read to decide which columns to process and where to save.
    The input DataFrame is not modified; a new DataFrame is returned.

    Arguments:
        df: pd.DataFrame, a Pandas DataFrame Containing GLOBE Observer Landcover Data from the GLOBE API
        timeout: number, seconds passed to `requests.get` as the timeout of each download attempt

    Returns:
        pd.DataFrame, an updated Landcover DataFrame with columns for image resolutions in all image directions and downloaded file names.
    """

    def is_missing(value):
        """Returns True when a cell holds None/NaN/NA."""
        return bool(pd.isna(value))

    def pad_zeroes(text, length, trail=False, value="0"):
        """Pads a string with a character to have it at a fixed length

        Parameters
        ----------
        text: anything convertible with str(), the value to be padded
        length: int, the desired minimum length
        trail: bool, if the padding should trail the string instead of leading it
        value: str, the single character used for padding

        Returns
        -------
        str
            The padded string (unchanged when it is already long enough).
        """
        text = str(text)
        return text.ljust(length, value) if trail else text.rjust(length, value)

    def camel_case(string, delimiter=" "):
        """Removes a delimiter from a string and upper-cases the first letter of every word

        Parameters
        ----------
        string: str, the string to convert
        delimiter: str, the character that denotes separate words
        """
        # word[:1] instead of word[0]: repeated, leading or trailing delimiters
        # produce empty words, which must not raise an IndexError.
        return "".join(word[:1].upper() + word[1:] for word in string.split(delimiter))

    def format_coordinate(value, positive, negative, int_width):
        """Formats a coordinate as <hemisphere><integer part>p<5 decimals>, e.g. n39p86810."""
        hemisphere = positive if value >= 0 else negative
        # Fixed-point formatting always yields exactly one "." and five decimals,
        # also for tiny values (1e-05) or integers where str() has no usable ".".
        int_part, dec_part = f"{abs(value):.5f}".split(".")
        return hemisphere + pad_zeroes(int_part, int_width) + "p" + dec_part

    def get_photo(row, entry, name=""):
        """Downloads one landcover photo, saves it and records its file name and resolution in the row.

        Arguments:
            row: pd.Series, A Pandas DataFrame Row Containing GLOBE Observer Landcover Data from the GLOBE API
            entry: str, the column key for the url entry of the row
            name: str, the file name following the aforementioned standard without the resolution information.

        Returns:
            pd.Series, the row; updated only when the entry held an https link and the photo was saved

        Raises:
            Whatever the download, the image decoding or the save raises; the caller handles it.
        """
        url = row[entry]
        if not (isinstance(url, str) and "https" in url):
            return row

        # Download photo; fail fast on hung connections and HTTP error pages
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        photo = Image.open(BytesIO(response.content))

        # Get resolution
        width, height = photo.size
        name += f"{width}x{height}.jpg"

        # Remove characters that are invalid in file names, including path separators
        name = re.sub(r"[<>:?\"|*/\\]", "", name)

        # JPEG cannot store alpha/palette images; convert those before saving
        if photo.mode not in ("L", "RGB", "CMYK"):
            photo = photo.convert("RGB")

        # Save first, so the row only ever references files that really exist
        photo.save(os.path.join(out_directory, name))

        row[entry.replace("URL", "PhotoFileName")] = name
        row[entry.replace("URL", "imgXdim")] = width
        row[entry.replace("URL", "imgYdim")] = height
        return row

    def get_entry_photos(row):
        """Downloads the photos of every direction for one Landcover DataFrame row.

        Arguments:
            row: pd.Series, A Pandas DataFrame Row Containing GLOBE Observer Landcover Data from the GLOBE API

        Returns:
            pd.Series, a copy of the row with image resolutions and filenames for all downloaded directions
        """
        # Work on a copy: the object handed over by DataFrame.apply should not be mutated
        row = row.copy()

        # Get necessary name values from the row
        measured_latitude = format_coordinate(row["measureLatitude"], "n", "s", 2)
        measured_longitude = format_coordinate(row["measureLongitude"], "e", "w", 3)

        # zfill keeps the sign in front for sites below sea level ("-005", not "00-5")
        site_elevation = str(int(round(row["SiteElevation"]))).zfill(4)

        month, day, year = row["measureDate"].split("/")
        if len(year) == 2:
            # two-digit years are expanded with the "20" century prefix
            year = f"20{year}"
        date = year + pad_zeroes(month, 2) + pad_zeroes(day, 2)

        site_name = row["siteName"]
        landcover_id = pad_zeroes(row["landCoverId"], 8)
        # NOTE(review): padded with "1" (not "0") exactly as before - confirm this is intended
        label = pad_zeroes(row["MCD12Q1_006_LW_label"], 5, True, "1")

        muc = row["muc_code"]
        if is_missing(muc) or not muc:
            muc = "xxx"
        else:
            muc = pad_zeroes(muc, 3, True)

        labels = row["MCD12Q1_006_LC_Type1_label"]
        for delimiter in (" ", ",", "-", "/"):
            labels = camel_case(labels, delimiter)

        type_1 = pad_zeroes(row["MCD12Q1_006_LC_Type1"], 2)

        # NaN is truthy in Python, so missing surface conditions are excluded explicitly
        sc_name = "".join(
            column.replace("sc_", "")
            for column in surface_conditions
            if not is_missing(row[column]) and row[column]
        )

        location_method = row["locationMethod"]
        if is_missing(location_method) or not location_method:
            location_method = "manual"

        accuracy = row["accuracy_m"]
        if is_missing(accuracy) or not accuracy:
            accuracy = "_"
        else:
            accuracy = int(round(accuracy))

        # runs the download for each direction
        for entry in url_entries:
            direction = entry.replace("URL", "")
            direction_num = direction_map[direction]
            direction = pad_zeroes(direction.capitalize(), 5, True)
            name = f"GOLC_{measured_latitude}_{measured_longitude}_{site_name}_{site_elevation}m_{date}_{landcover_id}_{direction_num}{direction}_{label}_{muc}_IGBP{type_1}{labels}_{sc_name}_{location_method}{accuracy}m_"

            # Best effort: one retry, then report the failure and move on
            for attempt in (1, 2):
                try:
                    row = get_photo(row, entry, name)
                except Exception as e:
                    if attempt == 1:
                        print(f"{row[entry]} failed, retrying...")
                    else:
                        print(f"{row[entry]} failed: {repr(e)}")
                else:
                    if attempt == 2:
                        print("retry succeded")
                    break
        return row

    # Stands on its own even if the directory-creating cell was not run
    os.makedirs(out_directory, exist_ok=True)

    # Add the result columns with a default null value; assign() returns a new
    # DataFrame, so the caller's DataFrame is left untouched.
    new_columns = {
        entry.replace("URL", suffix): np.nan
        for entry in url_entries
        for suffix in ("PhotoFileName", "imgXdim", "imgYdim")
    }

    # Apply helper function to all DataFrame Rows
    return df.assign(**new_columns).apply(get_entry_photos, axis=1)




In [None]:
# Download all photos and rebind df to the table returned by get_photos;
# the %time line magic reports how long the call took.
%time df = get_photos(df)

In [None]:
# Write the augmented table (row index included) to full_run.csv.
df.to_csv(path_or_buf="full_run.csv")