# Inference Data Mount

In [141]:
!mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport 172.31.91.151:/ ./efs_inference_data

mount: exec /Library/Filesystems/nfs4.fs/Contents/Resources/mount_nfs4 for /Users/purgatorid/Documents/GitHub/Project Canopy/cb_feature_detection/inference/efs_inference_data: No such file or directory


# For Docker Run / Sagemaker

In [1]:
import sys
sys.executable

'/usr/local/bin/python'

# Start Local / Sagemaker Imports

In [1]:
import json
import os
import rasterio as rio
import numpy as np
import pandas as pd
from rasterio.windows import Window
from glob import glob
from shapely.geometry import Polygon
from shapely.geometry import box
import geopandas as gpd
from rasterio.windows import get_data_window
import rasterio as rio
from inference_predict import *
import boto3
import matplotlib.pyplot as plt
# import gdal
from rasterio.enums import Resampling
from rasterio.vrt import WarpedVRT

# Windowing

In [4]:
def get_windows(img_dim, patch_size=(240, 240), stride=(240, 240)):
    """Build the grid of rasterio ``Window`` objects used to tile a raster.

    Args:
        img_dim: (rows, cols) of the raster being tiled.
        patch_size: (height, width) of every window, in pixels.
        stride: (row step, column step) between consecutive window origins.

    Returns:
        A list of ``Window(col_off, row_off, width, height)`` in row-major
        order. Windows whose origin lies near the right or bottom edge may
        extend past the raster.
    """
    patch_size = np.array(patch_size)
    stride = np.array(stride)
    img_dim = np.array(img_dim)

    # To take the edges into account, grow the raster by one stride along each
    # axis. The column axis must use the column stride (stride[1]); using the
    # row stride for both axes drops edge columns when the strides differ.
    padded_dim = np.array([img_dim[0] + stride[0], img_dim[1] + stride[1]])

    # Largest top-left corner (row, col) a window is allowed to start at.
    max_dim = (padded_dim // patch_size) * patch_size - patch_size

    ys = np.arange(0, img_dim[0], stride[0])
    xs = np.arange(0, img_dim[1], stride[1])

    # Every (row, col) origin, rows varying slowest.
    tlc = np.array(np.meshgrid(ys, xs)).T.reshape(-1, 2)
    inside = (tlc[:, 0] <= max_dim[0]) & (tlc[:, 1] <= max_dim[1])
    tlc = tlc[inside]

    return [Window(x, y, patch_size[1], patch_size[0]) for y, x in tlc.astype(int)]

In [5]:
def add_ndvi(data, dtype_1=rio.float32):
    """Append a rescaled NDVI band to a band-first raster array.

    Args:
        data: array indexed as ``data[band]``; index 3 is treated as the NIR
            band and index 2 as the red band.
        dtype_1: floating dtype used for the intermediate NDVI computation.

    Returns:
        ``data`` with one extra band appended on axis 0, cast to uint16. The
        extra band is ``(ndvi + 1) * (2**15 - 1)``.
    """
    nir = data[3].astype(dtype_1)
    red = data[2].astype(dtype_1)

    # Pixels where nir + red == 0 divide by zero. Silence those warnings only
    # for this computation instead of changing numpy's global error state
    # (np.seterr) for the rest of the kernel session.
    with np.errstate(divide='ignore', invalid='ignore'):
        # Calculate NDVI
        ndvi = ((nir - red) / (nir + red)).astype(dtype_1)

        # Rescale for use in 16-bit output
        ndvi = (ndvi + 1) * (2**15 - 1)

        # Add the NDVI band to the end of the array
        rast = np.concatenate((data, [ndvi]), axis=0)

        # The cast stays inside the errstate block so non-finite NDVI values
        # do not raise cast warnings.
        rast = rast.astype(rio.uint16)

    return rast


# Download Model Files

In [3]:
model_url = "s3://canopy-production-ml/inference/model_files/model-best.h5"
weights_url = "s3://canopy-production-ml/inference/model_files/model_weights_best.h5"

download_model(model_url,weights_url)

In [6]:
model = load_model("model.h5","model_weights.h5") 

In [7]:
label_list = ["Industrial_agriculture","ISL","Mining","Roads","Shifting_cultivation"]

In [53]:
def output_windows(granule_dir,patch_size=100,
                   stride=100,SAVE=False,SAVE_INDIVIDUAL=False,
                   bands=[2, 3, 4, 8, 11, 12], 
                  model=model,
                   predict_thresh=.5,
                  label_list=label_list, 
                  job_name="test_inference_unwarped", 
                  output_filename="./inference_output/result.json"):
    """Tile a granule into windows, run the model on each and collect labels.

    Each ``*.tif`` in ``granule_dir`` is opened through a ``WarpedVRT``
    (EPSG:3257, nearest resampling), split with ``get_windows`` and every
    window is read, extended with NDVI, padded to ``patch_size`` if it is an
    edge window, and passed to ``model.predict``.

    Args:
        granule_dir: directory searched for ``*.tif`` granules.
        patch_size: window height and width in pixels.
        stride: step between window origins in pixels.
        SAVE, SAVE_INDIVIDUAL: unused; kept for call compatibility.
        bands: band indexes read from each granule (never mutated here).
        model: object exposing ``predict``.
        predict_thresh: score above which a label counts as predicted.
        label_list: label names, indexed by model output position.
        job_name, output_filename: forwarded to ``save_to_s3``.

    Returns:
        dict mapping granule id to a list of
        ``{"window_id", "polygon_coords", "labels"}`` entries.
    """
    granule_list = glob(f'{granule_dir}/*.tif')

    output_dict = {}

    timestamp = gen_timestamp()

    # NOTE(review): only the first granule found is processed ([0:1]) while
    # the progress message reports len(granule_list) - confirm this limit is
    # intentional before running on a directory with several granules.
    for j, granule_path in enumerate(granule_list[0:1]):

        granule_id = granule_path.split("/")[-1].split("_")[0]

        with rio.open(granule_path) as src:

            with WarpedVRT(src, crs='EPSG:3257', resampling=Resampling.nearest) as vrt:

                windows = get_windows(vrt.shape, (patch_size, patch_size), (stride, stride))

                for i, window in enumerate(windows):

                    print(f"predicting window {i + 1} of {len(windows)} of granulate {j + 1} of {len(granule_list)}",end='\r', flush=True)

                    window_id = i + 1

                    data = vrt.read(bands, window=window, masked=True)
                    data = add_ndvi(data)

                    shape = data.shape
                    new_shape = (shape[0], patch_size, patch_size)

                    if shape != new_shape:
                        # Edge window: zero-pad to the full patch size. Keep
                        # the chip's dtype so padded chips reach the model in
                        # the same form as full-size ones.
                        filled_array = np.full(new_shape, 0, dtype=data.dtype)
                        filled_array[:shape[0], :shape[1], :shape[2]] = data
                        data = filled_array
                        # Shrink the window to the pixels actually read so the
                        # footprint below matches real data.
                        window = Window(window.col_off, window.row_off, shape[2], shape[1])

                    # image pre-processing / inference
                    prediction = model.predict(read_image_tf_out(data))
                    prediction = np.where(prediction > predict_thresh, 1, 0)
                    # Separate loop name so the window index `i` is not clobbered.
                    label_name_list = [label_list[label_idx]
                                       for label_idx in np.where(prediction == 1)[1]]

                    # vectorizing raster bounds for visualization
                    window_bounds = rio.windows.bounds(window, vrt.transform, height=patch_size, width=patch_size)
                    geom_coords = list(box(*window_bounds).exterior.coords)

                    output_dict.setdefault(granule_id, []).append(
                        {"window_id":window_id,"polygon_coords":geom_coords,"labels":label_name_list})

            # Persist everything collected so far once the granule is done.
            save_to_s3(output_dict,output_filename,job_name,timestamp)

    return output_dict

In [54]:
# granule_dir = "./efs_inference_data/"
granule_dir = "/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Inference/granule_test(unwarped)"

output_dict = output_windows(granule_dir,output_filename="./inference_output/result-warped.json")

predicting window 1849 of 1849 of granulate 1 of 1

In [43]:
output_dict

{'1241': [{'window_id': 1,
   'polygon_coords': [(-11169700.787427548, -4193553.7119406024),
    (-11169700.787427548, -4191764.5839900365),
    (-11171489.915378114, -4191764.5839900365),
    (-11171489.915378114, -4193553.7119406024),
    (-11169700.787427548, -4193553.7119406024)],
   'labels': []},
  {'window_id': 2,
   'polygon_coords': [(-11167911.659476982, -4193553.7119406024),
    (-11167911.659476982, -4191764.5839900365),
    (-11169700.787427548, -4191764.5839900365),
    (-11169700.787427548, -4193553.7119406024),
    (-11167911.659476982, -4193553.7119406024)],
   'labels': []},
  {'window_id': 3,
   'polygon_coords': [(-11166122.531526417, -4193553.7119406024),
    (-11166122.531526417, -4191764.5839900365),
    (-11167911.659476982, -4191764.5839900365),
    (-11167911.659476982, -4193553.7119406024),
    (-11166122.531526417, -4193553.7119406024)],
   'labels': []},
  {'window_id': 4,
   'polygon_coords': [(-11164333.40357585, -4193553.7119406024),
    (-11164333.40357

In [166]:
# Tally how many windows carry each predicted label, plus how many carry none.
data = output_dict

granule_list = data.keys()
granule_count = len(granule_list)
label_match_results = []
count = {"granule_count": granule_count}

for k1 in list(data):
    for chip in data[k1]:
        chip_labels = chip['labels']
        if len(chip_labels) == 0:
            # Windows with no label above the threshold.
            count["null_chips"] = count.get("null_chips", 0) + 1
        for label in chip_labels:
            count[label] = count.get(label, 0) + 1
        

In [167]:
count

{'granule_count': 1,
 'null_chips': 1512,
 'Shifting_cultivation': 336,
 'ISL': 82,
 'Roads': 5,
 'Industrial_agriculture': 1}

In [None]:
for i in range (len(output_dict['101'])):
    print(output_dict['101'][i]['labels'])

In [None]:
new_gdf.shape

In [None]:
gdf.plot()

In [None]:
new_gdf.to_file("./inference_output/test.geojson", driver='GeoJSON')

In [None]:
gdf.to_file("./inference_output/test.geojson", driver='GeoJSON')

# Read Output Files

In [2]:
def process_output_files(json_path=None,download=False, filepath = "predict_test-2021-05-10-22-38-41.json", label_match="ISL"):
    """Summarise an inference result JSON file.

    Args:
        json_path: ``s3://bucket/key`` URL of the result file; only used when
            ``download`` is True.
        download: when True, fetch ``json_path`` into the working directory
            (named after the key's last component) and read that copy
            instead of ``filepath``.
        filepath: local JSON file to read when ``download`` is False.
        label_match: label whose chips are collected in the second return value.

    Returns:
        ``(count, label_match_results, granule_list, data)`` where ``count``
        maps ``"granule_count"``, ``"null_chips"`` and each label to a tally,
        ``label_match_results`` is a list of ``[granule_id, chip_entry]``,
        ``granule_list`` is ``data.keys()`` and ``data`` is the parsed JSON.

    Raises:
        ValueError: ``download`` is True but ``json_path`` was not given.
        KeyError: a chip entry has neither ``predicted_labels`` nor ``labels``.
    """
    if download:
        if not json_path:
            raise ValueError("json_path is required when download=True")

        url_parts = json_path.split("/")
        bucket = url_parts[2]
        object_key = "/".join(url_parts[3:])
        filename = url_parts[-1]
        # The S3 resource is only created when a download is requested, so
        # reading a local file needs no AWS configuration.
        boto3.resource('s3').Bucket(bucket).download_file(object_key, filename)
        filepath = filename

    with open(filepath) as jsonfile:
        data = json.load(jsonfile)

    granule_list = data.keys()
    count = {"granule_count": len(granule_list)}
    label_match_results = []

    for k1 in list(data.keys()):
        for entry in data[k1]:
            # output_predictions() writes "predicted_labels"; output_windows()
            # writes "labels". Accept either schema.
            labels = entry["predicted_labels"] if "predicted_labels" in entry else entry["labels"]
            if len(labels) == 0:
                count["null_chips"] = count.get("null_chips", 0) + 1
            for label in labels:
                if label == label_match:
                    label_match_results.append([k1, entry])
                count[label] = count.get(label, 0) + 1

    return count, label_match_results, granule_list, data

In [3]:
json_path = "s3://canopy-production-ml/inference/output/predict_3257-2021-05-15-22-18-23.json"

count, match_results, granule_list, data = process_output_files(download=False,
                                                                json_path=json_path,
                                                                filepath="/Users/user/Downloads/inference_output_test-2021-06-25-00-34-24.json")



In [54]:
count

{'granule_count': 97,
 'Shifting_cultivation': 5077,
 'null_chips': 117166,
 'ISL': 3704,
 'Roads': 806,
 'Industrial_agriculture': 1207,
 'Mining': 41}

In [55]:
sum(count.values()) - 97

128001

In [72]:
# Turn the stored corner coordinates of every matched chip into a shapely Polygon.
polygon_list = [Polygon(result[1]["polygon_coords"]) for result in match_results]

In [73]:
gdf = gpd.GeoDataFrame({"geometry":polygon_list})

In [74]:
gdf = gdf.set_crs(epsg=3257)
        
gdf = gdf.to_crs(epsg=4326)

In [75]:
gdf

Unnamed: 0,geometry
0,"POLYGON ((21.06129 -6.46611, 21.06975 -6.46911..."
1,"POLYGON ((21.00473 -6.59849, 21.01320 -6.60151..."
2,"POLYGON ((20.94185 -6.61425, 20.95032 -6.61727..."
3,"POLYGON ((20.90867 -6.44029, 20.91711 -6.44331..."
4,"POLYGON ((20.88146 -6.51632, 20.88991 -6.51935..."
5,"POLYGON ((20.83208 -6.44141, 20.84052 -6.44444..."


In [76]:
gdf.to_file("./inference_output/test_warped.geojson", driver='GeoJSON')

In [173]:
data['test.tif'][0]["polygon_coords"]

[[-11169700.787427548, -4193553.7119406024],
 [-11169700.787427548, -4191764.5839900365],
 [-11171489.915378114, -4191764.5839900365],
 [-11171489.915378114, -4193553.7119406024],
 [-11169700.787427548, -4193553.7119406024]]

# Get List of Non-Processed Granules

In [44]:
def s3_dir_ls(s3_dir_url):
    """List the objects stored under an S3 "directory" URL.

    Args:
        s3_dir_url: URL of the form ``s3://bucket/some/prefix/``.

    Returns:
        list of ``s3://bucket/key`` URLs for every object under the prefix,
        excluding zero-content "folder" placeholder keys (keys ending in "/").
    """
    url_parts = s3_dir_url.split("/")
    bucket = url_parts[2]
    # Use the whole path as the prefix (not just its first two components) so
    # nested directories list correctly. Surrounding slashes are stripped,
    # which yields the same prefix as before for two-level URLs.
    prefix = "/".join(url_parts[3:]).strip("/")

    s3 = boto3.resource('s3')
    my_bucket = s3.Bucket(bucket)

    # Skip folder placeholder keys explicitly rather than dropping whichever
    # object happens to be listed first.
    return [
        "s3://" + bucket + "/" + obj.key
        for obj in my_bucket.objects.filter(Prefix=prefix)
        if not obj.key.endswith("/")
    ]

s3_dir_url = "s3://canopy-production-ml/full_congo_basin/02.17.21_CB_GEE_Pull/"

all_granules = s3_dir_ls(s3_dir_url)


In [45]:
granule_ids_completed = list(data.keys())

In [46]:
def get_granule_paths(granule_ids_completed,all_granules):
    """Return the granule paths whose id is not among the completed ids.

    The granule id is the part of the file name before the first underscore.

    Args:
        granule_ids_completed: iterable of granule id strings already processed.
        all_granules: iterable of granule paths/URLs using "/" separators.

    Returns:
        list of paths from ``all_granules`` (original order) still to process.
    """
    # Set lookup keeps this linear in the number of paths.
    completed = set(granule_ids_completed)
    return [
        path for path in all_granules
        if path.split("/")[-1].split("_")[0] not in completed
    ]
            
    

In [47]:
incomplete_gran_paths = get_granule_paths(granule_ids_completed,all_granules)

In [48]:
# Persist the still-unprocessed granule paths, one per line.
with open('incomplete_granules_3257.txt', 'w') as filehandle:
    filehandle.writelines('%s\n' % listitem for listitem in incomplete_gran_paths)

# Output Vectorized Predicted Granules

In [None]:
def s3_dir_match(s3_dir_url,granule_list):
    """Build a GeoDataFrame of bounding boxes for selected granules on S3.

    Every object under ``s3_dir_url`` whose granule id (file name up to the
    first underscore) is in ``granule_list`` is opened with rasterio and its
    bounds are turned into a box geometry.

    Args:
        s3_dir_url: URL of the form ``s3://bucket/some/prefix/``.
        granule_list: iterable of granule id strings to keep.

    Returns:
        GeoDataFrame with ``geometry`` and ``granule_id`` columns. No CRS is
        assigned; bounds are in each raster's own CRS.
    """
    url_parts = s3_dir_url.split("/")
    bucket = url_parts[2]
    # Use the whole path as the prefix (not just its first two components);
    # surrounding slashes are stripped, which yields the same prefix as
    # before for two-level URLs.
    prefix = "/".join(url_parts[3:]).strip("/")

    wanted_ids = set(granule_list)

    s3 = boto3.resource('s3')
    my_bucket = s3.Bucket(bucket)

    window_geom_list = []
    granule_id_list = []
    for obj in my_bucket.objects.filter(Prefix=prefix):
        granule_id = obj.key.split("/")[-1].split("_")[0]
        if granule_id not in wanted_ids:
            continue
        obj_url = "s3://" + bucket + "/" + obj.key
        with rio.open(obj_url) as src:
            window_geom_list.append(box(*src.bounds))
        granule_id_list.append(granule_id)

    return gpd.GeoDataFrame({"geometry":window_geom_list,"granule_id":granule_id_list})

In [None]:
gdf = s3_dir_match("s3://canopy-production-ml/full_congo_basin/02.17.21_CB_GEE_Pull/",granule_list)

In [None]:
gdf

In [None]:
gdf.to_file("granules.json", driver="GeoJSON", index=True)

# Create and Export GDF of Original Labels Data

In [None]:
# Source CSV of the original label polygons.
FILE_NAME = "/Users/purgatorid/Downloads/polygons_021521.csv"

df = pd.read_csv(FILE_NAME)

# Use the authority-string CRS form; the {'init': 'epsg:4326'} dict syntax is
# deprecated.
# NOTE(review): the geometry column is only attached in a later cell - confirm
# the installed geopandas accepts a CRS before a geometry column exists.
gdf = gpd.GeoDataFrame(df, crs="EPSG:4326")

In [None]:
# Each "polygon" cell is JSON text; its first "coordinates" ring becomes a Polygon.
polygons = [Polygon(json.loads(polygon)["coordinates"][0]) for polygon in df["polygon"]]

In [None]:
gdf["geometry"] = polygons

In [None]:
gdf.loc[90]

In [None]:
gdf.to_file("output.json", driver="GeoJSON", index=True)

# Load and Reproject One Granule Containing ISL

In [None]:
def convert_raster(input_file, dest_dir, epsg_format='EPSG:3257', windows=False):
    """Reproject one raster with ``gdal.Warp`` and write it to ``dest_dir``.

    The output is always named ``test.tif`` and is written as a GeoTIFF using
    nearest-neighbour resampling.

    Args:
        input_file: path of the raster to reproject.
        dest_dir: directory receiving ``test.tif`` (with or without a
            trailing separator).
        epsg_format: target spatial reference passed to GDAL as ``dstSRS``.
        windows: unused; kept for call compatibility.

    NOTE(review): ``gdal`` is referenced here but its import is commented out
    in the notebook's import cell - it must be in scope before calling this.
    """
    print(input_file)

    filename = "test.tif"

    # os.path.join keeps the path valid when dest_dir has no trailing slash.
    output_filepath = os.path.join(dest_dir, filename)
    print(output_filepath)

    converted = gdal.Warp(output_filepath, [input_file],format='GTiff',
                          dstSRS=epsg_format, resampleAlg='near')
    # Drop the reference to the returned dataset (presumably so GDAL
    # closes/flushes the output file - confirm).
    converted = None

    print('Finished')

In [None]:
granule = "/Users/purgatorid/Downloads/1241_full_congo_export_v12_all_bands_Feb_11_12_44_53_2021.tif"
dest_dir = "/Users/purgatorid/Downloads/"

convert_raster(granule,dest_dir)

# Visualize Results (Incomplete Code)

In [None]:
def visualize_results(match_results,s3_url):
    """Incomplete placeholder: walks the matches and reads each granule id.

    Each element of ``match_results`` is indexed at position 0 to obtain the
    granule id; nothing is drawn or returned yet and ``s3_url`` is unused.
    """
    for match in match_results:
        granule_id = match[0]
        

In [None]:
t = {1,2,4}

# Running Without Windows Code - Direct Chip Predict 

In [108]:
model = load_model("model.h5","model_weights.h5") 

In [109]:
label_list = ["Industrial_agriculture","ISL","Mining","Roads","Shifting_cultivation"]

In [110]:
def output_predictions(granule_dir=None,patch_size=100,
                   stride=100,SAVE=False,SAVE_INDIVIDUAL=False,
                   bands=[2, 3, 4, 8, 11, 12], 
                  model=model,
                   predict_thresh=.5,
                  label_list=label_list, 
                  job_name="test_inference", 
                  output_filename="./inference_output/result.json", 
                      apply_windows=False, 
                      read_process="read_img_tf_out", 
                      sample_frac=1, 
                      granule_list=None):
    """Run the model directly on pre-cut chips and collect predicted labels.

    Args:
        granule_dir: directory searched for ``*.tif`` chips when
            ``granule_list`` is not given.
        patch_size: chip height/width used when padding (``apply_windows``).
        stride, SAVE, SAVE_INDIVIDUAL: unused; kept for call compatibility.
        bands: band indexes read from each chip (never mutated here).
        model: object exposing ``predict``.
        predict_thresh: score above which a label counts as predicted.
        label_list: label names, indexed by model output position.
        job_name, output_filename: forwarded to ``save_to_s3``.
        apply_windows: when True, zero-pad chips smaller than ``patch_size``.
        read_process: ``"read_img_tf_out"`` selects ``read_image_tf_out``;
            anything else selects ``read_image``.
        sample_frac: only the first ``len(list) // sample_frac`` chips are run.
        granule_list: explicit list of chip paths/URLs.

    Returns:
        ``(output_dict, missed_chips)``: ``output_dict`` maps granule id to a
        list of ``{"polygon_coords", "predicted_labels", "actual_label"}``
        entries; ``missed_chips`` lists the paths whose prediction did not
        include label index 1.
    """
    if granule_list is None:
        granule_list = glob(f'{granule_dir}/*.tif')

    end = len(granule_list) // sample_frac
    granule_list = granule_list[0:end]

    output_dict = {}
    missed_chips = []
    timestamp = gen_timestamp()

    # Index of the last chip whose results were included in a save.
    last_saved = None

    for j, granule_path in enumerate(granule_list):

        label_name_list = []

        path_parts = granule_path.split("/")
        filepath = path_parts[-1]
        granule_id = filepath.split("_")[0]
        # NOTE(review): the ground-truth label is taken from the third path
        # component; for "s3://bucket/..." URLs that component is the bucket
        # name - confirm the expected path layout.
        ground_label = path_parts[2]

        print(f'Running inference on chip {j+1} of {len(granule_list)}',end='\r', flush=True)

        # Skip directory-like entries (paths ending in "/").
        if filepath:

            with rio.open(granule_path) as src:

                data = src.read(bands, masked=True)
                data = add_ndvi(data)

                shape = data.shape

                if apply_windows:
                    new_shape = (shape[0], patch_size, patch_size)
                    if shape != new_shape:
                        # Zero-pad undersized chips, keeping the chip dtype so
                        # padded chips reach the model like full-size ones.
                        # (The previous re-creation of an undefined `window`
                        # here raised NameError and has been removed.)
                        filled_array = np.full(new_shape, 0, dtype=data.dtype)
                        filled_array[:shape[0], :shape[1], :shape[2]] = data
                        data = filled_array

                # image pre-processing / inference
                if read_process == "read_img_tf_out":
                    read_func = read_image_tf_out
                else:
                    read_func = read_image

                prediction = model.predict(read_func(data))
                prediction = np.where(prediction > predict_thresh, 1, 0)
                prediction_i = np.where(prediction == 1)[1]

                # Label index 1 ("ISL" in the default label list) is the
                # class tracked as "missed".
                if 1 not in prediction_i:
                    missed_chips.append(granule_path)

                for label_idx in prediction_i:
                    label_name_list.append(label_list[label_idx])

                # vectorizing raster bounds for visualization
                geom_coords = list(box(*src.bounds).exterior.coords)

                output_dict.setdefault(granule_id, []).append(
                    {"polygon_coords":geom_coords,"predicted_labels":label_name_list,"actual_label":ground_label})

        # Periodic checkpoint.
        if j % 1000 == 0:
            save_to_s3(output_dict,output_filename,job_name,timestamp)
            last_saved = j

    # Final save: without it, every chip processed after the last checkpoint
    # is returned to the caller but never persisted.
    if len(granule_list) > 0 and last_saved != len(granule_list) - 1:
        save_to_s3(output_dict,output_filename,job_name,timestamp)

    return output_dict,missed_chips

In [161]:
# granule_dir = "./efs_inference_data/"
granule_dir_local = "/Volumes/Lacie/zhenyadata/Project_Canopy_Data/PC_Data/Sentinel_Data/Chips/misha_polygons_cloudfreemerge/yes/ISL/100/91/"
# granule_dir_efs = 

output_dict,missed_chips = output_predictions(granule_dir_local)

running inference on 75 chips
running inference on 74 of 75

In [None]:
df = pd.DataFrame({"file_path":missed_chips})

In [None]:
df.to_csv("missed_chips.csv",index=False)

In [162]:
# Tally how many chips carry each predicted label, plus how many carry none.
data = output_dict

count = {}
label_match_results = []
granule_count = len(data.keys())
granule_list = data.keys()
count["granule_count"] = granule_count
for k1 in list(data.keys()):
    for chip in data[k1]:
        # output_predictions() stores its labels under "predicted_labels"
        # (output_windows() uses "labels"); reading only "labels" raised
        # KeyError on the dict produced by the cell above.
        chip_labels = chip["predicted_labels"] if "predicted_labels" in chip else chip["labels"]
        if len(chip_labels) == 0:
            count["null_chips"] = count.get("null_chips", 0) + 1
        for label in chip_labels:
            count[label] = count.get(label, 0) + 1
        

In [163]:
count

{'granule_count': 1,
 'ISL': 43,
 'Shifting_cultivation': 24,
 'Roads': 3,
 'null_chips': 3,
 'Industrial_agriculture': 2}

#### Opening Chips from S3

In [17]:
bucket = 'canopy-production-ml'

def s3_actions(bucket, in_path=None, out_path=None, copy_list=None, copy_dir=False, delete=False, stop_at=None, chip_list=False, chip_count=False):
    """List, count, delete or copy objects in an S3 bucket.

    Args:
        bucket: bucket name.
        in_path: key prefix to list from / copy from.
        out_path: key prefix to copy to.
        copy_list: key suffixes; each ``in_path + suffix`` is copied to
            ``out_path + suffix`` within the same bucket.
        copy_dir: triggers the listing only; no directory copy is performed.
        delete: delete every listed object.
        stop_at: stop listing after this many objects (None = no limit).
        chip_list: return the listed keys.
        chip_count: return the number of listed objects.

    Returns:
        list of keys if ``chip_list``; else the object count if
        ``chip_count``; otherwise None. With ``chip_list`` or ``chip_count``
        set, the function returns before any ``copy_list`` work.
    """
    s3 = boto3.resource('s3')
    my_bucket = s3.Bucket(bucket)

    # Keep the listed objects themselves (not just their keys): deletion
    # needs the object's delete() method, which a key string does not have.
    listed = []

    if copy_dir or delete or chip_list or chip_count:
        for obj in my_bucket.objects.filter(Prefix=in_path):
            listed.append(obj)
            if len(listed) == stop_at:
                break

    total_files = len(listed)

    if delete:
        for index, obj in enumerate(listed, 1):
            print(f"deleting {index} of {total_files}", end='\r', flush=True)
            obj.delete()

    if chip_list:
        return [obj.key for obj in listed]

    if chip_count:
        return total_files

    if copy_list:
        total_copy_list = len(copy_list)

        for index, suffix in enumerate(copy_list, 1):

            print(f"copying {index} of {total_copy_list}", end='\r', flush=True)
            CopySource = {
                'Bucket': bucket,
                'Key': in_path + suffix}
            my_bucket.copy(CopySource, out_path + suffix)

In [18]:
path = 'chips/cloudfree-merge-polygons/dataset_v2/'

full_chip_list = s3_actions(bucket,in_path=path,chip_list=True)

In [42]:
prepend = "s3://canopy-production-ml/"

full_chip_list = [prepend + i for i in full_chip_list]

In [49]:
# Drop CSV entries from the chip list. The filtered list is built first and
# then assigned in place: calling remove() while iterating over the same list
# skips the element that follows each removal.
full_chip_list[:] = [x for x in full_chip_list if ".csv" not in x]

In [50]:
for x in full_chip_list:
    if ".csv" in x:
        print(x)

In [51]:
# s3://canopy-production-ml/chips/cloudfree-merge-polygons/dataset_v2/

# rio.open(f"s3://canopy-production-ml/{full_chip_list[1]}")

#### Running Inference on All Labeled Data

In [None]:
# chips_dir_list = glob("./efs/*/100/*/*.tif")

In [None]:
output_dict,missed_chips = output_predictions(granule_list=full_chip_list)

Running inference on chip 15042 of 128989

# Histogram for Numpy Array

In [None]:
data1.shape

In [None]:
def np_hist(arr,tensor=True):
    """Plot one histogram per band of an array on a shared figure.

    Args:
        arr: band-first array when ``tensor`` is False. When ``tensor`` is
            True, ``arr`` is converted to a numpy array, its first element is
            taken and the axes are reversed with ``(2, 1, 0)`` so that the
            last axis becomes the band axis.
        tensor: whether to apply the conversion described above.
    """
    bands = np.transpose(np.array(arr)[0], (2, 1, 0)) if tensor else arr

    for band_index in range(bands.shape[0]):
        # One histogram per band, labelled with the band's position.
        plt.hist(bands[band_index].flatten(), label=str(band_index))

    plt.legend(prop={'size': 10})
    plt.show()
    

In [None]:
np_hist(data)

In [None]:
np_hist(data1,tensor=False)

# Sandbox

In [15]:
tot_len = 0 
for key in data.keys():
    tot_len += len(data[key])

In [16]:
tot_len

128001

In [56]:
len(full_chip_list)

129004

In [68]:
full_chip_list[1]

'chips/cloudfree-merge-polygons/dataset_v2/ISL/100/1/1_1000_1000.tif'

In [26]:
data["79"]

[{'polygon_coords': [[-11797353.520952346, -3216158.2367175575],
   [-11797353.520952346, -3214318.542869329],
   [-11799193.214800574, -3214318.542869329],
   [-11799193.214800574, -3216158.2367175575],
   [-11797353.520952346, -3216158.2367175575]],
  'predicted_labels': ['Shifting_cultivation'],
  'actual_label': 'ISL'},
 {'polygon_coords': [[-11812071.071738174, -3214318.542869329],
   [-11812071.071738174, -3212478.8490211004],
   [-11813910.765586402, -3212478.8490211004],
   [-11813910.765586402, -3214318.542869329],
   [-11812071.071738174, -3214318.542869329]],
  'predicted_labels': [],
  'actual_label': 'ISL'},
 {'polygon_coords': [[-11795513.827104116, -3214318.542869329],
   [-11795513.827104116, -3212478.8490211004],
   [-11797353.520952344, -3212478.8490211004],
   [-11797353.520952344, -3214318.542869329],
   [-11795513.827104116, -3214318.542869329]],
  'predicted_labels': [],
  'actual_label': 'ISL'},
 {'polygon_coords': [[-11813910.765586402, -3199600.9920834997],
   

In [104]:
def missing_chips(full_chip_list,data):
    """Find the chips whose (granule id, label) group has no results yet.

    A chip key is split on "/"; component 5 is the granule id and component 3
    is the label. A group counts as completed when ``data[granule_id]`` holds
    an entry whose ``"actual_label"`` equals that label.

    Args:
        full_chip_list: chip keys; the first element is skipped, and entries
            that are not strings or have fewer than six "/" components are
            ignored.
        data: dict mapping granule id to a list of result entries, each with
            an ``"actual_label"`` key.

    Returns:
        ``(chips_to_process, missing_ids)``: the chip keys belonging to
        missing groups (list order, without duplicates) and the missing
        ``(granule_id, label)`` tuples in first-seen order.
    """
    completed_ids = {
        (granule_id, entry["actual_label"])
        for granule_id, entries in data.items()
        for entry in entries
    }

    def _chip_id(chip):
        # Malformed entries are skipped explicitly instead of relying on a
        # bare `except` that would also hide unrelated errors.
        if not isinstance(chip, str):
            return None
        parts = chip.split("/")
        if len(parts) < 6:
            return None
        return (parts[5], parts[3])

    missing_ids = []
    seen_ids = set()
    chips_to_process = []
    seen_chips = set()

    # Single pass with set lookups instead of rescanning the chip list once
    # per missing group.
    for chip in full_chip_list[1:]:
        chip_id = _chip_id(chip)
        if chip_id is None or chip_id in completed_ids:
            continue
        if chip_id not in seen_ids:
            seen_ids.add(chip_id)
            missing_ids.append(chip_id)
        # De-duplicate on the chip key itself; the earlier check compared the
        # id tuple against the list of keys and so never matched.
        if chip not in seen_chips:
            seen_chips.add(chip)
            chips_to_process.append(chip)

    return chips_to_process, missing_ids

In [105]:
chips_to_process,missing_chips = missing_chips(full_chip_list,data)

In [106]:
missing_chips

[('52', 'Shifting_cultivation'),
 ('92', 'Shifting_cultivation'),
 ('87', 'Shifting_cultivation')]

In [112]:
prepend = "s3://canopy-production-ml/"

missing_chips_list = [prepend + i for i in chips_to_process]

In [113]:
output_dict,missed_chips = output_predictions(granule_list=missing_chips_list)

Running inference on chip 835 of 835