In [1]:
import pandas as pd
import datetime 
import time
import numpy as np
from matplotlib import pyplot as plt, rcParams
from datetime import datetime as dt

import brtdevkit
from brtdevkit.core.db import DBConnector, DatetimeFilter
from brtdevkit.core.db.db_filters import *  # We need this for pre-defined filters, e.g., ProjectFilter, DatetimeFilter

# Function to query Aletheia for an image DF

def get_shasta_data(filters={}, start=None, end=None, limit=None):
    """
    Query Shasta metadata matching filters and date.
    """
    start_time = time.time()
    connector = DBConnector()
    img_filters = {'project_name': 'shasta', **filters}
    if start is not None or end is not None:
        img_filters = [img_filters, DatetimeFilter(key="collected_on", start=start, end=end)]
    df = connector.get_documents_df('image', img_filters, limit=limit)
    elapsed_time = time.time() - start_time
    return df, elapsed_time

In [2]:
dcms_2021 = ['DCM11', 'DCM12', 'DCM13', 'DCM14','DCM16', 'DCM17','DCM18', 'DCM19', 'DCM20', 'DCM21','DCM22','DCM23','DCM24','DCM25', 'DCM26', 'DCM27', 'DCM28']
dbs_2021 = ['DB1', 'DB2', 'DB3', 'DB4', 'DB5', 'DB6', 'DB7', 'DB8', 'DB9', 'db1', 'db2', 'db3', 'db4', 'db5', 'db6', 'db7', 'db8', 'db9']
# Set start date for query
start = dt(2021, 3 , 29)
end = dt(2021, 7, 25)

# Select filters
filters = { "artifacts.kind": "nrg",  
           'has_human_annotation' : True,
           'robot_name' : {"$in": dcms_2021}
          }

full_df, elapsed_time = get_shasta_data(filters=filters, start = start, end = end)
full_df['date_collected'] = pd.to_datetime(full_df['collected_on']).dt.date
full_df['grower_farm_field'] = full_df['grower'] +'_' + full_df['farm'] + '_' + full_df['operating_field_name']
print(f"Queried {len(full_df)} images in {elapsed_time:.2f} s.")

Queried 50637 images in 112.14 s.


In [3]:
# Load image lists that have already been submitted

import os

cwd = os.getcwd()
if os.getcwd() != '/home/williamroberts/code/brtdevkit/Projects/Utilities/Image selection/2021_image_lists':
    os.chdir('/home/williamroberts/code/brtdevkit/Projects/Utilities/Image selection/2021_image_lists')
image_lists = os.listdir(os.getcwd())
    
master_list = pd.read_csv(image_lists[0])
print(len(master_list))
print(len(image_lists))

for csv in image_lists[1:]:
    if csv.endswith('.csv'):
        next_list = pd.read_csv(csv)
        master_list = pd.concat([master_list, next_list], axis=0)
        print(len(master_list))
        
print(len(master_list))
os.chdir(cwd)

250
17
5246
5465
9591
35322
57991
64788
79854
82852
85585
89893
98170
120307
124732
130554
132714
132714


In [15]:
master_list.groupby(['grower_farm_field', 'crop_name']).size().groupby('crop_name').count()

crop_name
CORN        363
COTTON      299
OTHER         1
SOYBEANS    357
dtype: int64

In [16]:
master_list.crop_name.value_counts()

CORN        47851
SOYBEANS    44536
COTTON      39988
OTHER         339
Name: crop_name, dtype: int64

In [7]:
master_list[master_list.grower_farm_field == "josh baile_bailey farms_Jill's"]

Unnamed: 0,_id,grower,farm,operating_field_name,crop_name,experiment_id,grower_farm_field,high_res_aligned
3202,60ba55764f859b25d118a486,,,,COTTON,Caprure 6,josh baile_bailey farms_Jill's,True
3205,60ba532331ba107f7353db82,,,,COTTON,Caprure 5,josh baile_bailey farms_Jill's,True
3224,60ba55f54f859b25d118a630,,,,COTTON,Caprure 6,josh baile_bailey farms_Jill's,True
3231,60ba4d1040a60519f21e47ff,,,,COTTON,Caprure 9,josh baile_bailey farms_Jill's,True
3254,60ba519b66388bdcb31d6f26,,,,COTTON,Caprure 5,josh baile_bailey farms_Jill's,True
...,...,...,...,...,...,...,...,...
4488,60ba57d99d67582bedb23a0e,,,,COTTON,Caprure 5,josh baile_bailey farms_Jill's,True
4491,60ba5dcba9914ac25933a5fb,,,,COTTON,Caprure 7,josh baile_bailey farms_Jill's,True
4499,60ba52c340a60519f21e5526,,,,COTTON,Caprure 5,josh baile_bailey farms_Jill's,True
4513,60ba618c8f94910a43ccb355,,,,COTTON,Caprure 5,josh baile_bailey farms_Jill's,True


In [4]:
sent_out = master_list
full_df.index.name = None # prevents the index and the _id column having the same label, which throws off the merge
annotated = master_list.merge(full_df[full_df['_id'].isin(master_list['_id'])], on='_id', how='left')[['_id', 'grower_farm_field_x','has_human_annotation', 'crop_name_x']]
print(f'{len(annotated)} images have been submitted to date.')
annotation_counts = annotated.groupby(['crop_name_x','has_human_annotation']).size()
annotation_counts

132714 images have been submitted to date.


crop_name_x  has_human_annotation
CORN         True                    14984
COTTON       True                    16745
OTHER        True                      339
SOYBEANS     True                    14062
dtype: int64

In [4]:
# Flatten Annotations and Examine States

def flatten_annotations(dataframe):
    r = pd.DataFrame(
        {
            col:np.repeat(dataframe[col].values, dataframe['annotations'].str.len())
            for col in dataframe.columns.drop('annotations')
        }).assign(**{'annotations':np.concatenate(dataframe['annotations'].values)})[dataframe.columns]
    w = pd.concat([
        r.drop(['annotations'], axis=1),
        r['annotations'].apply(pd.Series).add_suffix('_annotations')
    ],
    axis=1)
    # Return filtered dataframe with each annotation artifact as a single row. 
    # Filter out ndvi and machine labels 
    return w[(w['kind_annotations'] != 'machine') & (w['kind_annotations'] != 'ndvi_mask')]

flat = flatten_annotations(full_df)
print(f'There are {len(flat)} total annotations from {len(flat._id.unique())} unique image ids')

There are 166300 total annotations from 50637 unique image ids


In [5]:
flat.state_annotations.value_counts()

review      98265
ok          64905
relabel      2383
skipped       689
labeling       58
Name: state_annotations, dtype: int64

In [6]:
rev = flat[(flat['state_annotations']=='review') & (flat.next_version_annotations.isnull()==True)].drop_duplicates(subset='_id', keep='first').groupby(['crop_name']).size()
rel = flat[(flat['state_annotations']=='relabel') & (flat.next_version_annotations.isnull()==True)].drop_duplicates(subset='_id', keep='first').groupby(['crop_name']).size()
ok = flat[(flat['state_annotations']=='ok') & (flat.next_version_annotations.isnull()==True)].drop_duplicates(subset='_id', keep='first').groupby(['crop_name']).size()
print(rev)
print(rel)
print(ok)

status = pd.concat([rev,rel, ok], axis=1).reset_index()
status.columns = ['Crop','Review', 'Relabel', 'OK']
status


crop_name
CORN        158
COTTON      393
OTHER       232
SOYBEANS    705
dtype: int64
crop_name
CORN          3
COTTON      102
SOYBEANS    239
dtype: int64
crop_name
CORN        14947
COTTON      16214
OTHER         338
SOYBEANS    17383
dtype: int64


Unnamed: 0,Crop,Review,Relabel,OK
0,CORN,158,3.0,14947
1,COTTON,393,102.0,16214
2,OTHER,232,,338
3,SOYBEANS,705,239.0,17383


In [34]:
bs = list(ok._id_annotations.astype(str))
bs[0:5]

['60e2bcf96f61839e329d04c8',
 '60e5fb15d69532c5a4d631b3',
 '60df4511efe769da459cd2ea',
 '60df4511efe769da459cd2e2',
 '60e5e4e697a81202bacbeed9']

In [35]:
from brtdevkit.data import Dataset

# Enter a name and description
dataset_name= '070721_Soy_Boom_Shadow'
description = 'Soy dataset of 2021 images with boom shadows.'

Dataset.create(
    name= dataset_name,
    description = description,
    kind = Dataset.KIND_ANNOTATION,
    annotation_ids = bs
    )

{}

In [19]:
flat.state_annotations.value_counts()

review      54507
ok          24523
relabel      1026
skipped       252
labeling       45
Name: state_annotations, dtype: int64

In [4]:
flat = flat[flat.next_version_annotations.isnull()==True]
flat = flat.sort_values('created_at_annotations', ascending=False)
flat = flat.drop_duplicates(subset='_id', keep='last')  
flat.state_annotations.value_counts()

ok          24028
review       8504
relabel       406
skipped       230
labeling       14
Name: state_annotations, dtype: int64

In [7]:
rev = flat[(flat['state_annotations']=='review') & (flat.next_version_annotations.isnull()==True)].drop_duplicates(subset='_id', keep='first').groupby(['crop_name']).size()
rel = flat[(flat['state_annotations']=='relabel') & (flat.next_version_annotations.isnull()==True)].drop_duplicates(subset='_id', keep='first').groupby(['crop_name']).size()
ok = flat[(flat['state_annotations']=='ok') & (flat.next_version_annotations.isnull()==True)].drop_duplicates(subset='_id', keep='first').groupby(['crop_name']).size()

print(rev)
print(rel)
print(ok)

status = pd.concat([rev,rel, ok], axis=1).reset_index()
status.columns = ['Crop','Review', 'Relabel', 'OK']
status

crop_name
CORN        158
COTTON      393
OTHER       232
SOYBEANS    705
dtype: int64
crop_name
CORN          3
COTTON      102
SOYBEANS    239
dtype: int64
crop_name
CORN        14947
COTTON      16214
OTHER         338
SOYBEANS    17383
dtype: int64


Unnamed: 0,Crop,Review,Relabel,OK
0,CORN,158,3.0,14947
1,COTTON,393,102.0,16214
2,OTHER,232,,338
3,SOYBEANS,705,239.0,17383
