In [1]:
import time
from datetime import datetime
from warnings import filterwarnings

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt, rcParams

import brtdevkit
from brtdevkit.core.db import DBConnector, DatetimeFilter
from brtdevkit.core.db.db_filters import *  # We need this for pre-defined filters, e.g., ProjectFilter, DatetimeFilter

filterwarnings("ignore")

def get_shasta_data(filters=None, start=None, end=None, limit=None):
    """
    Query Shasta image documents and time how long the query takes.

    Args:
        filters: Optional dict of extra query fields. It is merged on top of
            {'project_name': 'shasta'}, so a 'project_name' key in `filters`
            overrides the default. None (the default) means no extra fields.
        start: Optional lower bound handed to DatetimeFilter on "collected_on".
        end: Optional upper bound handed to DatetimeFilter on "collected_on".
            The datetime filter is only added when `start` or `end` is given.
        limit: Passed through to DBConnector.get_documents_df.

    Returns:
        A (df, elapsed_time) tuple: the value returned by get_documents_df for
        the 'image' collection, and the elapsed wall time in seconds.
    """
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments while the query is running.
    query_started = time.perf_counter()
    connector = DBConnector()
    # `filters or {}` avoids sharing a mutable default between calls and lets
    # callers pass None explicitly.
    img_filters = {'project_name': 'shasta', **(filters or {})}
    if start is not None or end is not None:
        img_filters = [img_filters, DatetimeFilter(key="collected_on", start=start, end=end)]
    df = connector.get_documents_df('image', img_filters, limit=limit)
    elapsed_time = time.perf_counter() - query_started
    return df, elapsed_time

# Collection window for the query. `datetime` is the standard-library class,
# imported explicitly at the top of the notebook.
start = datetime(2021, 3, 29)  # The first field was collected on 3/30/2021
end = datetime(2021, 10, 1)

# Extra query fields merged into the Shasta project filter by get_shasta_data.
filters = {
    "artifacts.kind": "nrg",
    "has_human_annotation": True,
}

corn_df, elapsed_time = get_shasta_data(filters=filters, start=start, end=end)

# Composite key identifying a field: grower, farm and field name joined by '_'.
# The result is NaN for any row where one of the three parts is missing.
corn_df['grower_farm_field'] = (
    corn_df['grower'] + '_' + corn_df['farm'] + '_' + corn_df['operating_field_name']
)
print(f"Queried {len(corn_df)} images in {elapsed_time:.2f} s.")

Queried 32652 images in 45.72 s.


In [15]:
# Spot check: how many queried images belong to this one grower.
grower_mask = corn_df['grower'] == 'jim whitaker'
len(corn_df[grower_mask])

23

In [4]:
# Load the list of algae fields and build the same composite key used on corn_df.
algae_fields = pd.read_csv('algae_fields.csv')
algae_fields['grower_farm_field'] = (
    algae_fields['grower'] + '_' + algae_fields['farm'] + '_' + algae_fields['operating_field_name']
)

# Keep only the queried images whose field key appears in the algae list.
algae_keys = algae_fields['grower_farm_field'].unique()
in_algae_field = corn_df['grower_farm_field'].isin(algae_keys)
algae = corn_df[in_algae_field]
print(len(algae))

2586


In [8]:
# Flatten annotations to make them easily searchable

def flatten_annotations(dataframe):
    """
    Expand the list-valued 'annotations' column into one row per annotation.

    Each annotation dict becomes its own row: the other columns of the source
    row are repeated, and every key of the dict becomes a column named
    '<key>_annotations'. Rows whose 'annotations' value is missing or an empty
    list contribute no output rows.

    Args:
        dataframe: DataFrame with an 'annotations' column holding a list of
            dicts per row.

    Returns:
        DataFrame with one row per annotation, restricted to annotations whose
        'kind' is neither 'machine' nor 'ndvi_mask' and whose 'state' is 'ok'.
        Row labels are positions in the unfiltered flattened table. Annotations
        lacking a 'kind' or 'state' key are treated as having a missing value
        for it (so a missing 'state' is filtered out).
    """
    # Work on a positional index so repeated labels in the caller's index
    # cannot interfere with explode().
    images = dataframe.reset_index(drop=True)
    exploded = images.explode('annotations')
    # explode() yields a null entry for empty lists and keeps existing nulls;
    # those rows carry no annotation and are dropped.
    exploded = exploded[exploded['annotations'].notna()].reset_index(drop=True)

    # Build all annotation columns in one construction from the list of dicts
    # instead of converting every dict to a Series row by row.
    fields = pd.DataFrame(exploded['annotations'].tolist(), index=exploded.index)
    for required in ('kind', 'state'):
        # Guarantee the columns used by the filter exist, even for empty input.
        if required not in fields.columns:
            fields[required] = np.nan

    flattened = pd.concat(
        [exploded.drop(columns='annotations'), fields.add_suffix('_annotations')],
        axis=1,
    )

    # Filter out ndvi and machine labels, as well as labels in a state other than ok.
    keep = (
        (flattened['kind_annotations'] != 'machine')
        & (flattened['kind_annotations'] != 'ndvi_mask')
        & (flattened['state_annotations'] == 'ok')
    )
    return flattened[keep]

# One row per usable annotation for the algae images.
flat = flatten_annotations(algae)
annotation_count = len(flat)
image_count = len(flat['_id'].unique())
print(f'There are {annotation_count} total annotations from {image_count} unique image ids')

There are 2875 total annotations from 2381 unique image ids


In [9]:
# Keep only annotations that have no next version recorded.
has_no_next_version = flat['next_version_annotations'].isnull()
flat = flat[has_no_next_version]
# Display the null/non-null breakdown of what remains.
flat['next_version_annotations'].isnull().value_counts()

True    2378
Name: next_version_annotations, dtype: int64

In [10]:
# Order annotations from oldest to newest so the most recent one for each
# image id ends up last.
flat = flat.sort_values('created_at_annotations')

# Remove any duplicate image ids, keeping the last (most recent) annotation.
# annotations_final is now our master list of annotations used to create the annotation datasets
annotations_final = flat.drop_duplicates('_id', keep='last')
valid_count = len(annotations_final)
print(f'There are {valid_count} annotated images that are valid.')

There are 2378 annotated images that are valid.


In [11]:

# Verify all annotations are pixelwise style: a single 'pixelwise' entry is expected.
annotations_final['style_annotations'].value_counts()

pixelwise    2378
Name: style_annotations, dtype: int64

In [12]:
# String versions of some metadata fields so they can be searched and counted
# as text. assign() builds a new frame instead of writing into what may be a
# slice of `flat`.
annotations_final = annotations_final.assign(
    nv=annotations_final['next_version_annotations'].astype(str),
    lm=annotations_final['label_map_annotations'].astype(str),
    s3k=annotations_final['s3_key_annotations'].apply(str),
    s3b=annotations_final['s3_bucket_annotations'].apply(str),
)

# Number of annotations missing a label_map, s3_key or s3_bucket.
# The test runs on the original columns: after the string cast a missing value
# can read 'nan' or 'None', so comparing against 'nan' alone can miss rows.
required_columns = ['label_map_annotations', 's3_key_annotations', 's3_bucket_annotations']
missing_required = annotations_final[required_columns].isnull().any(axis=1)
print(int(missing_required.sum()))

# Number of annotations that still have a next version (i.e. are not the newest).
# The string column `nv` is never null, so this must be checked on the original column.
has_next_version = annotations_final['next_version_annotations'].notnull()
print(int(has_next_version.sum()))

# This cell should output zeros and only one label map. Otherwise, there is something wrong.
annotations_final['lm'].value_counts()

0
0


{'1': 'weed', '2': 'crop'}    2378
Name: lm, dtype: int64

In [13]:
# Inspect which label policies the annotations were created under.

annotations_final['lp'] = annotations_final['label_policy_annotations'].astype(str)

# Optional filter (disabled): removes a cotton field that was mislabeled as corn
#annotations_final = annotations_final[annotations_final['lp']!='6083224ccfbd2a9d788f2988']
# Expected result: only '607653583f4e7e93319cc306', which is the corn label policy
label_policy_counts = annotations_final['lp'].value_counts()
print(label_policy_counts)

607653583f4e7e93319cc306    2378
Name: lp, dtype: int64


In [23]:
# Number of annotation ids in the final selection.
len(annotations_final['_id_annotations'])

2378

In [29]:
# Number of distinct grower/farm/field keys represented in the final selection.
annotations_final['grower_farm_field'].nunique()

13

In [24]:
# Annotation ids, converted to plain strings, that will make up the dataset.
al_list = annotations_final['_id_annotations'].astype(str).tolist()
print(len(al_list))
al_list[:5]

2378


['60888f1f318c48b48b9c89c9',
 '6088919d5d2e672a72eb8014',
 '6088919d5d2e672a72eb8062',
 '608892329ac9ec1af55cd2b5',
 '608892339ac9ec1af55cd2df']

In [25]:
from brtdevkit.data import Dataset

# Name and description for the new dataset.
dataset_name = '061621_Corn_Algae_Set'
description = 'A dataset of corn fields with moderate to severe algae.'

# Create an annotation-kind dataset from the selected annotation ids.
dataset_spec = {
    'name': dataset_name,
    'description': description,
    'kind': Dataset.KIND_ANNOTATION,
    'annotation_ids': al_list,
}
Dataset.create(**dataset_spec)

{}

In [31]:
# After a few seconds, retrieve the dataset to verify it has been created properly.
# The row count should match the number of annotation ids submitted.
dset = Dataset.retrieve(name=dataset_name)
dset_df = dset.to_dataframe()
len(dset_df)

2378