## Track Which Weeds We are Identifying in the Field

Every image has a list of identified weeds in the metadata.

This list can change within a single field when image collection moves to a part of the field where different weeds are present.

In [1]:
# Import libraries and query Aletheia

import time
import numpy as np
import pandas as pd
from datetime import datetime as dt
from brtdevkit.core.db import DBConnector, DatetimeFilter
from matplotlib import pyplot as plt, rcParams

def get_shasta_data(filters=None, start=None, end=None, limit=None):
    """
    Query relevant Shasta data for calculations. Written by Andrei P.

    Parameters
    ----------
    filters : dict, optional
        Extra query constraints merged on top of ``{'project_name': 'shasta'}``.
        Keys given here win over the base filter. ``None`` (the default) or an
        empty dict adds nothing.
    start, end : optional
        Bounds handed to a ``DatetimeFilter`` on the ``collected_on`` key. The
        datetime filter is only added when at least one of them is given.
    limit : optional
        Forwarded unchanged to ``DBConnector.get_documents_df``.

    Returns
    -------
    tuple
        ``(df, elapsed_time)`` where ``df`` is whatever ``get_documents_df``
        returned for the ``'image'`` collection and ``elapsed_time`` is the
        wall-clock duration of the call in seconds.
    """
    start_time = time.time()
    connector = DBConnector()
    # `filters or {}` keeps the signature free of a shared mutable default and
    # lets callers pass None explicitly.
    img_filters = {'project_name': 'shasta', **(filters or {})}
    if start is not None or end is not None:
        img_filters = [img_filters, DatetimeFilter(key="collected_on", start=start, end=end)]
    df = connector.get_documents_df('image', img_filters, limit=limit)
    elapsed_time = time.time() - start_time
    return df, elapsed_time

# Query constraints: artifact kind, crops and robots to include
target_crops = ['CORN', 'COTTON', 'SOYBEANS']
robot_names = ["DCM-MANATEE", "DCM-OTTER", "DCM-DOLPHIN", "DCM-WALRUS", "DCM-PORPOISE", "DCM-SEAL"]
filters = {
    "artifacts.kind": "nrg",
    "crop_name": {"$in": target_crops},
    "robot_name": {"$in": robot_names},
}

# Earliest collection date to include in the query
start = dt(2020, 3, 7)

full_df, elapsed_time = get_shasta_data(filters=filters, start=start)

# Keep a date-only version of the collection timestamp alongside the original
collected_dates = full_df['collected_on'].dt.date
full_df['date_collected'] = pd.to_datetime(collected_dates)
print(f"Queried {len(full_df)} images in {elapsed_time:.2f} s.")

Queried 521654 images in 364.39 s.


In [2]:
# Creates one-hot encoded df of weeds tagged in each image
# sklearn's MLB tool is incredibly fast at one-hot encoding items stored as a list in a column of a pandas dataframe
from sklearn.preprocessing import MultiLabelBinarizer

# Create copy of queried dataframe
df = full_df.copy()

# Filter out non-target crops, if not already done above
df = df[df['crop_name'].isin(['CORN', 'COTTON', 'SOYBEANS'])]

# Remove zero values for latitude and longitude.
# Both comparisons must be parenthesized: `&` binds tighter than `!=`, so the
# unparenthesized form is read as ((latitude != 0) & longitude) != 0.
df = df[(df['latitude'] != 0) & (df['longitude'] != 0)]

# One-hot encoding each unique weed (one column per weed, one row per image)
s = df['weeds']
mlb = MultiLabelBinarizer()
w = pd.DataFrame(mlb.fit_transform(s), columns=mlb.classes_, index=df.index)
l = df[['grower', 'farm', 'operating_field_name', 'crop_name', 'latitude', 'longitude']]
# Index-on-index merge puts each image's weed flags next to its field/location columns
weed_df = pd.merge(w, l, left_index=True, right_index=True)

# Aggregate fields, calculating mean lat and long of each field.
# The weed columns become the fraction of the field's images tagged with that weed.
weed_map_df = weed_df.groupby(['grower', 'farm', 'operating_field_name', 'crop_name']).mean().dropna().reset_index()

# create index for later merging
weed_map_df_index = weed_map_df['crop_name']

# select columns containing just the weed values
weed_values = weed_map_df.drop(['grower', 'farm', 'operating_field_name', 'crop_name', 'latitude', 'longitude'], axis=1)
print(f'There are {len(weed_values.columns)} unique weeds tagged in the image metadata.')

# Round up to 1 any weeds found in only part of a field
weed_values = weed_values.apply(np.ceil)

print(f'The number of unique fields is {len(weed_map_df)}')
print(f'The total number of images is {len(weed_df)}')

There are 66 unique weeds tagged in the image metadata.
The number of unique fields is 742
The total number of images is 521054


In [3]:
# Per-field locations for soybean fields only
soy_mask = weed_map_df['crop_name'] == 'SOYBEANS'
grower_locs = weed_map_df.loc[soy_mask, ['grower', 'latitude', 'longitude', 'crop_name']]
# Show rows 45-54 (by position) after ordering south to north
grower_locs.sort_values('latitude').iloc[45:55]

Unnamed: 0,grower,latitude,longitude,crop_name
564,jason betts,35.330911,-78.808178,SOYBEANS
562,jason betts,35.331982,-78.809223,SOYBEANS
567,jason betts,35.341512,-78.841602,SOYBEANS
518,greenway coop,35.394866,-91.252154,SOYBEANS
528,greenway coop,35.400172,-91.232692,SOYBEANS
515,greenway coop,35.403123,-91.261225,SOYBEANS
514,greenway coop,35.403136,-91.261214,SOYBEANS
526,greenway coop,35.404755,-91.256306,SOYBEANS
521,greenway coop,35.405872,-91.258298,SOYBEANS
524,greenway coop,35.409177,-91.261772,SOYBEANS


In [21]:
# Identify and count fields containing high priority weeds

# List of High Priority Weeds
# Weeds can have several different common names, so make sure these are the weeds you're looking for as listed in the metadata.
target_weeds = ['PALMER_AMERANTH', 'WATERHEMP', 'HORSEWEED', 'GIANT_RAGWEED', 'KOCHIA', 'NUTSEDGE', 'MORNING_GLORY', 'GOOSEGRASS']

# Crops to report, in the row order of the resulting table
ci = ['CORN', 'COTTON', 'SOYBEANS']

# Count fields with each target weed.
# A field is counted when at least one of its images is tagged with the weed.
# NOTE(review): fields are distinguished by operating_field_name only, so two
# growers using the same field name would count once - confirm this is intended.
counts_by_weed = {}
for weed in target_weeds:
    wdf = weed_df[weed_df[weed] > 0]
    counts_by_weed[weed] = [
        len(wdf.loc[wdf['crop_name'] == c, 'operating_field_name'].unique())
        for c in ci
    ]

# The label column is built from `ci`, the same list that orders the counts, so
# every row is guaranteed to carry the crop name its counts were computed for.
# It stays unnamed (column 0) to keep the table layout unchanged.
field_counts = pd.concat([pd.Series(ci), pd.DataFrame(counts_by_weed)], axis=1)

field_counts

Unnamed: 0,0,PALMER_AMERANTH,WATERHEMP,HORSEWEED,GIANT_RAGWEED,KOCHIA,NUTSEDGE,MORNING_GLORY,GOOSEGRASS
0,CORN,36,19,6,15,5,2,10,1
1,COTTON,90,1,13,1,15,4,67,2
2,SOYBEANS,102,83,50,49,13,17,66,7


##  Use the next few cells to build the table for the weed distribution data spreadsheet: 
https://docs.google.com/spreadsheets/d/1w3o2keBb6rBH_Lmqqfo20EyAP5j6qsz3TIsQKfjaCc0/edit#gid=1758516013

This process could be improved: the reverse-geocoding step below relies on a web service that is slow and sometimes unreliable.

In [4]:
# Adding State Labels to weed_map_df Through Reverse Geocoding
from geopy.geocoders import Nominatim

# One row per field: its mean latitude/longitude, with the field's row number kept as an `index` column
coords = weed_map_df[['latitude', 'longitude']].reset_index()

# Turn both coordinates into text and join them as "lat, long"
coords['latitude'] = coords['latitude'].astype(str)
coords['longitude'] = coords['longitude'].astype(str)
coords['latlongstring'] = coords['latitude'] + ', ' + coords['longitude']

# Geopy client used to assign a state label to each field
geolocator = Nominatim(user_agent="geoapiExercises")

def state(coord):
    """
    This function takes the coordinates and returns the state.
    It uses a web service that is rather slow and sometimes unreliable.

    Parameters
    ----------
    coord : str
        Coordinate query passed straight to ``geolocator.reverse``; callers in
        this notebook pass a ``"latitude, longitude"`` string.

    Returns
    -------
    str
        The ``'state'`` entry of the geocoder's address, or ``''`` when the
        service finds no match or the match carries no address/state.

    Errors raised by the geocoding call itself (timeouts, service errors) are
    not caught here and propagate to the caller.
    """
    location = geolocator.reverse(coord, exactly_one=True)
    # The geocoder returns None when it finds nothing for the coordinate
    if location is None:
        return ''
    address = location.raw.get('address', {})
    return address.get('state', '')

In [6]:
# Geopandas version, which I never got to work

import geopandas

# State boundaries read from the local shapefile
us_states = geopandas.read_file('maps/states.shp')
print(us_states.head())
coord_list = coords['latlongstring'].tolist()

# Lower-case alias of the state name column, plus a copy without Hawaii and Alaska
us_states['state_name'] = us_states['STATE_NAME']
non_mainland_filter = us_states['STATE_ABBR'].isin(['HI', 'AK'])
mainland_states = us_states.loc[~non_mainland_filter].copy()

def get_valid_geo(df):
    """
    Return the rows of ``df`` that have usable coordinates.

    A row is kept only when both ``latitude`` and ``longitude`` are present
    (not null) and neither of them equals 0.
    """
    has_coords = df['latitude'].notna() & df['longitude'].notna()
    is_nonzero = (df['latitude'] != 0) & (df['longitude'] != 0)
    return df[has_coords & is_nonzero]

# Fields with usable coordinates
valid_geo = get_valid_geo(weed_map_df)

# Build the point frame from valid_geo itself, so every point sits on the same
# row as the field whose coordinates produced it (the coordinate strings in
# coord_list come from all fields and need not line up with valid_geo).
gdf = geopandas.GeoDataFrame(valid_geo, geometry=geopandas.points_from_xy(valid_geo.longitude, valid_geo.latitude))
gdf.crs = {'init' :'epsg:4326'}
# Reproject onto whatever CRS the state polygons use so both frames match in
# the spatial join; fall back to EPSG:4269 if the shapefile declares none.
gdf = gdf.to_crs(us_states.crs or {'init': 'epsg:4269'})

#print(gdf.head())

# NOTE(review): the recorded run of this cell warned that the `rtree` package
# was missing and then failed inside sjoin - presumably rtree must be installed
# for the join to work; confirm in the target environment.
merged = geopandas.sjoin(gdf, us_states, how='left', op='intersects')

     STATE_NAME  DRAWSEQ STATE_FIPS          SUB_REGION STATE_ABBR  \
0        Hawaii        1         15             Pacific         HI   
1    Washington        2         53             Pacific         WA   
2       Montana        3         30            Mountain         MT   
3         Maine        4         23         New England         ME   
4  North Dakota        5         38  West North Central         ND   

                                            geometry  
0  MULTIPOLYGON (((-160.07380 22.00418, -160.0497...  
1  MULTIPOLYGON (((-122.40202 48.22522, -122.4628...  
2  POLYGON ((-111.47543 44.70216, -111.48080 44.6...  
3  MULTIPOLYGON (((-69.77728 44.07415, -69.85993 ...  
4  POLYGON ((-98.73044 45.93827, -99.00683 45.939...  


  return _prepare_from_string(" ".join(pjargs))
  "(%s != %s)" % (left_df.crs, right_df.crs)
  warn("Cannot generate spatial index: Missing package `rtree`.")


AttributeError: 'NoneType' object has no attribute 'intersection'

In [19]:
## This cell will take several minutes to run because of the reverse geolocation service
# Need to implement this step with a more reliable service, e.g. Geopandas

coord_list = coords.latlongstring.tolist()
print(f'There are {len(coord_list)} fields to label.\n')

state_labels_master = []

MAX_ATTEMPTS = 10    # consecutive failures tolerated for one field before giving up on it
REQUEST_PAUSE_S = 2  # pause between requests; keeps the service from becoming backed up and timing out

# The next field to label is always len(state_labels_master), so the labels stay
# in the same order as coord_list no matter how many requests fail along the way.
failed_attempts = 0
while len(state_labels_master) < len(coord_list):
    coord = coord_list[len(state_labels_master)]
    try:
        label = state(coord)
    except Exception as err:
        failed_attempts += 1
        print(f'{(len(state_labels_master)/len(coord_list))*100:.1f}% Finished')
        if failed_attempts < MAX_ATTEMPTS:
            # Wait a little longer after each consecutive failure, then retry the same field
            time.sleep(REQUEST_PAUSE_S * failed_attempts)
            continue
        # Give up on this field instead of retrying forever; an empty label
        # keeps state_labels_master aligned with coord_list.
        print(f'Could not label {coord} after {MAX_ATTEMPTS} attempts ({err!r}); leaving its state empty.')
        label = ''
    state_labels_master.append(label)
    failed_attempts = 0
    time.sleep(REQUEST_PAUSE_S)

print(f'Process completed. There are {len(state_labels_master)} fields localized.')

There are 742 fields to label.

20.75% Finished
25.07% Finished
25.34% Finished
28.44% Finished
30.19% Finished
31.67% Finished
33.02% Finished
34.23% Finished
35.44% Finished
37.20% Finished
39.22% Finished
39.76% Finished
41.24% Finished
41.24% Finished
43.26% Finished
49.60% Finished
65.63% Finished
65.63% Finished
67.25% Finished
67.25% Finished
67.65% Finished
67.65% Finished
67.65% Finished
67.65% Finished
68.06% Finished
68.06% Finished
69.14% Finished
69.14% Finished
69.41% Finished
69.41% Finished
71.43% Finished
71.43% Finished
71.43% Finished
Process completed. There are 742 fields localized.
Finished!


In [20]:
# Final df merging weeds data and state labels
# This creates the final table for import into gsheets, located at: 
# https://docs.google.com/spreadsheets/d/1w3o2keBb6rBH_Lmqqfo20EyAP5j6qsz3TIsQKfjaCc0/edit#gid=1758516013

# One state label per field, in the order the fields were geocoded
states = pd.Series(state_labels_master, name='state')
print(weed_map_df.columns)

# Side-by-side: crop name, rounded weed flags and state label for every field
weeds = pd.concat(
    [weed_map_df_index, weed_values, states],
    axis='columns',
)

# Number of fields containing each weed, per crop and state
state_weeds = weeds.groupby(by=['crop_name', 'state']).sum()

# Write the table out as csv for import into sheets
state_weeds.to_csv('state_weed_data.csv')
print(f'The final shape of the table is {state_weeds.shape}')

Index(['grower', 'farm', 'operating_field_name', 'crop_name',
       'ASIATIC_DAYFLOWER', 'BARNYARDGRASS', 'BINDWEED', 'BLUE_WEED',
       'BROADLEAF_SIGNALGRASS', 'BUFFALOBUR', 'CANADA_THISTLE', 'CARPETWEED',
       'CHEAT', 'COCKLEBUR', 'COFFEE_SENNA', 'COMMON_RAGWEED', 'COPPER_LEAF',
       'CRABGRASS', 'CURLY_DOCK', 'DANDELION', 'DEVILS_CLAW', 'ECLIPTA',
       'GERANIUM', 'GIANT_FOXTAIL', 'GIANT_RAGWEED', 'GOOSEGRASS',
       'GREEN_FOXTAIL', 'HENBIT', 'HORSEWEED', 'ITALIAN_RYEGRASS',
       'JOHNSON_GRASS', 'KOCHIA', 'LAMBSQUARTERS', 'MELON', 'MILKWEED',
       'MORNING_GLORY', 'MUSTARD', 'NONE', 'NUTSEDGE', 'OTHER',
       'OTHER_BROADLEAF', 'OTHER_GRASS', 'PALMER_AMERANTH',
       'PENNSYLVANIA_SMARTWEED', 'PRICKLY_SIDA', 'PUNCTUREVINE', 'PURSELANE',
       'QUACKGRASS', 'REDROOT_PIGWEED', 'REDVINE', 'RICE_VARIOUS',
       'RUSSIAN_THISTLE', 'SANDBUR', 'SHATTERCANE', 'SHEPARDS_PURSE',
       'SICKLEPOD', 'SIGNALGRASS', 'SILVERLEAF_NIGHTSHADE', 'SMELLMELLON',
       'SPURGE', 'S