# Data Preparation

## 1. Import libraries

In [43]:
import pandas as pd

In [56]:
# Read the observations table from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='observation.csv')

# Leave the frame as the cell's last expression so the notebook renders it.
data

Unnamed: 0,id,image_url,scientific_name,common_name,iconic_taxon_name
0,39,https://inaturalist-open-data.s3.amazonaws.com...,Taricha torosa,California Newt,Amphibia
1,40,https://inaturalist-open-data.s3.amazonaws.com...,Taricha torosa,California Newt,Amphibia
2,80,https://inaturalist-open-data.s3.amazonaws.com...,Callisaurus draconoides,Zebra-tailed Lizard,Reptilia
3,203,http://static.inaturalist.org/photos/132/mediu...,Arctia virginalis,Ranchman's Tiger Moth,Insecta
4,523,https://inaturalist-open-data.s3.amazonaws.com...,Malacosoma disstria,Forest Tent Caterpillar Moth,Insecta
...,...,...,...,...,...
303675,156384671,https://static.inaturalist.org/photos/27041880...,Duttaphrynus melanostictus,Asian Common Toad,Amphibia
303676,156387550,https://inaturalist-open-data.s3.amazonaws.com...,Bufo bufo,Gewone Pad,Amphibia
303677,156391245,https://static.inaturalist.org/photos/27043003...,Crotalus adamanteus,Eastern Diamondback Rattlesnake,Reptilia
303678,156392005,https://inaturalist-open-data.s3.amazonaws.com...,Hyla arborea,Boomkikker,Amphibia


## 2. Check empty values in image_url.

I am going to check whether there are any rows with the same scientific_name, where one of the rows has an image URL and the other does not. This will allow me to transfer the URL from the row containing the URL to the row with the empty URL.

In [57]:
# Flag every row whose 'image_url' is missing, then tally the flags.
missing_url_mask = data['image_url'].isnull()
num_empty_image_urls = missing_url_mask.sum()

# Report how many rows currently have no image URL.
print(f"Number of empty 'image_url' values: {num_empty_image_urls}")


Number of empty 'image_url' values: 1032


In [58]:
# Rows whose 'image_url' is missing (snapshot taken before any filling).
empty_image_url_rows = data.loc[data['image_url'].isna()]

# For every row, look up the first non-empty 'image_url' among the rows that
# share its 'scientific_name'. The group-wise "first" skips missing values, so
# a species with no URL at all yields NaN, as does a row whose
# 'scientific_name' is itself missing (such rows belong to no group).
# This replaces a per-row scan of the whole frame with a single grouped pass.
first_url_per_species = (
    data.groupby('scientific_name')['image_url'].transform('first')
)

# Fill only the missing URLs; rows that already have a URL are left untouched.
# fillna aligns on the index, so each row receives its own species' URL.
data['image_url'] = data['image_url'].fillna(first_url_per_species)

# Print the updated dataset
data

Unnamed: 0,id,image_url,scientific_name,common_name,iconic_taxon_name
0,39,https://inaturalist-open-data.s3.amazonaws.com...,Taricha torosa,California Newt,Amphibia
1,40,https://inaturalist-open-data.s3.amazonaws.com...,Taricha torosa,California Newt,Amphibia
2,80,https://inaturalist-open-data.s3.amazonaws.com...,Callisaurus draconoides,Zebra-tailed Lizard,Reptilia
3,203,http://static.inaturalist.org/photos/132/mediu...,Arctia virginalis,Ranchman's Tiger Moth,Insecta
4,523,https://inaturalist-open-data.s3.amazonaws.com...,Malacosoma disstria,Forest Tent Caterpillar Moth,Insecta
...,...,...,...,...,...
303675,156384671,https://static.inaturalist.org/photos/27041880...,Duttaphrynus melanostictus,Asian Common Toad,Amphibia
303676,156387550,https://inaturalist-open-data.s3.amazonaws.com...,Bufo bufo,Gewone Pad,Amphibia
303677,156391245,https://static.inaturalist.org/photos/27043003...,Crotalus adamanteus,Eastern Diamondback Rattlesnake,Reptilia
303678,156392005,https://inaturalist-open-data.s3.amazonaws.com...,Hyla arborea,Boomkikker,Amphibia


### Check if there are still empty values in the image_url column.

I will verify whether there are any remaining empty values in the image_url column. If there are, I will exclude those rows, as they will not be useful for image classification.

In [59]:
# Re-count the rows that still have no 'image_url'.
url_column = data['image_url']
num_empty_image_urls = url_column.isna().sum()

# Build the report line first, then emit it.
remaining_report = f"Number of empty 'image_url' values: {num_empty_image_urls}"
print(remaining_report)


Number of empty 'image_url' values: 14


As there are still 14 rows that lack an image URL, my next step is to remove those rows.

In [61]:
# Discard every row that has no 'image_url'; `data` is rebound to the
# filtered frame.
data = data.dropna(axis='index', subset=['image_url'])

# Confirm the clean-up: tally whatever missing URLs remain and report it.
remaining_missing = data['image_url'].isna()
num_empty_image_urls = remaining_missing.sum()
print(f"Number of empty 'image_url' values: {num_empty_image_urls}")


Number of empty 'image_url' values: 0


## Check duplicate values

In [39]:
# Boolean mask over the rows: True for every repeat occurrence of a
# 'scientific_name'; the first occurrence of each name stays False
# (keep='first' is the pandas default, spelled out here).
duplicates = data['scientific_name'].duplicated(keep='first')

# Each True in the mask is one repeated row.
duplicate_count = duplicates.sum()

print(f"There are {duplicate_count} duplicate values in the 'scientific_name' column.")


There are 282484 duplicate values in the 'scientific_name' column.


In [42]:
# Select the rows flagged True in the `duplicates` boolean mask, i.e. every
# repeat occurrence of a 'scientific_name' (first occurrences are not included).
# NOTE(review): `duplicates` must have been computed on the current `data`;
# re-run the cell that builds it if `data` has changed since.
duplicate_rows = data[duplicates]
# Display the selected rows as the cell output.
duplicate_rows


Unnamed: 0,id,scientific_name,common_name,iconic_taxon_name,has_common_name
1,40.0,Taricha torosa,California Newt,Amphibia,True
13,2312.0,Taricha torosa,California Newt,Amphibia,True
20,3415.0,Hypsiglena ochrorhynchus nuchalata,California Nightsnake,Reptilia,True
24,5603.0,Taricha torosa,California Newt,Amphibia,True
25,5998.0,Anguis fragilis,Hazelworm,Reptilia,True
...,...,...,...,...,...
303675,156384671.0,Duttaphrynus melanostictus,Asian Common Toad,Amphibia,True
303676,156387550.0,Bufo bufo,Gewone Pad,Amphibia,True
303677,156391245.0,Crotalus adamanteus,Eastern Diamondback Rattlesnake,Reptilia,True
303678,156392005.0,Hyla arborea,Boomkikker,Amphibia,True


In [40]:
import numpy as np

def check_common_name_variations(group):
    """Tell whether a group mixes missing and present common names.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to inspect; must contain a 'common_name' column.

    Returns
    -------
    bool
        True when at least one 'common_name' in the group is missing and at
        least one is present; False otherwise (including for an empty group).
    """
    missing = group['common_name'].isna()
    # A mixed group needs at least one missing entry and one non-missing entry.
    return missing.any() and (~missing).any()

# Evaluate each species (one group per 'scientific_name') with the helper,
# producing one boolean per scientific name.
groups = data.groupby('scientific_name')
mixed_flag_per_species = groups.apply(check_common_name_variations)

# Keep only the species for which the helper returned True.
duplicates_with_varied_common_names = mixed_flag_per_species[mixed_flag_per_species]

# Report how many species are affected, followed by the species themselves.
varied_count = len(duplicates_with_varied_common_names)
print(f"There are {varied_count} scientific names with both common name and missing common name.")
print(duplicates_with_varied_common_names)


There are 1 scientific names with both common name and missing common name.
scientific_name
Leptidea sinapis    True
dtype: bool
