In [7]:
from bs4 import BeautifulSoup
from collections import Counter
import hashlib
import json
import pandas as pd
import requests
from tqdm import tqdm
import wikipedia

import os
# Show the kernel's working directory: the relative '../dataset/...' paths used
# in the cells below are resolved against it.
os.getcwd()

'/Users/seohyeong/Projects/ShroomAI/ShroomAI/preprocess'

### 1. Preprocessing Occurrence.txt

In [76]:
# Input/output locations, all relative to the notebook's working directory.
dataset_root = '../dataset'
raw_inat_root = dataset_root + '/raw_inat'

# Raw tables read by the preprocessing cells.
multimedia_txt_path = raw_inat_root + '/multimedia.txt'
occurrence_txt_path = raw_inat_root + '/occurrence.txt'

# Destinations for the processed train/test tables.
train_df_save_path = dataset_root + '/train_df.txt'
test_df_save_path = dataset_root + '/test_df.txt'

In [77]:
# Columns kept from the occurrence table, grouped by what they describe.
occurrence_columns = [
    # record identifiers
    'gbifID', 'catalogNumber',
    # observation date
    'year', 'month',
    # observation location
    'continent', 'countryCode', 'stateProvince', 'decimalLatitude', 'decimalLongitude',
    # taxonomy
    'taxonID', 'phylum', 'class', 'order', 'family', 'genus', 'taxonKey', 'speciesKey', 'species',
]

# Every column is read as a string (dtype=str), so IDs and years are compared as text below.
occ_df = pd.read_csv(occurrence_txt_path, sep='\t', dtype=str)[occurrence_columns]

# Keep only the two phyla of interest, then drop any row with a missing value
# in one of the kept columns.
in_target_phylum = occ_df['phylum'].isin(['Basidiomycota', 'Ascomycota'])
occ_df = occ_df[in_target_phylum].dropna()

#### 1) Split Train/Test

In [78]:
# Hold out every observation from 2024 as the test set; all other years are training data.
# 'year' is a string column, hence the comparison against '2024'.
is_2024 = occ_df['year'] == '2024'
occ_test_df = occ_df[is_2024]
occ_train_df = occ_df[~is_2024]

#### 2) Selecting Species

In [79]:
# Rank species by their number of training rows (most frequent first) and
# keep the names of the top 1000.
species_stats = Counter(occ_train_df['species']).most_common()
species_keep = [name for name, _count in species_stats[:1000]]

# Apply the same species list to both splits so the test set only contains
# species that were selected from the training data.
keep_train_rows = occ_train_df['species'].isin(species_keep)
keep_test_rows = occ_test_df['species'].isin(species_keep)
occ_train_df = occ_train_df[keep_train_rows]
occ_test_df = occ_test_df[keep_test_rows]

### 2. Preprocessing Multimedia.txt

In [80]:
# Load the multimedia table as strings and reduce it to (gbifID, identifier) pairs:
# rows missing either value are dropped, and each identifier is kept only once
# (its first occurrence).
mm_df = (
    pd.read_csv(multimedia_txt_path, sep='\t', dtype=str)
    .loc[:, ['gbifID', 'identifier']]
    .dropna()
    .drop_duplicates(subset=['identifier'])
)

In [83]:
# Attach media identifiers to the occurrences of each split.
# The inner join drops occurrences without any multimedia row and yields one
# output row per (occurrence, identifier) pair.
occ_train_df = occ_train_df.merge(mm_df, on='gbifID', how='inner')
occ_test_df = occ_test_df.merge(mm_df, on='gbifID', how='inner')

In [87]:
def hash_value(value):
    """Return the MD5 hex digest of ``str(value)``.

    The value is converted with ``str()`` and encoded with the default
    codec of ``str.encode()`` before hashing, so any object is accepted.
    The result is a 32-character lowercase hexadecimal string.
    """
    md5 = hashlib.md5()
    md5.update(str(value).encode())
    return md5.hexdigest()
# Give every row a 'uniqueID' derived from its media identifier (MD5 hex digest).
for split_df in (occ_train_df, occ_test_df):
    split_df['uniqueID'] = split_df['identifier'].apply(hash_value)

In [90]:
# Write both splits as tab-separated files without the DataFrame index.
for split_df, save_path in (
    (occ_train_df, train_df_save_path),
    (occ_test_df, test_df_save_path),
):
    split_df.to_csv(save_path, sep='\t', index=False)

### 3. Scraping Metadata

#### 1) Create Meta File with Additional Information

In [None]:
# ID columns carried into the metadata:
# - taxonID: iNaturalist species ID (https://www.inaturalist.org/taxa/{taxonID})
# - speciesKey: GBIF species ID (https://www.gbif.org/species/{speciesKey})

test_df_path = '../dataset/test_df.txt'
meta_json_path = '../dataset/meta.json'

df = pd.read_csv(test_df_path, sep='\t')

# Maps species name -> metadata taken from the first row in which the species appears.
species_info = {}

for index, row in df.iterrows():
    species_name = row['species']
    if species_name in species_info:
        # Later rows of an already-recorded species are ignored.
        continue

    species_info[species_name] = {
        # identifier
        'gbifOccID': row['gbifID'],
        'gbifSpeciesID': row['speciesKey'],
        'inatSpeciesID': row['taxonID'],
        # verbatim
        'phylum': row['phylum'],
        'class': row['class'],
        'order': row['order'],
        'family': row['family'],
        'genus': row['genus'],
    }

with open(meta_json_path, 'w') as outfile:
    json.dump(species_info, outfile, indent=4)

# Should match the number of classes.
print(len(species_info))

#### 2) Scrape Nickname & Description

In [None]:
# Enrich every species entry of meta.json with:
#   - 'commonName': text of the iNaturalist taxon page <title> up to the first ' ('
#   - 'desc': the Wikipedia summary returned for the species name
# Both fields fall back to '' when the lookup fails, so every entry ends up
# with the same keys. The enriched dict is written back to meta_json_path.
with open(meta_json_path, 'r') as file:
    species_info = json.load(file)

not_found = 0

for species, info in tqdm(species_info.items(), total=len(species_info)):
    inat_species_url = 'https://www.inaturalist.org/taxa/{}'.format(info['inatSpeciesID'])

    # Reset for every species: if the page cannot be fetched we must not reuse
    # the title parsed for the previous species (or hit an undefined name on
    # the first iteration).
    title_tag = None
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(inat_species_url, timeout=30)
    except requests.RequestException as error:
        print('request failed for {}: {}'.format(inat_species_url, error))
    else:
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            title_tag = soup.find('title')
        else:
            print('unexpected status {} for {}'.format(response.status_code, inat_species_url))

    # get common name
    if title_tag:
        title_text = title_tag.get_text()
        common_name = title_text.split(' (')[0]
        species_info[species]['commonName'] = common_name
    else:
        species_info[species]['commonName'] = ''
        print('common name not found.')

    # get wiki desc
    try:
        desc = wikipedia.summary(species)
        species_info[species]['desc'] = desc
    except Exception:
        # Best effort: any lookup failure is recorded as an empty description.
        # `Exception` (rather than a bare except) lets Ctrl-C / kernel
        # interrupts stop the loop instead of being counted as "not found".
        species_info[species]['desc'] = ''
        print("Wikipedia desc not found.")
        not_found += 1

print("wiki desc not_found: ", not_found)
with open(meta_json_path, 'w') as outfile:
    json.dump(species_info, outfile, indent=4)

#### 3) Change meta.json Format

In [1]:
# Read the species metadata back from meta.json.
with open(meta_json_path, 'r') as meta_file:
    species_info = json.loads(meta_file.read())

In [4]:
# Turn the {species: info} mapping into a list of flat records, one per species,
# with the species name stored under the 'species' key ahead of the info fields.
flatten_species_info = [
    {'species': species, **info}
    for species, info in species_info.items()
]

In [6]:
# Overwrite meta.json with the flattened list of records (4-space indented JSON).
with open(meta_json_path, 'w') as outfile:
    outfile.write(json.dumps(flatten_species_info, indent=4))

### 4. Extract an Image for Each Species

In [13]:
import shutil

# Copy one example image per species folder into a flat 'sample_images' directory,
# naming each copy '<species>.jpg'.
#
# Paths are relative to the notebook's working directory, like the other
# '../dataset/...' paths in this notebook, instead of a user-specific absolute path.
dataset_path = '../dataset/inat_300/train'
sample_images_path = '../dataset/sample_images'

# Create the destination if needed; a no-op when it already exists, so the cell
# can be re-run safely.
os.makedirs(sample_images_path, exist_ok=True)

for species in sorted(os.listdir(dataset_path)):
    species_path = os.path.join(dataset_path, species)
    if not os.path.isdir(species_path):
        # Skip stray files next to the species folders (e.g. '.DS_Store').
        continue

    # Ignore hidden files and sort so the chosen example is the same on every run.
    image_names = sorted(
        name for name in os.listdir(species_path) if not name.startswith('.')
    )
    if not image_names:
        print('no image found for {}'.format(species))
        continue

    image_name = image_names[0]
    example_img_path = os.path.join(species_path, image_name)
    # The copy is always named '<species>.jpg', whatever the source file's extension.
    new_example_img_path = os.path.join(sample_images_path, species + ".jpg")
    shutil.copy(example_img_path, new_example_img_path)
    