In [1]:
import pandas as pd
import os

In [2]:
input_dir = "./input/"

# Load the four raw CSV inputs from input_dir: the merged current-year
# observations, the taxonomy table, and the subnational-1 names and boundaries.
df_all_current_year = pd.read_csv(input_dir + "current_year_obs_merged.csv")
df_taxonomy = pd.read_csv(input_dir + "ebird-taxonomy.csv")
df_subnational1_names = pd.read_csv(input_dir + "subnational1.csv")
df_subnational1_boundaries = pd.read_csv(input_dir + "subnational1_boundaries.csv")

# Print a heading followed by the column index of each loaded frame, so the
# column names used for the decomposition can be checked against the data.
for heading, loaded_frame in (
    ("All Columns for Current Year Observations: ", df_all_current_year),
    ("All Columns for Taxonomy: ", df_taxonomy),
    ("All Columns for Subnational 1: ", df_subnational1_names),
    ("All Columns for Subnational 1 Boundaries: ", df_subnational1_boundaries),
):
    print(heading)
    print(loaded_frame.columns)

All Columns for Current Year Observations: 
Index(['speciesCode', 'comName', 'sciName', 'locId', 'locName', 'obsDt',
       'howMany', 'lat', 'lng', 'obsValid', 'obsReviewed', 'locationPrivate',
       'subId', 'subnational2Code', 'subnational2Name', 'subnational1Code',
       'subnational1Name', 'countryCode', 'countryName', 'userDisplayName',
       'obsId', 'checklistId', 'presenceNoted', 'hasComments', 'firstName',
       'lastName', 'hasRichMedia', 'exoticCategory'],
      dtype='object')
All Columns for Taxonomy: 
Index(['SCIENTIFIC_NAME', 'COMMON_NAME', 'SPECIES_CODE', 'CATEGORY',
       'TAXON_ORDER', 'COM_NAME_CODES', 'SCI_NAME_CODES', 'BANDING_CODES',
       'ORDER', 'FAMILY_COM_NAME', 'FAMILY_SCI_NAME', 'REPORT_AS', 'EXTINCT',
       'EXTINCT_YEAR', 'FAMILY_CODE'],
      dtype='object')
All Columns for Subnational 1: 
Index(['REGION_CODE', 'REGION_NAME'], dtype='object')
All Columns for Subnational 1 Boundaries: 
Index(['region_code', 'region', 'minX', 'maxX', 'minY', 'maxY'

In [3]:
# Column selections, one list per output table. Each list names the columns
# to pull out of one of the loaded frames.

# Family table; family_description is still missing (web scraping).
family = ["FAMILY_CODE", "FAMILY_SCI_NAME", "FAMILY_COM_NAME"]

# Species table; species_description and species_img are still missing
# (web scraping).
species = [
    "SPECIES_CODE", "FAMILY_CODE", "SCIENTIFIC_NAME",
    "COMMON_NAME", "EXTINCT", "EXTINCT_YEAR",
]

# User table; subId is used as the user_id.
# NOTE(review): the observation columns list subId next to checklistId, so
# subId may identify a submission rather than a person -- confirm it is a
# suitable key for users.
ebird_user = ["subId", "firstName", "lastName", "userDisplayName"]

# Columns taken from the subnational-1 names file.
subnational1_names = ["REGION_CODE", "REGION_NAME"]

# Columns taken from the subnational-1 boundaries file; region_code is the
# join key, the remaining four are the bounding-box columns.
subnational1_boundaries = ["region_code", "minX", "maxX", "minY", "maxY"]

# Final subnational-1 table: the name columns followed by the bounding-box
# columns (everything in the boundaries list except its leading join key).
subnational1 = subnational1_names + subnational1_boundaries[1:]

subnational2 = ["subnational2Code", "subnational1Code", "subnational2Name"]

ebird_location = [
    "locId", "subnational2Code", "locName",
    "lat", "lng", "locationPrivate",
]

observation = [
    "obsId", "speciesCode", "subId", "locId",
    "obsDt", "howMany", "obsValid", "obsReviewed",
]

In [4]:
# Decompose the loaded frames into one table per entity. Each table is the
# projection of a source frame onto its column list, with fully identical
# rows collapsed by drop_duplicates().
df_family_wo_descriptions = df_taxonomy[family].drop_duplicates()

df_species_wo_descriptions_n_img = df_taxonomy[species].drop_duplicates()

df_ebird_user = df_all_current_year[ebird_user].drop_duplicates()

# Left join on the region code so every named region is kept even when the
# boundaries file has no matching row. Duplicates are dropped before the
# final column selection, which also removes the redundant region_code key.
df_subnational1 = (
    df_subnational1_names[subnational1_names]
    .merge(
        df_subnational1_boundaries[subnational1_boundaries],
        how="left",
        left_on="REGION_CODE",
        right_on="region_code",
    )
    .drop_duplicates()[subnational1]
)

df_subnational2 = df_all_current_year[subnational2].drop_duplicates()

df_ebird_location = df_all_current_year[ebird_location].drop_duplicates()

df_observation = df_all_current_year[observation].drop_duplicates()

output_dir = "./decomposed/"

# makedirs(..., exist_ok=True) replaces the exists()/mkdir() pair: it has no
# check-then-create race and also works if output_dir is ever made nested.
os.makedirs(output_dir, exist_ok=True)

# Output file name -> table to write. Insertion order is the write order.
decomposed_tables = {
    "ebird_family_without_descriptions.csv": df_family_wo_descriptions,
    "ebird_species_without_descriptions_n_img.csv":
        df_species_wo_descriptions_n_img,
    "ebird_user.csv": df_ebird_user,
    "ebird_subnational1.csv": df_subnational1,
    "ebird_subnational2.csv": df_subnational2,
    "ebird_location.csv": df_ebird_location,
    "ebird_observation.csv": df_observation,
}

# Write every table with a header row and without the pandas index column.
for csv_name, table in decomposed_tables.items():
    table.to_csv(
        os.path.join(output_dir, csv_name),
        index=False,
        header=True,
    )