CHECKLIST DISTRIBUTION: AGGREGATION

In [1]:
import numpy as np
import pandas as pd

# Load checklist

In [2]:
# Taxonomy table indexed by accepted_plant_name_id, restricted to the columns
# used in this notebook; rows without a species value are discarded.
checklist = (
    pd.read_csv(
        '/home/joon/data/checklist_taxonomy_preprocessed.csv',
        index_col='accepted_plant_name_id',
    )
    .loc[:, ['taxon_status', 'family', 'genus', 'species', 'taxon_name', 'taxon_authors', 'plant_name_id']]
    .dropna(axis=0, subset=['species'])
)
# Distinct genus values; the per-genus computation further down is mapped over them.
checklist_genus_values = checklist['genus'].unique()

In [None]:
# Distribution records indexed by plant_name_id, plus the set of ids that
# have at least one record (used for set intersections further down).
checklistdist = pd.read_csv(
    '/home/joon/data/checklist_distribution_preprocessed.csv',
    index_col='plant_name_id',
)
checklistdist_plant_name_ids = {*checklistdist.index}

In [4]:
# Distinct, upper-cased level-3 area codes found in the distribution table,
# with missing values removed (order of first appearance is kept).
area_codes = checklistdist.area_code_l3.str.upper().dropna().unique()

# Number of observations

In [5]:
def checklist_n_obs(checklist):
    """Return the number of rows in ``checklist``."""
    n_rows = len(checklist.index)
    return n_rows

# Number of continents

In [6]:
def checklist_n_continents(checklist):
    """Return the number of distinct non-null ``continent_code_l1`` values."""
    continent_codes = checklist.continent_code_l1
    return continent_codes.nunique()

# Number of regions

In [7]:
def checklist_n_regions(checklist):
    """Return the number of distinct non-null ``region_code_l2`` values."""
    region_codes = checklist.region_code_l2
    return region_codes.nunique()

# Number of areas. 

In [8]:
def checklist_n_areas(checklist):
    """Return the number of distinct non-null ``area_code_l3`` values.

    Codes are compared as stored; no case normalisation is applied here.
    """
    level3_codes = checklist.area_code_l3
    return level3_codes.nunique()

# Union of areas: minlat, maxlat, km2

In [9]:
# Reference table of level-3 areas; the latitude helpers in this notebook read
# its LEVEL3_COD, min_lat and max_lat columns.
tdwg = pd.read_csv('/home/joon/data/tdwg3.csv')

In [10]:
# Per-area surface table; the surface helper in this notebook reads its
# level3_cod and m2 columns.
# NOTE(review): this is a relative path, unlike the other inputs which are read
# from /home/joon/data — confirm the expected working directory.
tdwg_areas = pd.read_csv('tdwg_computed_areas.csv')

In [11]:
def checklist_union_areas_km2(checklist):
    """Sum the ``m2`` column of ``tdwg_areas`` over the areas present in ``checklist``.

    Each distinct ``area_code_l3`` value is counted once; codes are matched
    against ``tdwg_areas.level3_cod`` exactly as stored.

    NOTE(review): the function name says km2 while the summed column is named
    ``m2`` — confirm which unit the result is actually in.
    """
    present_codes = checklist.area_code_l3.unique()
    is_present = tdwg_areas.level3_cod.isin(present_codes)
    surfaces = tdwg_areas.loc[is_present, 'm2']
    return np.sum(surfaces)

In [12]:
def checklist_union_areas_min_lat(checklist):
    """Return the smallest ``min_lat`` in ``tdwg`` among the areas present in ``checklist``.

    Areas are selected by matching ``tdwg.LEVEL3_COD`` against the distinct
    ``area_code_l3`` values of ``checklist``.
    """
    present_codes = checklist.area_code_l3.unique()
    is_present = tdwg.LEVEL3_COD.isin(present_codes)
    southern_bounds = tdwg.loc[is_present, 'min_lat']
    return np.min(southern_bounds)

In [13]:
def checklist_union_areas_max_lat(checklist):
    """Return the largest ``max_lat`` in ``tdwg`` among the areas present in ``checklist``.

    Areas are selected by matching ``tdwg.LEVEL3_COD`` against the distinct
    ``area_code_l3`` values of ``checklist``.
    """
    present_codes = checklist.area_code_l3.unique()
    is_present = tdwg.LEVEL3_COD.isin(present_codes)
    northern_bounds = tdwg.loc[is_present, 'max_lat']
    return np.max(northern_bounds)

# histogram areas
TODO: drop the many odd-looking area codes?

In [14]:
def checklist_areas_hist(checklist):
    """Count the rows of ``checklist`` per upper-cased level-3 area code.

    Returns a Series indexed by the module-level ``area_codes``; codes that do
    not occur in ``checklist`` keep a count of 0.
    """
    # Start from an all-zero integer histogram over every known code.
    counts_by_code = pd.Series(np.zeros(len(area_codes), dtype=int), index=area_codes)
    observed = checklist.area_code_l3.str.upper().value_counts()
    # Overwrite only the entries for codes that were actually observed.
    counts_by_code[observed.index] = observed
    return counts_by_code

# Computation

In [15]:
# Scalar aggregations, applied in this order to each selected distribution table.
aggregation_functions = [
    checklist_n_obs,
    checklist_n_continents,
    checklist_n_regions,
    checklist_n_areas,
    checklist_union_areas_km2,
    checklist_union_areas_min_lat,
    checklist_union_areas_max_lat,
]

# Output column names: one per aggregation above (same order), followed by one
# 'n_in_<code>' histogram column per known area code.
column_names = [
    'checklist_n_obs',
    'checklist_n_continents',
    'checklist_n_regions',
    'checklist_n_areas',
    'checklist_union_areas_km2',
    'checklist_union_areas_min_lat',
    'checklist_union_areas_max_lat',
]
column_names += ['n_in_' + code for code in area_codes]

TODO: Computation for each accepted checklist name

In [16]:
def checklist_distribution_based_features_process_genus(genus):
    """Compute distribution features for every accepted name of ``genus`` and write them to CSV.

    For each accepted name (a ``checklist`` row whose index equals its own
    ``plant_name_id``), all ``plant_name_id`` values filed under that accepted
    id are collected, restricted to ids that have distribution records, and the
    matching ``checklistdist`` rows are aggregated with
    ``aggregation_functions`` plus the per-area histogram.

    The result is written, without header or index, to
    ``/home/joon/data/checklist-distribution-based-features-<genus>.csv``.
    Nothing is written when the genus has no accepted name.

    Parameters
    ----------
    genus : str
        Value of the ``genus`` column to process; it is also used verbatim in
        the output file name.

    Returns
    -------
    None
    """
    # Keep only the rows of this genus that describe the accepted name itself.
    checklist_from_genus = checklist[checklist.genus == genus]
    checklist_from_genus = checklist_from_genus[checklist_from_genus.index == checklist_from_genus.plant_name_id]
    accepted_plant_name_ids_from_genus = checklist_from_genus.index.unique()

    # Rows are collected in a list and turned into a frame once, instead of
    # growing a DataFrame with the deprecated, quadratic DataFrame.append.
    rows = []
    for accepted_plant_name_id in accepted_plant_name_ids_from_genus:
        # The label is wrapped in a list so .loc always returns a Series, even
        # when the accepted name has a single row. A bare label would return a
        # scalar there, and set() would then split a string id into characters
        # (or raise on a non-iterable id).
        selected_plant_name_ids = set(checklist.loc[[accepted_plant_name_id], 'plant_name_id'])
        selected_plant_name_ids = selected_plant_name_ids.intersection(checklistdist_plant_name_ids)
        selected_distribution = checklistdist.loc[list(selected_plant_name_ids)]

        rows.append(
            [accepted_plant_name_id]
            + [f(selected_distribution) for f in aggregation_functions]
            + list(checklist_areas_hist(selected_distribution))
        )

    if not rows:
        return

    df = pd.DataFrame(rows, columns=['accepted_plant_name_id'] + column_names)
    # Count columns are stored as nullable integers.
    df = df.astype({**{'checklist_n_' + foo: 'Int64' for foo in ['obs', 'continents', 'regions', 'areas']},
                    **{'n_in_' + country: 'Int64' for country in area_codes}})
    df.to_csv('/home/joon/data/checklist-distribution-based-features-' + genus + '.csv', header=False, index=False)

In [17]:
def checklist_distribution_based_features_compute(selected_distribution):
    """Return the feature values for one selection of distribution rows.

    The list holds the result of each function in ``aggregation_functions``
    (in order), followed by the values of ``checklist_areas_hist``.
    """
    features = [aggregate(selected_distribution) for aggregate in aggregation_functions]
    features.extend(checklist_areas_hist(selected_distribution))
    return features

In [451]:
# `os` is imported here because the notebook's import cell only brings in
# numpy and pandas; without it this cell raises NameError on a fresh kernel.
import os
from multiprocessing import Pool

# Remove the CSV outputs left over from a previous run.
# NOTE(review): this pattern has no dash before the wildcard, so it also matches
# the merged file, the column-names file and the '-for-iucn' file — confirm
# that deleting those here is intended.
os.system('find /home/joon/data -type f -name \'checklist-distribution-based-features*.csv\' | xargs rm')

# One task per genus; each task writes its own per-genus CSV.
with Pool(processes=11) as pool:
    pool.map(checklist_distribution_based_features_process_genus, checklist_genus_values)

# Single-row CSV holding the column names, used as the header of the merged file.
pd.DataFrame([['accepted_plant_name_id']+column_names]).to_csv('/home/joon/data/checklist-distribution-based-features_column-names.csv',header=None,index=None)

# Merge the results and delete the temporary files!
os.system('find /home/joon/data -type f -name \'checklist-distribution-based-features-*.csv\' | xargs cat /home/joon/data/checklist-distribution-based-features_column-names.csv > /home/joon/data/checklist-distribution-based-features.csv')
os.system('find /home/joon/data -type f -name \'checklist-distribution-based-features-*.csv\' | xargs rm')

0

Computation for each iucn taxon index

In [18]:
def read_set_in_csv(x):
    """Parse the textual form of a set of strings, as stored in a CSV cell.

    ``'set()'`` yields an empty set. Any other text has its single quotes
    removed and surrounding braces stripped, and is then split on ``', '``.
    """
    if x != 'set()':
        unquoted = x.replace("'", '')
        inner = unquoted.strip('{}')
        return set(inner.split(', '))
    return set()

In [19]:
# Series read from a header-less CSV: the first column becomes the index and
# the second column is parsed into a set by read_set_in_csv.
# The `squeeze=True` argument of read_csv was removed in pandas 2.0; squeezing
# the single data column after reading gives the same Series.
iucn_checklist_matching = pd.read_csv(
    '/home/joon/data/iucn-checklist-matching.csv',
    index_col=0,
    header=None,
    converters={1: read_set_in_csv},
).squeeze('columns')

In [20]:
# Build one feature row per entry of iucn_checklist_matching. Rows are
# collected in a list and converted once, instead of growing a DataFrame with
# the deprecated, quadratic DataFrame.append.
iucn_rows = []

for iucn_taxon_index, selected_accepted_plant_name_ids in iucn_checklist_matching.items():
    # The set of accepted ids is converted to a list because recent pandas
    # versions reject a set as a .loc indexer.
    selected_plant_name_ids = set(checklist.loc[list(selected_accepted_plant_name_ids), 'plant_name_id'])
    selected_plant_name_ids = selected_plant_name_ids.intersection(checklistdist_plant_name_ids)

    if selected_plant_name_ids:
        selected_distribution = checklistdist.loc[list(selected_plant_name_ids)]
        features = checklist_distribution_based_features_compute(selected_distribution)
    else:
        # No distribution record for this taxon: every feature is missing.
        features = [np.nan] * len(column_names)

    iucn_rows.append([iucn_taxon_index] + features)

df = pd.DataFrame(iucn_rows, columns=['iucn_taxon_index'] + column_names)

# Count columns use the nullable integer dtype so that missing values survive.
df = df.astype({**{'iucn_taxon_index': int},
                **{'checklist_n_' + foo: 'Int64' for foo in ['obs', 'continents', 'regions', 'areas']},
                **{'n_in_' + country: 'Int64' for country in area_codes}})

df.to_csv('/home/joon/data/checklist-distribution-based-features-for-iucn.csv', index=False)