In [1]:
import pandas as pd
import numpy as np
import os

# Nb observations

In [2]:
def herbier_n_obs(herbier):
    """Return the number of observations, i.e. the number of rows of `herbier`."""
    return len(herbier.index)

# Nb synonymes
TODO: vérifier que l'index (TAXONPK) est bien la bonne colonne pour compter les synonymes.

In [3]:
def herbier_n_taxon(herbier):
    """Return the number of distinct (non-missing) labels in the index of `herbier`."""
    taxon_index = herbier.index
    return taxon_index.nunique(dropna=True)

# Nb SECTEUR

In [4]:
def herbier_n_secteurs(herbier):
    """Return the number of distinct non-missing values in the SECTEUR column."""
    secteurs = herbier['SECTEUR']
    return secteurs.nunique()

# Nb COD_ISO

In [5]:
def herbier_n_pays(herbier):
    """Return the number of distinct non-missing values in the COD_ISO column."""
    country_codes = herbier['COD_ISO']
    return country_codes.nunique()

# Stats de years {max,min,mean,med,max-min,today-max,today-min}

In [6]:
def herbier_max_date(herbier):
    """Return the largest value of the DAT_REC_A column."""
    dates = herbier['DAT_REC_A']
    return np.max(dates)

In [7]:
def herbier_min_date(herbier):
    """Return the smallest value of the DAT_REC_A column."""
    dates = herbier['DAT_REC_A']
    return np.min(dates)

In [8]:
def herbier_mean_date(herbier):
    """Return the arithmetic mean of the DAT_REC_A column."""
    dates = herbier['DAT_REC_A']
    return np.mean(dates)

In [9]:
def herbier_delta_date(herbier):
    """Return the span between the latest and the earliest DAT_REC_A value."""
    latest = herbier_max_date(herbier)
    earliest = herbier_min_date(herbier)
    return latest - earliest

In [10]:
def herbier_age_last_obs(herbier):
    """Return how far the latest DAT_REC_A value lies before the hard-coded reference year 2020."""
    latest = herbier_max_date(herbier)
    return 2020 - latest

In [11]:
def herbier_age_first_obs(herbier):
    """Return how far the earliest DAT_REC_A value lies before the hard-coded reference year 2020."""
    earliest = herbier_min_date(herbier)
    return 2020 - earliest

In [12]:
# Bin edges of the date histogram: 1700, 1720, ..., 2020 (16 bins, each 20 wide).
# The same edges are used elsewhere in the notebook to name the histogram columns.
date_hist_bins = range(1700,2021,20)
def herbier_date_hist(herbier):
    """Return the number of DAT_REC_A values falling in each bin of `date_hist_bins`.

    Only the counts are returned (one entry per bin); the bin edges computed by
    `np.histogram` are discarded. Values outside the outer edges are not counted.
    """
    counts, _bin_edges = np.histogram(herbier['DAT_REC_A'], bins=date_hist_bins)
    return counts

# Number of observations in last: 5 years, 10 years, 20 years, 50 years, all time

In [13]:
def herbier_n_obs_last_m_years(herbier,years,reference_year=2020):
    """Count the observations made during the `years` years preceding `reference_year`.

    An observation is counted when its DAT_REC_A value lies in the half-open
    window [reference_year - years, reference_year). Missing dates (NaN) never
    satisfy the comparison and are therefore not counted.

    The previous implementation sliced a histogram built on `range(1970,2020)`.
    That gives 49 bins whose last bin, [2018, 2019], holds two years, so every
    window shorter than 50 years silently included one extra year (e.g. "last 5
    years" counted 2014-2019). Counting directly on the column removes that
    off-by-one; the 50-year window (1970-2019) is unchanged.

    Parameters
    ----------
    herbier : DataFrame with a numeric DAT_REC_A column
        (assumed to hold collection years - TODO confirm).
    years : length of the window, in the unit of DAT_REC_A.
    reference_year : exclusive upper bound of the window. Defaults to 2020,
        the reference year hard-coded in the other date features.

    Returns
    -------
    int : number of rows whose DAT_REC_A falls inside the window.
    """
    dates = herbier['DAT_REC_A']
    in_window = (dates >= reference_year - years) & (dates < reference_year)
    return int(in_window.sum())

In [14]:
def herbier_n_obs_last_5y(herbier):
    """Count the observations of the last 5 years (delegates to herbier_n_obs_last_m_years)."""
    return herbier_n_obs_last_m_years(herbier,years=5)

In [15]:
def herbier_n_obs_last_10y(herbier):
    """Count the observations of the last 10 years (delegates to herbier_n_obs_last_m_years)."""
    return herbier_n_obs_last_m_years(herbier,years=10)

In [16]:
def herbier_n_obs_last_20y(herbier):
    """Count the observations of the last 20 years (delegates to herbier_n_obs_last_m_years)."""
    return herbier_n_obs_last_m_years(herbier,years=20)

In [17]:
def herbier_n_obs_last_50y(herbier):
    """Count the observations of the last 50 years (delegates to herbier_n_obs_last_m_years)."""
    return herbier_n_obs_last_m_years(herbier,years=50)

# Computation

In [23]:
# Load the herbarium observations. Only the four columns used by the feature
# functions are read, and TAXONPK becomes the index so that observations can be
# selected by taxon key with `.loc`.
herbier = pd.read_csv(
    '/home/joon/data/herbier.csv',
    sep=',',
    encoding='iso-8859-1',
    usecols=['TAXONPK','SECTEUR','COD_ISO','DAT_REC_A'],
    index_col='TAXONPK',
)

In [25]:
# Turn DAT_REC_A == 0 into NaN so that it is ignored by the date statistics
# (presumably 0 encodes an unknown collection year - TODO confirm).
# The result is assigned back to the column explicitly: calling
# `replace(..., inplace=True)` on `herbier.DAT_REC_A` is a chained in-place
# update, which does not modify `herbier` when pandas Copy-on-Write is active.
herbier['DAT_REC_A'] = herbier['DAT_REC_A'].replace(to_replace=0.0,value=np.nan)

In [18]:
# Scalar features computed for every selection of observations, in output order.
aggregation_functions = [
    herbier_n_obs,
    herbier_n_taxon,
    herbier_n_secteurs,
    herbier_n_pays,
    herbier_max_date,
    herbier_min_date,
    herbier_mean_date,
    herbier_delta_date,
    herbier_age_last_obs,
    herbier_age_first_obs,
    herbier_n_obs_last_5y,
    herbier_n_obs_last_10y,
    herbier_n_obs_last_20y,
    herbier_n_obs_last_50y,
]
# Each feature column is named after the function that computes it.
column_names = [aggregate.__name__ for aggregate in aggregation_functions]

# Features for checklist accepted_plant_id

In [27]:
# Load the checklist indexed by accepted_plant_name_id, keep the three columns
# used below and drop the rows that have no species.
checklist = pd.read_csv(
    '/home/joon/data/checklist_taxonomy_preprocessed.csv',
    index_col='accepted_plant_name_id',
)[['genus','species', 'plant_name_id']].dropna(axis=0,subset=['species'])
# Distinct genus values; each one is processed as an independent unit of work.
checklist_genus_values = checklist['genus'].unique()

In [110]:
def read_set_in_csv(x):
    """Parse the text form of a Python set (as stored in a CSV cell) into a set of strings.

    'set()' gives an empty set. Otherwise the surrounding braces/quotes are
    removed, the text is split on ', ' and the quotes around each element are
    stripped. Elements are always returned as strings (e.g. '{1, 2}' gives
    {'1', '2'}).

    Stripping the quotes per element fixes multi-element sets of quoted strings:
    previously only the outermost quotes were removed, so "{'a', 'b'}" produced
    {"a'", "'b"}. Unquoted elements and single-element sets are parsed as before.

    Limitation: an element that itself contains ', ' is split in two.
    """
    if x == 'set()':
        return set()
    inner = x.strip('{}\'"')
    return {element.strip('\'"') for element in inner.split(', ')}

In [95]:
# Mapping read from a header-less two-column CSV: the first column is the index
# and the second column is parsed into a set by read_set_in_csv.
# `.squeeze('columns')` turns the single-column frame into a Series; it replaces
# the `squeeze=True` argument of read_csv, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
checklist_accepted_herbier_taxonpk_matching = pd.read_csv(
    '/home/joon/data/checklist-accepted-herbier-taxonpk-matching.csv',
    index_col=0,
    header=None,
    converters={1:read_set_in_csv},
).squeeze('columns')

In [30]:
def herbier_based_features(accepted_plant_name_ids,taxonpks):
    """Compute the feature row for the observations whose index label is in `taxonpks`.

    The observations are selected from the module-level `herbier` frame with
    `.loc`. The result is a flat list: one value per function of
    `aggregation_functions` (in that order), followed by the counts returned by
    `herbier_date_hist`.

    `accepted_plant_name_ids` is accepted for the callers' convenience but is
    not used by the computation.
    """
    selected_distribution = herbier.loc[taxonpks]
    scalar_features = [aggregate(selected_distribution) for aggregate in aggregation_functions]
    histogram_features = list(herbier_date_hist(selected_distribution))
    return scalar_features + histogram_features

In [179]:
def herbier_based_features_process_genus(genus):
    """Compute the herbier-based features of every accepted name of `genus` and write them to a CSV.

    The accepted names are the checklist rows of the genus whose `plant_name_id`
    equals their index label (accepted_plant_name_id). For each of them the
    matching TAXONPKs are looked up in
    `checklist_accepted_herbier_taxonpk_matching` and passed to
    `herbier_based_features`.

    The rows are written, without header and without index, to
    '/home/joon/data/herbier-based-features-<genus>.csv'. Nothing is written when
    the genus has no accepted name. Returns None.

    The rows are collected in a list and turned into a DataFrame once, instead of
    growing a DataFrame with `DataFrame.append` (quadratic, and removed in
    pandas 2.0).
    """
    checklist_from_genus = checklist[checklist.genus==genus]
    is_accepted_name = checklist_from_genus.index == checklist_from_genus.plant_name_id
    accepted_plant_name_ids_from_genus = checklist_from_genus[is_accepted_name].index.unique()

    rows = []
    for accepted_plant_name_id in accepted_plant_name_ids_from_genus:
        taxonpks = checklist_accepted_herbier_taxonpk_matching.loc[accepted_plant_name_id]
        taxonpks = [int(taxonpk) for taxonpk in taxonpks]
        # Each row goes through pd.Series, as before, so that the dtype inference
        # (and hence the CSV formatting) stays the same as with the append-based code.
        rows.append(pd.Series([accepted_plant_name_id]+herbier_based_features([accepted_plant_name_id],taxonpks)))

    if not rows:
        return

    hist_column_names = ['herbier_n_'+str(year)+'_'+str(year+20) for year in list(date_hist_bins)[:-1]]
    count_column_names = ['herbier_n_'+suffix for suffix in ['obs','taxon','secteurs','pays','obs_last_5y','obs_last_10y','obs_last_20y','obs_last_50y']]
    year_column_names = ['herbier_max_date','herbier_min_date','herbier_delta_date','herbier_age_last_obs','herbier_age_first_obs']

    features = pd.DataFrame(rows)
    features.columns = ['accepted_plant_name_id']+column_names+hist_column_names
    # Nullable integers: these columns are whole numbers but may be missing
    # (e.g. no dated observation). herbier_mean_date is left as is.
    features = features.astype({name:'Int64' for name in count_column_names+hist_column_names+year_column_names})
    features.to_csv('/home/joon/data/herbier-based-features-'+genus+'.csv',header=False,index=False)

In [180]:
# Imports are kept local to this cell, like the original multiprocessing import.
import glob
import shutil
from multiprocessing import Pool

herbier_features_header_path = '/home/joon/data/herbier-based-features_column-names.csv'

def list_genus_feature_files():
    """Return the per-genus result files currently present in the data directory.

    The files of the IUCN section share the 'herbier-based-features-' prefix
    ('herbier-based-features-for_iucn...'); they are excluded so that this cell
    neither deletes them nor merges them into the checklist output.
    """
    return sorted(
        path for path in glob.glob('/home/joon/data/herbier-based-features-*.csv')
        if not os.path.basename(path).startswith('herbier-based-features-for_iucn')
    )

# Remove the per-genus files left by a previous (possibly interrupted) run.
for stale_path in list_genus_feature_files():
    os.remove(stale_path)

# One task per genus; each worker writes its own CSV file.
with Pool(processes=11) as pool:
    pool.map(herbier_based_features_process_genus, checklist_genus_values)

# One-line CSV holding the column names, used as header of the merged file.
pd.DataFrame([['accepted_plant_name_id']+column_names+['herbier_n_'+str(year)+'_'+str(year+20) for year in list(date_hist_bins)[:-1]]]).to_csv(herbier_features_header_path,header=False,index=False)

# Merge the results: header first, then every per-genus file. Done in Python
# rather than with `find | xargs cat`, because xargs may split a long file list
# into several `cat` calls, each of which would repeat the header file.
genus_feature_files = list_genus_feature_files()
with open('/home/joon/data/herbier-based-features.csv','wb') as merged_file:
    for part_path in [herbier_features_header_path]+genus_feature_files:
        with open(part_path,'rb') as part_file:
            shutil.copyfileobj(part_file,merged_file)

# Delete the temporary per-genus files now that they are merged.
for temporary_path in genus_feature_files:
    os.remove(temporary_path)

0

# Features for iucn_taxon_id

In [111]:
# Mapping read from a header-less two-column CSV: the first column is the index
# and the second column is parsed into a set by read_set_in_csv.
# `.squeeze('columns')` turns the single-column frame into a Series; it replaces
# the `squeeze=True` argument of read_csv, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
iucn_herbier_taxonpk_matching = pd.read_csv(
    '/home/joon/data/iucn-herbier-taxonpk-matching.csv',
    index_col=0,
    header=None,
    converters={1:read_set_in_csv},
    dtype={0:'int'},
).squeeze('columns')

In [99]:
# Mapping read from a header-less two-column CSV: the first column is the index
# and the second column is parsed into a set by read_set_in_csv.
# `.squeeze('columns')` turns the single-column frame into a Series; it replaces
# the `squeeze=True` argument of read_csv, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
iucn_checklist_matching = pd.read_csv(
    '/home/joon/data/iucn-checklist-matching.csv',
    index_col=0,
    header=None,
    converters={1:read_set_in_csv},
    dtype={0:'int'},
).squeeze('columns')

In [33]:
# Distinct index labels of the IUCN/checklist matching; these are the ids that
# get split into chunks and processed in parallel further down.
# NOTE(review): the same ids are later looked up in iucn_herbier_taxonpk_matching,
# which presumably shares this index - confirm.
iucn_taxon_ids = iucn_checklist_matching.index.unique()

In [133]:
def herbier_based_features_for_iucn_process_split(split_nb):
    """Compute the herbier-based features of the IUCN taxa of one split and write them to a CSV.

    `split_nb` indexes the module-level `splits` sequence, which must be defined
    before this function is called. For every IUCN taxon id of that split, the
    TAXONPKs are looked up in `iucn_herbier_taxonpk_matching`, the accepted
    plant name ids in `iucn_checklist_matching`, and both are passed to
    `herbier_based_features`.

    The rows are written, without header and without index, to
    '/home/joon/data/herbier-based-features-for_iucn-split-<split_nb>.csv'.
    Nothing is written for an empty split. Returns None.

    The rows are collected in a list and turned into a DataFrame once, instead of
    growing a DataFrame with `DataFrame.append` (quadratic, and removed in
    pandas 2.0).
    """
    rows = []
    for iucn_taxon_id in splits[split_nb]:
        taxonpks = iucn_herbier_taxonpk_matching.loc[iucn_taxon_id]
        taxonpks = [int(taxonpk) for taxonpk in taxonpks]
        accepted_plant_name_ids = iucn_checklist_matching.loc[iucn_taxon_id]
        features_row = herbier_based_features(accepted_plant_name_ids,taxonpks)
        # Each row goes through pd.Series, as before, so that the dtype inference
        # (and hence the CSV formatting) stays the same as with the append-based code.
        rows.append(pd.Series([iucn_taxon_id]+features_row))

    if not rows:
        return

    hist_column_names = ['herbier_n_'+str(year)+'_'+str(year+20) for year in list(date_hist_bins)[:-1]]
    count_column_names = ['herbier_n_'+suffix for suffix in ['obs','taxon','secteurs','pays','obs_last_5y','obs_last_10y','obs_last_20y','obs_last_50y']]
    year_column_names = ['herbier_max_date','herbier_min_date','herbier_delta_date','herbier_age_last_obs','herbier_age_first_obs']

    features = pd.DataFrame(rows)
    features.columns = ['iucn_taxon_id']+column_names+hist_column_names
    # Nullable integers: these columns are whole numbers but may be missing
    # (e.g. no dated observation). herbier_mean_date is left as is.
    features = features.astype({name:'Int64' for name in count_column_names+hist_column_names+year_column_names})
    features.to_csv('/home/joon/data/herbier-based-features-for_iucn-split-'+str(split_nb)+'.csv',header=False,index=False)

In [136]:
# Imports are kept local to this cell, like the original multiprocessing import.
import glob
import shutil
from multiprocessing import Pool

iucn_features_header_path = '/home/joon/data/herbier-based-features-for_iucn-column-names.csv'

# Remove every IUCN output of a previous run: merged file, header file and
# per-split files.
for stale_path in glob.glob('/home/joon/data/herbier-based-features-for_iucn*.csv'):
    os.remove(stale_path)

# The taxon ids are cut into chunks; each chunk is one unit of parallel work.
# `splits` is read as a global by herbier_based_features_for_iucn_process_split,
# so it has to exist before the worker pool is created.
nb_splits = 1000
splits = np.array_split(iucn_taxon_ids,nb_splits)

with Pool(processes=11) as pool:
    pool.map(herbier_based_features_for_iucn_process_split,range(len(splits)))

# One-line CSV holding the column names, used as header of the merged file.
pd.DataFrame([['iucn_taxon_id']+column_names+['herbier_n_'+str(year)+'_'+str(year+20) for year in list(date_hist_bins)[:-1]]]).to_csv(iucn_features_header_path,header=False,index=False)

# Per-split files in split order; empty splits write no file and are skipped.
split_paths = ['/home/joon/data/herbier-based-features-for_iucn-split-'+str(split_nb)+'.csv' for split_nb in range(len(splits))]
split_paths = [split_path for split_path in split_paths if os.path.exists(split_path)]

# Merge the results: header first, then every per-split file. Done in Python
# rather than with `find | xargs cat`, because xargs may split a long file list
# into several `cat` calls, each of which would repeat the header file.
with open('/home/joon/data/herbier-based-features-for_iucn.csv','wb') as merged_file:
    for part_path in [iucn_features_header_path]+split_paths:
        with open(part_path,'rb') as part_file:
            shutil.copyfileobj(part_file,merged_file)

# Delete the temporary files (per-split files and header file); the merged
# 'herbier-based-features-for_iucn.csv' does not match this pattern and is kept.
for temporary_path in glob.glob('/home/joon/data/herbier-based-features-for_iucn-*.csv'):
    os.remove(temporary_path)

0