# Imports

In [69]:
import pandas as pd
import numpy as np

def compute_average(y_true, y_pred, t):
    """Return the average number of correctly predicted observed species per row.

    A cell counts as a correct prediction when ``y_true`` equals 1 there and
    the matching ``y_pred`` value is at least the threshold ``t``. The number
    of such cells over the whole table is divided by the number of rows of
    ``y_pred``.

    Args:
        y_true: DataFrame of observation indicators (cells equal to 1 are
            treated as observed).
        y_pred: DataFrame of prediction scores, labelled like ``y_true``.
        t: Threshold in the closed interval [0, 1].

    Returns:
        The count of observed cells scored at or above ``t`` divided by the
        number of rows of ``y_pred``.

    Raises:
        AssertionError: If ``t`` lies outside [0, 1].
    """
    assert t <= 1
    assert t >= 0
    n_rows, _ = y_pred.shape
    # Indexing with the boolean frame turns every unobserved cell into NaN,
    # and NaN >= t evaluates to False, so only observed cells can be counted.
    # The comparison is vectorized instead of calling a Python lambda per cell
    # through the deprecated DataFrame.applymap.
    hits = y_pred[y_true == 1] >= t
    return hits.values.sum() / n_rows

def find_t_min(y_true, y_pred, K, rate, t):
    """Find the smallest visited threshold whose average stays at or below K.

    Starting from ``t``, the threshold is repeatedly multiplied by ``rate``
    for as long as ``compute_average`` at the new threshold is still <= K.
    The last threshold that satisfied the condition is returned together
    with its average.

    Args:
        y_true: Observation indicators, passed through to ``compute_average``.
        y_pred: Prediction scores, passed through to ``compute_average``.
        K: Upper bound on the average; must be > 0.
        rate: Multiplicative decay applied to the threshold, in [0, 1].
        t: Starting threshold (``compute_average`` requires it in [0, 1]).

    Returns:
        Tuple ``(t_min, average)``. Special cases:

        * If the starting ``t`` already gives an average above K, no lower
          threshold can satisfy the condition (lowering the threshold never
          lowers the average), so ``(t, average_at_t)`` is returned and the
          returned average exceeds K.
        * If even threshold 0 keeps the average <= K, ``(0.0, average_at_0)``
          is returned.

    Raises:
        AssertionError: If ``rate`` is outside [0, 1] or ``K`` is not > 0.
    """
    assert rate <= 1
    assert rate >= 0
    assert K > 0

    average = compute_average(y_true, y_pred, t)
    if average > K:
        # Previously this path returned t/rate, which can exceed 1 (tripping
        # the assertion in compute_average) or divide by zero for rate == 0.
        return t, average

    # When the lowest admissible threshold still satisfies the bound, the
    # decay loop could never be stopped by the average exceeding K.
    floor_average = compute_average(y_true, y_pred, 0)
    if floor_average <= K:
        return 0.0, floor_average

    t_min = t
    while True:
        candidate = rate * t_min
        if candidate == t_min:
            # rate == 1 or floating-point underflow: the threshold no longer
            # moves, so continuing would loop forever.
            break
        candidate_average = compute_average(y_true, y_pred, candidate)
        if candidate_average > K:
            break
        # Remember the last valid threshold directly instead of recovering it
        # afterwards with a division by rate.
        t_min, average = candidate, candidate_average
    return t_min, average

def compute_accuracy(y_true, y_pred, t_min):
    """Return the share of all cells that are observed and scored >= ``t_min``.

    Only cells where ``y_true`` equals 1 can contribute to the numerator, so
    this is the count of correctly predicted observed cells divided by the
    total number of cells (rows times columns) of ``y_pred``.

    Args:
        y_true: DataFrame of observation indicators (cells equal to 1 are
            treated as observed).
        y_pred: DataFrame of prediction scores, labelled like ``y_true``.
        t_min: Threshold a score must reach to count as a positive prediction.

    Returns:
        The count of observed cells scored at or above ``t_min`` divided by
        ``rows * columns`` of ``y_pred``.
    """
    n_rows, n_cols = y_pred.shape
    # Unobserved cells become NaN under the boolean-frame selection and
    # compare as False, so they never count. The vectorized comparison
    # replaces the per-cell lambda run through the deprecated applymap.
    hits = y_pred[y_true == 1] >= t_min
    return hits.values.sum() / (n_rows * n_cols)

def custom_metric(y_true, y_pred, K, rate, t):
    """Return ``(t_min, average, accuracy)`` for the given predictions.

    ``t_min`` and ``average`` come from ``find_t_min`` called with ``K``,
    ``rate`` and the starting threshold ``t``; ``accuracy`` is
    ``compute_accuracy`` evaluated at that ``t_min``.
    """
    threshold, mean_hits = find_t_min(y_true, y_pred, K, rate, t)
    return threshold, mean_hits, compute_accuracy(y_true, y_pred, threshold)



In [62]:
# Toy ground-truth table: one row per sample, one 0/1 column per species.
y_true = pd.DataFrame(
    {
        "tree": [0, 0, 0, 1, 1],
        "grass": [0, 1, 1, 1, 1],
        "flower": [1, 0, 1, 0, 1],
    }
)

In [61]:
def dataframe_baseline_creator(df):
    """Build a constant baseline frame from the column frequencies of ``df``.

    Every column of the result is filled with a single value: that column's
    sum in ``df`` divided by the sum of all cells in ``df``. The result has
    the same index and columns as ``df``; ``df`` itself is not modified.

    Args:
        df: DataFrame whose columns are summed (e.g. 0/1 indicators).

    Returns:
        A new DataFrame in which each row repeats the per-column shares.
    """
    # Both totals are loop-invariant, so compute them once up front rather
    # than re-summing the whole frame for every column.
    column_totals = df.sum()
    grand_total = column_totals.sum()

    new_df = df.copy()
    for column in new_df.columns:
        # Assigning a scalar broadcasts it down the whole column.
        new_df[column] = column_totals[column] / grand_total
    return new_df
    

# Play

In [70]:
# Baseline prediction: every row gets each species' share of all observations in y_true.
y_pred_dumb = dataframe_baseline_creator(y_true)

In [72]:
# Display the baseline predictions.
y_pred_dumb

Unnamed: 0,tree,grass,flower
0,0.222222,0.444444,0.333333
1,0.222222,0.444444,0.333333
2,0.222222,0.444444,0.333333
3,0.222222,0.444444,0.333333
4,0.222222,0.444444,0.333333


In [64]:
# Display the ground-truth table for comparison with the baseline.
y_true

Unnamed: 0,tree,grass,flower
0,0,0,1
1,0,1,0
2,0,1,1
3,1,1,0
4,1,1,1


In [84]:
# Evaluate the baseline: start at threshold 1 and decay it by a factor of 0.98 per step with bound K = 1.
custom_metric(y_true = y_true, y_pred = y_pred_dumb, K = 1, rate = 0.98, t = 1)

(0.33589851774974244, 0.8, 0.26666666666666666)

In [109]:
# Load the sample occurrence/features table (path is relative to the current working directory).
sample_csv = pd.read_csv('../raw_data/occurences_1k_features.csv')


In [112]:
# Per-species count of non-null values in every other column.
sample_csv.groupby('scientificName').count()

Unnamed: 0_level_0,gbifID,latitude,longitude,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,...,silt_15-30cm,silt_30-60cm,silt_5-15cm,silt_60-100cm,soc_0-5cm,soc_100-200cm,soc_15-30cm,soc_30-60cm,soc_5-15cm,soc_60-100cm
scientificName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Acer campestre L.,7,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,7,7,7,7,7
Acer negundo L.,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Acer platanoides L.,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
Acer pseudoplatanus L.,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
Achillea filipendulina Lam.,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Viscum album L.,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Viscum album subsp. album,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
Vitis vinifera L.,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
Wisteria sinensis (Sims) DC.,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [104]:
# Drop the species-name column. sample_csv is rebound, so grouping by
# 'scientificName' afterwards requires reloading the CSV first.
sample_csv = sample_csv.drop(columns = 'scientificName')

In [108]:
# Per-gbifID count of non-null values in every other column.
sample_csv.groupby('gbifID').count()

Unnamed: 0_level_0,latitude,longitude,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,bio_8,...,silt_15-30cm,silt_30-60cm,silt_5-15cm,silt_60-100cm,soc_0-5cm,soc_100-200cm,soc_15-30cm,soc_30-60cm,soc_5-15cm,soc_60-100cm
gbifID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
700408702,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
700518274,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
700552718,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
833553472,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
833558031,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3764514664,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3772571203,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3772882436,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3773504615,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
