In [1]:
import pandas as pd
import numpy as np
import re
import time

In [119]:
print("Loading Observation Dataframe...")
alpha = time.time()

df_iawa = pd.read_excel("InsideWood_Africa_1-06-2023.xlsx", engine="openpyxl", header=0, na_filter=False).astype("str")
df_cepam = pd.read_excel("CEPAM_feature_net.xlsx", engine="openpyxl", sheet_name="CEPAM", header=2, na_filter=False).astype("str")

print(f"Dataframes loading time : {time.time() - alpha} seconds")

Loading Observation Dataframe...
Dataframes loading time : 1.0790698528289795 seconds


In [4]:
def manual_read_csv(filepath, enc, sep):
    with open(filepath, "r", encoding=enc) as csv_file:
        i = 0
        size = 0
        col_name = []
        data = []
        for line in csv_file:
            line = line.replace("\n", "")
            if i == 0:
                i += 1
                col_name = line.split(sep)
                size = len(col_name)
                continue
            line_spread = line.split(sep)
            if size < len(line_spread):
                line_spread
                raise("This line is not suitable")
            data.append(line_spread)
            i += 1
    return pd.DataFrame(data, columns=col_name)

In [121]:
# POWO generation for graph
print("Loading POWO Dataframe...")
alpha = time.time()

df_powo_taxa = pd.read_csv("recent_db/wcvp_taxon.csv", header=0, sep="|", encoding="UTF-8").astype("str")
df_powo_name = manual_read_csv("recent_db/Only/wcvp_names.csv","UTF-8", "|").astype("str")
df_dist = pd.read_csv("recent_db/Only/wcvp_distribution.csv", header=0, sep="|", encoding="UTF-8").astype("str")

old_df_powo = pd.read_csv("special_issue/wcvp_names.csv", header=0, sep="|", encoding="UTF-8").astype("str")
old_df_dist = pd.read_csv("special_issue/wcvp_distribution.csv", header=0, sep="|", encoding="UTF-8").astype("str")

print(f"Dataframes loading time : {time.time() - alpha} seconds")

Loading POWO Dataframe...
Dataframes loading time : 48.67000365257263 seconds


In [27]:
reduced_data = []
ignore_data = []
reduced_label = ["plant_name_id", "parent_id", "accepted_plant_name_id", "family", "genus", "taxon_name", "taxon_status", "ipni_id"]
filter_list = ["Accepted", "Synonym", "Unplaced"]
filter_powo = df_powo_name[df_powo_name["taxon_status"].isin(filter_list)]
    

In [28]:
iawa_label = ["iawa_family", "iawa_taxon", "iawa_usual_name", "accepted_id", "powo_family", "powo_genus", "powo_taxon", "powo_authors",
         "zone"]

errors_label = ["taxon", "iawa_line", "status"]
errors = []
BUG = None

In [29]:
def extract_iawa_information(row):
    family = ""
    original_string = df_iawa["Taxa"][row].replace("?", "")
    #print(original_string)
    for x in original_string.split("|"):
        x = x.replace("Synonym:", "")
        family = re.search("[A-Z][A-Z]+\s[A-Z][A-Z]+|[A-Z][A-Z]+", x).group(0)
        genre = re.search(
            "[A-Z][a-z]+\s(spp\.|sp\.|SPP\.|SP\.|sect\.)|[A-Z][a-z]+\s[a-z]+\s(subsp\.|var\.)\s[a-z]+|[A-Z][a-z]+\s(aff\.|cf\.)\s[a-z\-]+|[A-Z][a-z]+\s[a-z\-]+|[A-Z][a-z]+\.*", x).group(0)
        if re.search("\([A-Z][A-Z,\s]+\)", x):
            usual_name = re.search("\([A-Z][A-Z,\s]+\)", x).group(0)
        else:
            usual_name = ""
        return family, genre, usual_name, original_string

In [30]:
def get_plant_by_id(plant_id, df, plant):
    p1 = df[df["plant_name_id"].values == plant_id]
    
    if p1.empty:
        raise Exception(f"{plant}\nNo match id found for {plant_id} value type {type(plant_id)} test: {np.nan == plant_id}...")
    return p1['plant_name_id'].values[0], p1['family'].values[0], p1['genus'].values[0], p1['taxon_name'].values[0], p1['taxon_authors'].values[0]

In [59]:
def get_region_by_id(plant_id, df):
    r1 = df[df["plant_name_id"] == plant_id]
    return r1["continent"].values, r1["region"].values, r1["area"].values

In [152]:
def get_id_by_taxon2(taxon, df, org_str):
    matches = [" sp.", " spp.", " SPP.", " SP.", " group", "subgrp", "sect.", " SPP", "SP", " spp",]
    taxon = taxon.replace(" cf. ", " ").replace(" aff. ", " ").strip()
    category = "taxon_name"
    if any([x in taxon for x in matches]):
        for x in matches:
            taxon = taxon.replace(x, "").strip()
    
    reduced_taxon_df = df[df["taxon_name"] == taxon].sort_values(by=['taxon_status'])
    if len(reduced_taxon_df) > 1:
        waiting = []
        synonyms = []
        for row in reduced_taxon_df.index:
            
            status = reduced_taxon_df["taxon_status"][row]
            pl_id = reduced_taxon_df["plant_name_id"][row]
            pl_name = reduced_taxon_df["taxon_name"][row]
            pl_accepted = reduced_taxon_df["accepted_plant_name_id"][row]
            
            if status == "Accepted":
                location = df_dist[df_dist["plant_name_id"] == pl_id]["continent"].values.tolist()
                if "AFRICA" in location:
                    return pl_id
            
            if status == "Synonym":
                parent_status = df[df["plant_name_id"] == pl_accepted]["taxon_status"].values
                parent_id = pl_accepted
                parent_name = df[df["plant_name_id"] == pl_accepted]["taxon_name"].values
                location = df_dist[df_dist["plant_locality_id"] == parent_id]["continent"].values.tolist()
                if parent_status == "Accepted" and "AFRICA" in location:
                    synonyms.append(pl_id)
            
            if status not in ["Accepted", "Synonym"]:
                parent_status = df[df["plant_name_id"] == pl_accepted]["taxon_status"].values
                parent_id = pl_accepted
                parent_name = df[df["plant_name_id"] == pl_accepted]["taxon_name"].values
                location = df_dist[df_dist["plant_locality_id"] == parent_id]["continent"].values.tolist()
                if parent_status == "Accepted" and "AFRICA" in location:
                    waiting.append(parent_id)
        
        if len(synonyms) >= 1:
            return synonyms[0]
        
        if len(waiting) > 1:
            print(waiting)
        if waiting == []:
            print(taxon)
            errors.append([taxon, org_str, "Pas en Afrique"])
            return None
        return waiting[0]
    
    accepted_id_search = df[df[category] == taxon]['accepted_plant_name_id'].values
    plant_id = df[df[category] == taxon]['plant_name_id'].values
    if accepted_id_search.size == 0 and plant_id.size == 0:
        errors.append([taxon, org_str, "Pas trouvé dans la base"])
        return None
    
    status = df[df[category] == taxon]['taxon_status'].values[0]
    if not accepted_id_search and status != "Accepted":
        print(f"No accepted id for {taxon}")
        return plant_id

    status = df[df["plant_name_id"].values == accepted_id_search]["plant_name_id"].tolist()[0]
    
    while len(status) > 0 and status != "Accepted":
        if status != "Accepted":
            print(f"{accepted_id_search} status : {status}")
        accepted_id_search = df[df["plant_name_id"].values == accepted_id_search]['accepted_plant_name_id'].tolist()[0]
        status = df[df["plant_name_id"].values == accepted_id_search]["taxon_status"].tolist()[0]

    return accepted_id_search

In [153]:
# Execution
errors = []
extract_family = []
extract_genre = []
extract_usual_name = []
new_iawa = []

ignore_features = [
    "52 - <= 350 Âµm",
    "53 - 350 - 800 Âµm",
    "54 - >= 800 Âµm",
    "59MH - Wood vesselless",
    "60 - Vascular / vasicentric tracheids present",
    "67MH - Parenchyma-like fibre bands alternating with ordinary fibres",
    "71MH - <= 900 Âµm",
    "72MH - 900-1600 Âµm",
    "73MH - >= 1600 Âµm",
    "95MH - Unlignified parenchyma"
                  ]

In [154]:
def get_id_by_taxon(taxon, df, org_str):
    matches = [" sp.", " spp.", " SPP.", " SP.", " group", "subgrp", "sect.", " SPP", "SP", " spp",]
    taxon = taxon.replace(" cf. ", " ").replace(" aff. ", " ").strip()
    category = "taxon_name"
    if any([x in taxon for x in matches]):
        for x in matches:
            taxon = taxon.replace(x, "").strip()
    
    reduced_df = df[df["taxon_name"] == taxon]
    if reduced_df.empty:
        print([taxon, org_str, "Not found in database"])
        errors.append([taxon, org_str, "Not found in database"])
    for row in reduced_df.index:
        if df[df["plant_name_id"] == df["accepted_plant_name_id"][row]].index.size == 1:
            status = df[df["plant_name_id"] == df["accepted_plant_name_id"][row]]["taxon_status"].values[0]
            accepted_id = df[df["plant_name_id"] == df["accepted_plant_name_id"][row]]["plant_name_id"].values[0]
            dist_found = df_dist[df_dist["plant_name_id"] == accepted_id]
            if not dist_found[dist_found["continent"] == "AFRICA"].empty:
                return accepted_id
    print([taxon, org_str, "Taxa not in AFRICA"])
    errors.append([taxon, org_str, "Taxa not in AFRICA"])

In [155]:
alpha = time.time()
print("Starting IAWA extraction")
iawa_label = ["InsideWood_line", "accepted_id", "powo_family", "powo_genus", "powo_taxon", "powo_authors",
         "zone"]

for column_label in df_iawa.columns[1:]:
    if column_label == "163MH - Vitreous silica":
            break
    if column_label not in ignore_features:
        iawa_label.append(column_label)

for row in df_iawa.index:
    family, genre, usual_name, original_string = extract_iawa_information(row)
    accepted_id = get_id_by_taxon(genre, df_powo_name, original_string)
    if accepted_id:
        plant_id, accepted_family, accepted_genus, accepted_taxon, accepted_authors = get_plant_by_id(accepted_id, df_powo_name, original_string)
        continent, region, area = get_region_by_id(accepted_id, df_dist)
    else:
        plant_id, accepted_family, accepted_genus, accepted_taxon, accepted_authors = "", family, genre.split(" ")[0], genre.split(" ")[1], ""
    
    data = []
    for column_label in list(df_iawa.columns[1:]):
        if column_label == "163MH - Vitreous silica":
            break
        if column_label not in ignore_features:
            v = df_iawa[column_label][row]
            if isinstance(v, int):
                data.append("3")
            elif "v" in v:
                data.append("2")
            elif "?" in v:
                data.append("1")
            else:
                data.append("0")
            
    
    zone = []
    for x,y,z in zip(continent, region, area):
        zone.append(f"({x},{y},{z})")
    temp = [df_iawa["Taxa"][row],
                    plant_id, accepted_family, accepted_genus, accepted_taxon, accepted_authors,
                    "".join(zone)]
    temp.extend(data)
    new_iawa.append(temp)

print(f"Extraction done in {time.time() - alpha} seconds...")
pd.DataFrame(errors, columns=errors_label).to_csv("errors_iawa.csv", index=False, encoding="utf-8")
pd.DataFrame(new_iawa, columns=iawa_label).to_csv("generated_iawa.csv", index=False, encoding="utf-8", sep=";")

#print(to_extract)

Starting IAWA extraction
['Lindackeria laurina', 'ACHARIACEAE Lindackeria laurina C. Presl', 'Taxa not in AFRICA']
['Anaxagorea inundata', 'ANNONACEAE Anaxagorea inundata P. E. Berry & R. B. Miller', 'Taxa not in AFRICA']
['Stephanotis abyssinica', 'APOCYNACEAE Stephanotis abyssinica', 'Not found in database']
['Stephanotis abyssinica', 'APOCYNACEAE Stephanotis abyssinica', 'Taxa not in AFRICA']
['Schefflera', 'ARALIACEAE Schefflera group D2', 'Taxa not in AFRICA']
['Dacryodes', 'BURSERACEAE Dacryodes SPP. (KEDONDONG)', 'Taxa not in AFRICA']
['Celtis philippensis', 'CANNABACEAE Celtis philippensis Blanco (HARD CELTIS)', 'Taxa not in AFRICA']
['Celtis philippensis', 'CANNABACEAE Celtis philippensis Blanco (HARD CELTIS)', 'Taxa not in AFRICA']
['Erythrostelechia acuminata', 'CELASTRACEAE Erythrostelechia acuminata Capuron', 'Not found in database']
['Erythrostelechia acuminata', 'CELASTRACEAE Erythrostelechia acuminata Capuron', 'Taxa not in AFRICA']
['Atuna excelsa', 'CHRYSOBALANACEAE A

In [156]:
pd.DataFrame(errors, columns=errors_label).to_csv("errors_iawa.csv", index=False, encoding="utf-8")
pd.DataFrame(new_iawa, columns=iawa_label).to_csv("generated_iawa.csv", index=False, encoding="utf-8", sep=";")

In [36]:
ignore_features = [
    "52 - <= 350 Âµm",
    "53 - 350 - 800 Âµm",
    "54 - >= 800 Âµm",
    "59MH - Wood vesselless",
    "60 - Vascular / vasicentric tracheids present",
    "67MH - Parenchyma-like fibre bands alternating with ordinary fibres",
    "71MH - <= 900 Âµm",
    "72MH - 900-1600 Âµm",
    "73MH - >= 1600 Âµm",
    "95MH - Unlignified parenchyma"
                  ]

df_cepam = pd.read_excel("generated_cepam_5-07-2023.xlsx", header=0,na_filter=False)

cepam_label = ["id_sample", "cepam_family", "cepam_genus", "cepam_taxon",
               "accepted_id", "powo_family", "powo_genus", "powo_taxon", "powo_authors",
         "zone"]

for column_label in df_cepam.columns[10:]:
    if column_label == "163MH - Vitreous silica":
            break
    if column_label not in ignore_features:
        cepam_label.append(column_label)
        
new_cepam = []
errors = []

alpha = time.time()
print("Starting CEPAM extraction")
for row in df_cepam.index:
    id_sample, family, genus, taxon = df_cepam["id_sample"][row], df_cepam["cepam_family"][row], df_cepam["cepam_genus"][row], df_cepam["cepam_taxon"][row]
    taxon = taxon.replace(" cf ", " cf. ")
    accepted_id = get_id_by_taxon(taxon, df_powo_name, f"{family} {genus} {taxon} {id_sample}")
    if not accepted_id:
        print(f" {id_sample} => {family} {genus} {taxon}")
        raise("Error, taxon not found in POWO")
    plant_id, accepted_family, accepted_genus, accepted_taxon, accepted_authors = get_plant_by_id(accepted_id, df_powo_name, f"{family} {genus} {taxon} {id_sample}")
    continent, region, area = get_region_by_id(accepted_id, df_dist)
    
    data = []
    for column_label in list(df_cepam.columns[10:]):
        if column_label == "163MH - Vitreous silica":
            break
        if column_label not in ignore_features:
            v = df_cepam[column_label][row]
            if isinstance(v, int):
                data.append("3")
            elif "v" in v:
                data.append("2")
            elif "?" in v:
                data.append("1")
            else:
                data.append("0")
                
    zone = []
    for x,y,z in zip(continent, region, area):
        zone.append(f"({x},{y},{z})")
    temp = [id_sample, family, genus, taxon,
                    plant_id, accepted_family, accepted_genus, accepted_taxon, accepted_authors,
                    "".join(zone)]
    temp.extend(data)
    new_cepam.append(temp)

print(f"Extraction done in {time.time() - alpha} seconds...")
pd.DataFrame(errors, columns=errors_label).to_csv("errors_cepam.csv", index=False, encoding="utf-8")
pd.DataFrame(new_cepam, columns=cepam_label).to_csv("generated_cepam.csv", index=False, encoding="utf-8", sep=";")

Starting CEPAM extraction
Extraction done in 52.531033992767334 seconds...


In [38]:
pd.DataFrame(errors, columns=errors_label).to_csv("errors_cepam.csv", index=False, encoding="utf-8")
pd.DataFrame(new_cepam, columns=cepam_label).to_csv("generated_cepam.csv", index=False, encoding="utf-8", sep=";")

In [10]:
genus = re.search(
    "[A-Z][a-z]+\s(spp\.|sp\.)|[A-Z][a-z]+\s[a-z]+\s(subsp\.|cf\.|var\.)+\s[a-z]+|[A-Z][a-z]+\s[a-z\-]+|[A-Z][a-z]+",
          "LECYTHIDACEAE Oubanguia sp.").group(0)
print(genus)

Oubanguia sp.


In [11]:
df_cepam = pd.read_excel("CEPAM_feature_net.xlsx", sheet_name="CEPAM", header=2,na_filter=False)

In [12]:
experiment = list(df_cepam.columns)[4:]
for x in experiment:
    print(x)
    if len(x.split("-", 1)) > 1:
        number, name = x.split("-", 1)
    else:
        number, name = x.split(chr(8211),1)
    print(f"{number} : named {name}")
    

1 - Growth ring boundaries distinct
1  : named  Growth ring boundaries distinct
2 - Growth ring boundaries indistinct or absent
2  : named  Growth ring boundaries indistinct or absent
3 - Wood ring-porous
3  : named  Wood ring-porous
4 - Wood semi-ring-porous
4  : named  Wood semi-ring-porous
5 - Wood diffuse-porous
5  : named  Wood diffuse-porous
6 - Vessels in tangential bands
6  : named  Vessels in tangential bands
7 - Vessels in diagonal and / or radial pattern
7  : named  Vessels in diagonal and / or radial pattern
8 - Vessels in dendritic pattern
8  : named  Vessels in dendritic pattern
9 - Vessels exclusively solitary (90% or more)
9  : named  Vessels exclusively solitary (90% or more)
10 - Vessels in radial multiples of 4 or more common
10  : named  Vessels in radial multiples of 4 or more common
11 - Vessel clusters common
11  : named  Vessel clusters common
12 - Solitary vessel outline angular
12  : named  Solitary vessel outline angular
13 - Simple perforation plates
13  : n

In [13]:
"27 – Large ≥ 10 µm".split("-")
print(ord("-"))
print(ord("27 – Large ≥ 10 µm"[3]))

45
8211


In [14]:
list(df_iawa.columns[1:])

['1 - Growth ring boundaries distinct',
 '2 - Growth ring boundaries indistinct or absent',
 '3 - Wood ring-porous',
 '4 - Wood semi-ring-porous',
 '5 - Wood diffuse-porous',
 '6 - Vessels in tangential bands',
 '7 - Vessels in diagonal and / or radial pattern',
 '8 - Vessels in dendritic pattern',
 '9 - Vessels exclusively solitary (90% or more)',
 '10 - Vessels in radial multiples of 4 or more common',
 '11 - Vessel clusters common',
 '12MH - Solitary vessel outline angular',
 '13 - Simple perforation plates',
 '14 - Scalariform perforation plates',
 '15 - Scalariform perforation plates with <= 10 bars',
 '16 - Scalariform perforation plates with 10 - 20 bars',
 '17 - Scalariform perforation plates with 20 - 40 bars',
 '18 - Scalariform perforation plates with >= 40 bars',
 '19MH - Reticulate, foraminate, and / or other types of multiple perforation plates',
 '20 - Intervessel pits scalariform',
 '21 - Intervessel pits opposite',
 '22 - Intervessel pits alternate',
 '23MH - Shape of 

In [15]:
vector = []
for row in df_iawa.index:
    data = []
    for column in list(df_iawa.columns[1:]):
        data.append(df_iawa[column][row])
    vector.append(data)

In [52]:
iawa_feature_df = pd.read_csv("hardwood_features.csv", encoding="UTF-8", sep=";")
iawa_feature_df["hardwood_number"] = iawa_feature_df["hardwood_number"].fillna(-1)
iawa_feature_df["softwood_number"] = iawa_feature_df["softwood_number"].fillna(-1)
iawa_feature_df["bart_number"] = iawa_feature_df["bart_number"].fillna(-1)
iawa_feature_df["feature_group"] = iawa_feature_df["feature_group"].fillna(-1)
iawa_feature_df["feature_subgroup"] = iawa_feature_df["feature_subgroup"].fillna(-1)
iawa_feature_df = iawa_feature_df.astype({'hardwood_number': 'int32', 'softwood_number':'int32', 'bart_number':'int32'})

In [80]:
equivalence = dict()
direct_parent = dict()
data = []
label = ["feature_id", "parent_id", "feature_label", "definition", "author", "collection", "number_collection"]
for row in iawa_feature_df.index:
    equivalence[iawa_feature_df["feature_label"][row]] = iawa_feature_df["feature_id"][row]

for row in iawa_feature_df.index:
    group, subgroup = "", ""
    
    if iawa_feature_df["feature_group"][row] != -1:
        group = equivalence[iawa_feature_df["feature_group"][row]]
        
    if iawa_feature_df["feature_subgroup"][row] != -1:
        subgroup = equivalence[iawa_feature_df["feature_subgroup"][row]]

    
    feature_id = iawa_feature_df["feature_id"][row]
    author = iawa_feature_df["author"][row]
    
    if not feature_id in direct_parent.keys():
        direct_parent[feature_id] = subgroup if subgroup != "" else group if group != "" else ""
    if not subgroup in direct_parent.keys():
        direct_parent[subgroup] = group if subgroup != "" and group != "" else ""
    direct_parent[group] = ""
    
    parent_id = direct_parent[iawa_feature_df["feature_id"][row]]
    if not iawa_feature_df["hardwood_number"][row] == -1:
        feature_number = iawa_feature_df["hardwood_number"][row]
        collection = "Hardwood"
        data.append([feature_id, parent_id, iawa_feature_df["feature_label"][row], iawa_feature_df["definition"][row], iawa_feature_df["author"][row], collection, feature_number])
    if not iawa_feature_df["softwood_number"][row] == -1:
        feature_number = iawa_feature_df["softwood_number"][row]
        collection = "Softwood"
        data.append([feature_id, parent_id, iawa_feature_df["feature_label"][row], iawa_feature_df["definition"][row], iawa_feature_df["author"][row],collection, feature_number])
    if not iawa_feature_df["bart_number"][row] == -1:
        feature_number = iawa_feature_df["bart_number"][row]
        collection = "Bark"
        data.append([feature_id, parent_id, iawa_feature_df["feature_label"][row], iawa_feature_df["definition"][row], iawa_feature_df["author"][row],collection, feature_number])

    if iawa_feature_df["feature_group"][row] == -1 and iawa_feature_df["feature_subgroup"][row] == -1:
        feature_number = ""
        collection = ""
        data.append([feature_id, parent_id, iawa_feature_df["feature_label"][row], iawa_feature_df["definition"][row], iawa_feature_df["author"][row],collection, feature_number])
    
result_df = pd.DataFrame(data, columns=label)
result_df.to_csv("feature_list.csv", index=False, encoding="utf-8", sep=";")

In [118]:
df_iawa = pd.read_csv("generated_iawa.csv", sep=";", header=0, encoding="utf-8")
df_cepam = pd.read_csv("generated_cepam.csv", sep=";", header=0, encoding="utf-8")

In [125]:
data = []
label = ["sample_id", "taxon_id", "collection", "id_feature", "feature_value", "FOI", "original_data"]
feature_value = {3: "IFV-2", 2: "IFV-3", 1 : "IFV-4", 0: ""}
sample_id = 0
for row in df_iawa.index:
    sample_id += 1
    taxon_id = df_iawa["accepted_id"][row]
    collection = "Inside Wood"
    original_value = df_iawa["InsideWood_line"][row]
    
    for col_name in df_iawa.columns[7:]:
        feature_number = col_name.replace("MH -", " -").replace("Â", "").split("-")[0]
        if df_iawa[col_name][row] == 0:
            continue
        feature_id = iawa_feature_df[iawa_feature_df["hardwood_number"] == int(feature_number)]["feature_id"].values[0]
        data.append([f"IW-{sample_id}", taxon_id, "Inside Wood", feature_id, feature_value[df_iawa[col_name][row]],direct_parent[direct_parent[feature_id]], df_iawa["InsideWood_line"][row]])

generated_df = pd.DataFrame(data, columns=label)
generated_df.to_csv("insidewood.csv", encoding="utf-8", index=False, sep=";")
    

In [126]:
data = []
label = ["sample_id", "collected_at","taxon_id", "collection", "id_feature", "feature_value", "FOI", "original_data"]
feature_value = {3: "IFV-2", 2: "IFV-3", 1 : "IFV-4", 0: ""}
for row in df_cepam.index:
    taxon_id = df_cepam["accepted_id"][row]
    collection = "Inside Wood"
    
    for col_name in df_cepam.columns[11:]:
        feature_number = col_name.replace("MH -", " -").replace("Â", "").split("-")[0]
        if df_cepam[col_name][row] == 0:
            continue
        feature_id = iawa_feature_df[iawa_feature_df["hardwood_number"] == int(feature_number)]["feature_id"].values[0]
        original_str = f'{df_cepam["id_sample"][row]} - {df_cepam["cepam_family"][row]} { df_cepam["cepam_genus"][row]} {df_cepam["cepam_taxon"][row]}'
        data.append([df_cepam["id_sample"][row], df_cepam["collectedAt"][row], taxon_id, "SACHA", feature_id, feature_value[df_cepam[col_name][row]],direct_parent[direct_parent[feature_id]], original_str])

generated_df = pd.DataFrame(data, columns=label)
generated_df.to_csv("sacha.csv", encoding="utf-8", index=False, sep=";")