In [159]:
import pandas as pd

# Load the raw field observations exactly as recorded.
df_raw = pd.read_csv("data/raw_data/lichen_moss_coverage_data.csv")

# Work on an independent copy. A plain `df_processed = df_raw` would only
# create a second name for the same DataFrame, so every column written to
# df_processed in the cells below would also overwrite df_raw, and re-running
# those cells would transform already-transformed values.
df_processed = df_raw.copy()

In [160]:
# the "total coverage" is the total amount of lichen AND moss covering the
# surface of a rock
# the "moss fraction" is the fraction OF THE TOTAL that is moss.
# so it's a percentage of a percentage (we recorded it that way because
# it was methodologically easier)
# example: 10% total cover, 40% moss fraction
# 40% of 10% = 4%
# therefore there is 4% moss, and the 6% remaining is lichen
# the code below processes our values into simply "moss coverage" and
# "lichen coverage"

# total_cover and moss_fraction are recorded as percentages, so rescale both
# by 1/100 before combining them
total_cover = df_raw['total_cover'].div(100)
moss_fraction = df_raw['moss_fraction'].div(100)

# moss is its share of the total cover; lichen is whatever cover remains
moss_cover = total_cover.mul(moss_fraction)
lichen_cover = total_cover.sub(moss_cover)

# store the rescaled total and the two derived columns on the processed frame
for column_name, column_values in [
    ('total_cover', total_cover),
    ('moss_cover', moss_cover),
    ('lichen_cover', lichen_cover),
]:
    df_processed[column_name] = column_values


In [161]:
# turn each lichen type column into a boolean "has_<type>" flag, then remove
# the original columns now that the flags replace them
# NOTE(review): astype(bool) maps every non-zero value (NaN included) to
# True -- confirm these columns hold only 0/1 with no missing values
lichen_types = ['crustose', 'foliose', 'fruticose']
for lichen_type in lichen_types:
    df_processed[f'has_{lichen_type}'] = df_raw[lichen_type].astype(bool)

df_processed = df_processed.drop(columns=lichen_types)

In [162]:
# convert the comma-separated strings in "species" to lists of strings
# - each entry is stripped on its own, so "a, b" gives ['a', 'b'] and not
#   ['a', ' b'] (which would later create a bogus "has_ b" column)
# - empty entries are dropped, and a missing cell becomes an empty list
#   instead of raising on a NaN
# - the Series reuses df_raw's index so the assignment below lines up row for
#   row even if the frame does not have a default 0..n-1 index
species = pd.Series(
    [
        [name for name in map(str.strip, entry.split(',')) if name]
        for entry in df_raw['species'].fillna('')
    ],
    index=df_raw.index,
)
df_processed['species'] = species


In [163]:
# get a set with all the unique species we have observed
all_species = {
    observed
    for species_list in df_processed['species']
    for observed in species_list
}
print(f'all species: {all_species}')

# add one boolean "has_{species}" column per observed species, marking the
# rows whose species list contains it.
# The set is iterated in sorted order because the iteration order of a set of
# strings can change from one Python session to the next, which would shuffle
# the column order of the processed table between runs.
# The loop variable is deliberately not called `species`, so the `species`
# Series built in the previous cell is not overwritten.
for species_code in sorted(all_species):
    df_processed[f'has_{species_code}'] = [
        species_code in species_list
        for species_list in df_processed['species']
    ]

all species: {'W_f', 'GR_c', 'Bl_f', 'R_c', 'Bl_c', 'W_fr', 'O_c', 'W_c', 'Y_f', 'G_c'}


In [164]:
# moss_fraction was only needed to derive moss_cover and lichen_cover, so it
# is removed from the processed table
df_processed = df_processed.drop('moss_fraction', axis=1)

In [166]:
# write the processed table to disk; the row index is kept and ends up as the
# first (unnamed) column of the CSV
processed_path = 'data/processed_data/coverage_data_processed.csv'
df_processed.to_csv(processed_path, index=True)