## Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, RFECV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, IsolationForest
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM
import matplotlib.pyplot as plt
import miceforest as mf

In [None]:
# Locations of the raw field measurements and of the imputed output file.
data_dir = '../data/'
field_data_file = f'{data_dir}field_data.csv'
results_file = f'{data_dir}field_data_imputed.csv'

## Helper functions

In [None]:
def compute_agb(row):
    """Return the AGB value for one tree from its diameter and species group.

    ``row`` must support item access for ``'diameter'`` and ``'group'``
    (for example a DataFrame row or a dict). Timber uses a quadratic in the
    diameter; every other group uses ``scale * diameter ** exponent``, with a
    generic coefficient pair for groups that are not listed explicitly.
    """
    dbh = row['diameter']
    group = row['group']

    # Timber is the only group that is not a simple power law.
    if group == 'timber':
        return 21.3 - 6.95 * dbh + 0.74 * dbh ** 2

    # (scale, exponent) per group; 'fruit' and 'citrus' share the same pair.
    power_law_by_group = {
        'banana': (0.03, 2.13),
        'cacao': (0.1208, 1.98),
        'fruit': (0.0776, 2.64),
        'citrus': (0.0776, 2.64),
    }
    scale, exponent = power_law_by_group.get(group, (0.1466, 2.223))
    return scale * dbh ** exponent

In [None]:
def compute_carbon(agb, rsr=0.22):
    """Convert AGB to carbon by dividing it by ``2 * (1 - rsr)``.

    With the default ``rsr`` of 0.22 this is roughly ``agb * 0.64``.
    Works on scalars as well as on array-like objects supporting division.
    """
    divisor = 2 * (1 - rsr)
    return agb / divisor

In [None]:
def make_categorical(df, column_name):
    """Replace a column with integer category codes, in place.

    Args:
        df: DataFrame to modify. The column is overwritten on this object.
        column_name: Name of the column to encode.

    Returns:
        The same ``df`` object, with ``column_name`` holding the codes
        produced by ``pd.Categorical`` (missing values become -1).
    """
    # pd.Categorical returns a Categorical, which exposes `.codes` directly;
    # the `.cat` accessor only exists on Series and raised AttributeError here.
    df[column_name] = pd.Categorical(df[column_name]).codes
    return df

In [None]:
def make_one_hot(df, column_name):
    """Return a copy of ``df`` with one-hot indicator columns appended.

    One indicator column is added per distinct value of ``column_name``,
    named ``'<column_name>_<value>'``. The source column itself is kept and
    the input DataFrame is not modified.
    """
    indicators = pd.get_dummies(df[column_name])
    indicators.columns = [f'{column_name}_{label}' for label in indicators.columns]
    return pd.concat([df, indicators], axis=1)

In [None]:
def load_data(file_name = 'field_data.csv', modify_columns=True, verbose=False):
    """Read a field-data CSV from ``../data/`` and optionally prepare it for modelling.

    Args:
        file_name: CSV file name inside ``../data/``.
        modify_columns: When true, drop the location / derived columns, shift
            ``year`` so that 2016 becomes 0, cast ``height`` to float, turn
            ``plot_id`` into an integer by stripping its first character, and
            replace ``name``, ``group`` and ``plot_id`` with one-hot columns.
        verbose: When true, print the resulting DataFrame.

    Returns:
        The loaded (and possibly transformed) DataFrame. Diameters equal to 0
        are always converted to NaN.
    """
    def zero_to_nan(value):
        # A diameter of 0 is treated as "not measured".
        return np.nan if value == 0 else float(value)

    frame = pd.read_csv('../data/' + file_name, na_values='')
    frame['diameter'] = frame['diameter'].map(zero_to_nan)

    if modify_columns:
        unused_columns = ['lat', 'lon', 'site', 'X', 'Y', 'updated diameter', 'AGB', 'carbon']
        frame = frame.drop(columns=unused_columns)
        frame['year'] = frame['year'].map(lambda year: int(year) - 2016)
        frame['height'] = frame['height'].map(float)
        # plot_id values carry a one-character prefix in front of the number.
        frame['plot_id'] = frame['plot_id'].map(lambda plot: int(plot[1:]))

        encoded_columns = ['name', 'group', 'plot_id']
        for encoded_column in encoded_columns:
            frame = make_one_hot(frame, encoded_column)
        frame = frame.drop(columns=encoded_columns)

    if verbose:
        print(frame)
    return frame

In [None]:
def save_data(clean_df, verbose=False):
    """Write the cleaned diameters plus recomputed AGB and carbon to ``results_file``.

    The raw file at ``field_data_file`` is re-read so that all original
    columns (except ``'updated diameter'``) are kept; only ``diameter``,
    ``AGB`` and ``carbon`` are replaced.

    Args:
        clean_df: DataFrame providing the final ``'diameter'`` column.
        verbose: When true, print the DataFrame that was written.
    """
    output_df = pd.read_csv(field_data_file, na_values='').drop(columns=['updated diameter'])
    # NOTE: column assignment aligns on the index, so rows that are absent
    # from clean_df receive NaN for diameter (and hence for AGB / carbon).
    output_df['diameter'] = clean_df['diameter']
    output_df['AGB'] = output_df.apply(compute_agb, axis=1)
    output_df['carbon'] = compute_carbon(output_df['AGB'])
    output_df.to_csv(results_file, index=False)
    if verbose:
        print(output_df)

## Explore missing values

In [None]:
# Read the raw data with zeros treated as missing, then count the usable measurements.
df = pd.read_csv('../data/field_data.csv', na_values=0)
print(df['height'].gt(0).sum())  # Number of height values in dataset
print(df['diameter'].gt(0).sum())  # Number of diameter values in dataset

In [None]:
# Per species group: share of all trees, tree count, and how many trees have / lack a diameter.
# The total is taken from the data instead of the hard-coded 4663 so the
# percentages stay correct when the dataset changes.
total_trees = len(df)
all_species_group_names = sorted(df.group.unique())
for species_group_name in all_species_group_names:
    group_rows = df[df.group == species_group_name]  # select the group's rows once
    print(species_group_name)
    print(f'{len(group_rows) / total_trees * 100}%')  # Percentage of trees in this group
    print(len(group_rows))  # Number of trees in this group
    print((group_rows['diameter'] > 0).sum())  # Number of trees in this group with diameter values
    print(group_rows['diameter'].isna().sum())  # Same, but without diameter
    print('=' * 100)

In [None]:
# Per species: share of all trees, tree count, and how many trees have / lack a diameter.
# The total is taken from the data instead of the hard-coded 4663 so the
# percentages stay correct when the dataset changes.
total_trees = len(df)
all_species_names = sorted(df['name'].unique())
for species_name in all_species_names:
    species_rows = df[df['name'] == species_name]  # select the species' rows once
    print(species_name)
    print(f'{len(species_rows) / total_trees * 100}%')  # Percentage of trees in this species
    print(len(species_rows))  # Number of trees in this species
    print((species_rows['diameter'] > 0).sum())  # Number of trees in this species with diameter values
    print(species_rows['diameter'].isna().sum())  # Same, but without diameter
    print('=' * 100)

## Explore outliers

In [None]:
# Summary statistics and the extreme ends of each numeric column, to eyeball outliers.
for column_name in ['diameter', 'AGB']:
    sorted_column = df[column_name].dropna().sort_values()
    title = column_name[0].capitalize() + column_name[1:]
    lowest, highest = sorted_column.min(), sorted_column.max()
    average, spread = sorted_column.mean(), sorted_column.std()
    print(f'{title} values:\n')
    print(f'min={lowest}, max={highest}, mean={average}, std={spread}\n')
    print(sorted_column.iloc[:20].to_numpy(), '\n')  # Smallest 20 values
    print(sorted_column.iloc[-20:].to_numpy(), '\n')  # Largest 20 values
    print('=' * 100, '\n')

## Start data preprocessing

In [None]:
# Reload the default field data file with load_data's default column preprocessing.
df = load_data()

## Imputation

In [None]:
def impute(df, method='simple', strategy='median', n_neighbors=10, weights='uniform', metric='nan_euclidean',
           save_all_iterations=False, iterations=100, n_estimators=100, verbose=False):
    """Fill in the missing values of ``df`` with the selected imputation method.

    Args:
        df: DataFrame containing missing values.
        method: ``'simple'`` (``SimpleImputer``), ``'knn'`` (``KNNImputer``)
            or ``'mice'`` (miceforest ``ImputationKernel``).
        strategy: Strategy passed to ``SimpleImputer`` (``'simple'`` only).
        n_neighbors, weights, metric: Passed to ``KNNImputer`` (``'knn'`` only).
        save_all_iterations: Passed to ``ImputationKernel`` (``'mice'`` only).
        iterations, n_estimators: Passed to ``ImputationKernel.mice``
            (``'mice'`` only).
        verbose: When true, print the imputed DataFrame.

    Returns:
        The imputed DataFrame. For an unknown ``method`` a message is printed
        and ``df`` is returned unchanged.
    """
    if method == 'mice':
        # Forward the caller's settings; they were previously hard-coded here.
        imputer = mf.ImputationKernel(df, save_all_iterations=save_all_iterations)
        imputer.mice(iterations=iterations, n_estimators=n_estimators, n_jobs=-1)
        imputed_df = imputer.complete_data()
    else:
        if method == 'simple':
            # Honour `strategy`; a bare SimpleImputer() would silently use its
            # own default instead of the one declared in this signature.
            imputer = SimpleImputer(strategy=strategy)
        elif method == 'knn':
            imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights, metric=metric)
        else:
            print(f'Unknown imputation method "{method}". Valid options: simple, knn, mice.')
            return df
        # fit_transform returns a bare array: restore column labels and keep the
        # original index so later index-aligned assignments match the input rows.
        imputed_df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)
    if verbose:
        print(imputed_df)
    return imputed_df

In [None]:
# Fill in missing values using impute's default settings (method='simple').
imputed_df = impute(df)

## Outlier detection

In [None]:
def outlier_removal(df, method='simple', n_estimators=100, max_features=1.0, n_jobs=-1, verbose=False):
    """Return ``df`` without the rows flagged as outliers.

    Args:
        df: DataFrame to filter. The ``'simple'`` method needs a
            ``'diameter'`` column.
        method: ``'simple'`` keeps rows with ``1.5 < diameter < 30``;
            ``'isolation_forests'`` fits an ``IsolationForest`` on ``df`` and
            keeps the rows it predicts as inliers.
        n_estimators, max_features, n_jobs: Passed to ``IsolationForest``
            (``'isolation_forests'`` only).
        verbose: When true, print the filtered DataFrame.

    Returns:
        The filtered DataFrame. For an unknown ``method`` a message is printed
        and ``df`` is returned unchanged.
    """
    if method == 'simple':
        in_range = (df['diameter'] > 1.5) & (df['diameter'] < 30)
        clean_df = df[in_range]
    elif method == 'isolation_forests':
        isolation_forest = IsolationForest(n_estimators=n_estimators, max_features=max_features, n_jobs=n_jobs).fit(df)
        # predict() labels inliers as 1 and outliers as -1.
        labels = isolation_forest.predict(df)
        clean_df = df[labels == 1]
    else:
        print(f'Unknown outlier detection method "{method}". Valid options: simple, isolation_forests.')
        return df
    if verbose:
        # Print the filtered result, not the unrelated global `imputed_df`.
        print(clean_df)
    return clean_df

In [None]:
# Drop outliers with the default 'simple' method, which keeps rows where 1.5 < diameter < 30.
clean_df = outlier_removal(imputed_df)

## Save final data

In [None]:
# Write the cleaned diameters, with recomputed AGB and carbon columns, to results_file.
save_data(clean_df)