In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import sys
import logging
import os
# Select the fertilizer target for this run. Later cells read the value back
# through os.getenv('TARGET') (config loading, prediction and merge loops).
os.environ['TARGET'] = 'K2O_avg_app'

# Append the parent of the current working directory and its 'source' subfolder
# to the import path. This has to happen before the project-local imports below
# (presumably that is where those packages live -- verify against the repo layout).
sys.path.append(Path('.').absolute().parent.resolve().as_posix())
sys.path.append((Path('.').absolute().parent / 'source').resolve().as_posix())
from data_loader import config_loader, data_preprocessing
from logging_util.logger import get_logger
from models import estimators
from evaluation import evaluation

# Load the configuration for the fertilizer selected through the TARGET
# environment variable and create the notebook's logger.
config = config_loader.load_config(fertilizer=os.getenv('TARGET'))
logger = get_logger(__name__)

# Only let WARNING and above through for these third-party loggers.
for _noisy_logger in ("PIL", 'matplotlib'):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)

In [None]:
# Results directory: the 'results_corrected' folder next to the (resolved)
# current working directory.

resultspath = Path.cwd().resolve().parent / 'results_corrected'
resultspath

In [3]:
# function to initialize estimator
def init_estimator(estimator_name: str = 'HistGradientBoostRegressor', resultspath: Path = resultspath, fold: int = 0):
    """Instantiate an estimator wrapper and load its persisted state for one fold.

    Args:
        estimator_name: Key into the project's ``estimators`` registry.
        resultspath: Passed to the estimator constructor as ``outpath``.
        fold: Passed to the estimator's ``load`` method as ``it``.

    Returns:
        Whatever ``load`` returns (the loaded estimator wrapper).
    """
    blank_estimator = estimators[estimator_name](outpath=resultspath)
    # The freshly constructed wrapper knows its own output_path; load from there.
    return blank_estimator.load(path=blank_estimator.output_path, it=fold)

# function to make predictions
def make_predictions(input_data: pd.DataFrame, target: str = 'N_avg_app', estimator_name: str = 'HistGradientBoostRegressor', resultspath: Path = resultspath, fold: int = 0) -> tuple:
    """Predict ``target`` for every row of ``input_data`` and save the result.

    The estimator for ``fold`` is loaded from ``resultspath / target``. The
    input data joined with the new ``predicted_<target>`` column is written to
    ``resultspath / target / <estimator.abbrev> / full_predictions_<fold>.csv``.

    Args:
        input_data: Feature frame handed to the model's ``predict``.
        target: Name of the predicted quantity; used for paths and column name.
        estimator_name: Key of the estimator to load.
        resultspath: Root directory of the results.
        fold: Index of the fold whose fitted model is used.

    Returns:
        Tuple ``(y_pred, dataset_with_preds)``: the prediction series (indexed
        like ``input_data``) and ``input_data`` joined with that series.
    """
    target_dir = resultspath / target
    fitted = init_estimator(estimator_name, target_dir, fold)

    predicted_values = fitted.model.predict(input_data)
    y_pred = pd.Series(predicted_values, index=input_data.index, name=f'predicted_{target}')

    dataset_with_preds = input_data.join(y_pred)
    output_file = target_dir / fitted.abbrev / f'full_predictions_{fold}.csv'
    dataset_with_preds.to_csv(output_file)

    return y_pred, dataset_with_preds

# all_data = data_preprocessing.load_all_data()
# all_data = data_preprocessing.one_hot_encoding(all_data)
# make_predictions(input_data=all_data, target=target, estimator_name=estimator_name, resultspath=resultspath, fold=0)

In [4]:
# Predict the selected target on the full, one-hot encoded dataset with every
# registered estimator and both folds.
all_data = data_preprocessing.load_all_data()
all_data = data_preprocessing.one_hot_encoding(all_data)
# Iterate over the registry keys only: unpacking `.items()` into `_` would rebind
# `_`, which IPython uses for the most recent cell output.
for estimator_name in estimators:
    # To cover every fertilizer, loop here over
    # ['N_avg_app', 'P2O5_avg_app', 'K2O_avg_app'] instead of reading TARGET.
    for fold in [0, 1]:
        make_predictions(input_data=all_data, target=os.getenv('TARGET'), estimator_name=estimator_name, resultspath=resultspath, fold=fold)

In [None]:
def _fold_weights(r2_0: float, r2_1: float) -> tuple:
    """Turn the two per-fold R2 scores into blending weights that sum to 1.

    Scores that are not strictly positive (including NaN) count as 0, so a fold
    never gets a negative weight. If no positive score is left, both folds are
    weighted equally instead of dividing by zero.
    """
    usable = [score if score > 0 else 0.0 for score in (float(r2_0), float(r2_1))]
    total = usable[0] + usable[1]
    if total == 0:
        return 0.5, 0.5
    return usable[0] / total, usable[1] / total


def _row_key(frame: pd.DataFrame, target: str) -> pd.Series:
    """Build a string key per row from area code, year, crop code and target value.

    The parts are joined with '|' so adjacent fields cannot run into each other
    (without a separator, crop code 1 + value 15.0 equals crop code 11 + value 5.0).
    """
    parts = [frame[column].astype(str) for column in ('FAOStat_area_code', 'Year', 'Crop_Code', target)]
    key = parts[0]
    for part in parts[1:]:
        key = key + '|' + part
    return key


def merge_folds(target: str = 'N_avg_app', estimator_abbrev: str = 'HGB', resultspath: Path = resultspath) -> None:
    """Merge the folds of a model into a single dataframe.

    Rows that are not fully labeled (any of the three fertilizer columns is
    missing, or ``N_avg_app`` is not below 5000) receive an R2-weighted blend of
    the two folds' full-dataset predictions. All other rows are matched against
    the two test sets via a composite key and receive the prediction stored for
    that test row. Negative predictions are clipped to 0, and the three
    fertilizer columns are dropped before saving.

    Reads, below ``resultspath / target``:
        ``<estimator_abbrev>/full_predictions_{0,1}.csv``,
        ``<estimator_abbrev>/predictions_{0,1}.csv``,
        ``data/test_set_{0,1}.csv`` and ``performance_metrics_test.csv``.
    Writes ``<estimator_abbrev>/full_predictions_merged.csv``.

    Args:
        target: Fertilizer column the predictions refer to.
        estimator_abbrev: Abbreviation of the estimator (sub-directory name and
            value in the metrics table).
        resultspath: Root directory of the results.

    Raises:
        IndexError: If the metrics file has no R2 row for ``estimator_abbrev``.
    """
    target_dir = resultspath / target
    model_dir = target_dir / estimator_abbrev
    pred_col = f'predicted_{target}'

    # The first CSV column is the row index written by make_predictions. Restore
    # it so the label-based .loc lookups below line up with all_data's index
    # rather than with a fresh 0..n-1 range.
    preds_0 = pd.read_csv(model_dir / 'full_predictions_0.csv', index_col=0)
    preds_1 = pd.read_csv(model_dir / 'full_predictions_1.csv', index_col=0)

    test_0 = pd.read_csv(target_dir / 'data' / 'test_set_0.csv')
    test_1 = pd.read_csv(target_dir / 'data' / 'test_set_1.csv')

    testpreds_0 = pd.read_csv(model_dir / 'predictions_0.csv')
    testpreds_1 = pd.read_csv(model_dir / 'predictions_1.csv')

    # get r2 values for each fold
    metrics = pd.read_csv(target_dir / 'performance_metrics_test.csv')
    metrics.columns = ['metric', 'estimator_abbrev', 'mean', 'std', '0', '1']
    is_r2_row = (metrics['metric'] == 'R2') & (metrics['estimator_abbrev'] == estimator_abbrev)
    r2_0, r2_1 = metrics.loc[is_r2_row, ['0', '1']].to_numpy()[0]

    # load original dataset
    # NOTE(review): `config` is the notebook-level object loaded for
    # os.getenv('TARGET'), not for `target` -- confirm this is fine if they differ.
    dtypes = data_preprocessing.get_data_types(config=config)
    for fertilizer in ('N_avg_app', 'P2O5_avg_app', 'K2O_avg_app'):
        dtypes[fertilizer] = np.float64
    all_data = data_preprocessing.load_all_data(config=config, dtype=dtypes)
    final_data = all_data.copy()

    # find indices where NOT (all three fertilizers are known and N_avg_app < 5000)
    fully_labeled = (
        all_data['N_avg_app'].notna()
        & all_data['P2O5_avg_app'].notna()
        & all_data['K2O_avg_app'].notna()
        & (all_data['N_avg_app'] < 5000)
    )
    idx_unlabeled_fertilizers = all_data.loc[~fully_labeled].index

    # merge predictions for samples not in test/train sets
    if not (r2_0 > 0 and r2_1 > 0):
        print(f'WARNING: non-positive or missing R2 for {target},{estimator_abbrev} '
              f'(fold 0: {r2_0}, fold 1: {r2_1}); such folds get weight 0, equal weights if none is positive')
    weight_0, weight_1 = _fold_weights(r2_0, r2_1)
    weighted_preds = (
        weight_0 * preds_0.loc[idx_unlabeled_fertilizers, pred_col]
        + weight_1 * preds_1.loc[idx_unlabeled_fertilizers, pred_col]
    )
    final_data.loc[idx_unlabeled_fertilizers, pred_col] = weighted_preds

    ## Test/Train predictions

    # The test-set files do not share final_data's index, so rows are matched
    # through a composite key of 'FAOStat_area_code', 'Year', 'Crop_Code' and the
    # target value, used as a temporary index on both sides.
    final_data['temp'] = _row_key(final_data, target)
    final_data = final_data.set_index('temp')
    for test_set, test_preds in ((test_0, testpreds_0), (test_1, testpreds_1)):
        # add preds to the test set (aligned on the default row index of both files)
        test_with_preds = test_set.copy()
        test_with_preds[pred_col] = test_preds[pred_col]
        test_with_preds['temp'] = _row_key(test_with_preds, target)
        test_with_preds = test_with_preds.set_index('temp')
        final_data.loc[test_with_preds.index, pred_col] = test_with_preds[pred_col]
    final_data = final_data.reset_index().drop(columns=['temp', 'N_avg_app', 'P2O5_avg_app', 'K2O_avg_app'])

    # if any values in the final dataset are < 0, set them to 0
    final_data[pred_col] = final_data[pred_col].clip(lower=0)

    # sanity check: are all predictions filled in?
    n_missing = final_data[pred_col].isna().sum()
    if n_missing > 0:
        print(f'WARNING: {n_missing} predictions are missing for {target},{estimator_abbrev}')

    # save the merged predictions
    final_data.to_csv(model_dir / 'full_predictions_merged.csv')


In [None]:
# Merge the two folds for each estimator abbreviation, using the fertilizer
# selected through the TARGET environment variable.
# (To process every fertilizer, loop over
# ['N_avg_app', 'P2O5_avg_app', 'K2O_avg_app'] instead.)
for estimator_abbrev in ('HGB', 'XGB'):
    merge_folds(
        target=os.getenv('TARGET'),
        estimator_abbrev=estimator_abbrev,
        resultspath=resultspath,
    )