In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import sys
import logging
from pprint import pprint

from sklearn.model_selection import KFold

# Extend the module search path with the parent of the current working
# directory and with that parent's `source` subdirectory.
# NOTE(review): presumably the project packages imported right below live
# there — confirm against the repository layout.
sys.path.append(Path('.').absolute().parent.resolve().as_posix())
sys.path.append((Path('.').absolute().parent / 'source').resolve().as_posix())
from data_loader import config_loader, data_preprocessing
from logging_util.logger import get_logger
from models import estimators
from evaluation import evaluation

# Load the project configuration and create a logger named after this module
config = config_loader.load_config()
logger = get_logger(__name__)

# Raise the level of the PIL and matplotlib loggers to WARNING
for _logger_name in ("PIL", 'matplotlib'):
    logging.getLogger(_logger_name).setLevel(logging.WARNING)

In [None]:
# show the imported `estimators` object as this cell's output
estimators

In [3]:
# Name of the target column. It is also stored on the config because
# make_prediction looks it up as `config.target`.
target = 'K2O_avg_app'
config.target = target

In [None]:
# Results directory for the selected target:
# <parent of the current working directory>/results/<target>
_parent_dir = Path().resolve().parent.resolve()
resultspath = _parent_dir / 'results' / target
resultspath

In [None]:
# function to one hot encode the input data
def ohe(input_data, config):
    """One-hot encode ``input_data`` with the project's preprocessing helper.

    Parameters
    ----------
    input_data
        Feature table to encode; it is passed unchanged to
        ``data_preprocessing.one_hot_encoding``.
    config
        Accepted so existing callers keep working; the encoding itself does
        not read it.

    Returns
    -------
    The value returned by ``data_preprocessing.one_hot_encoding(input_data)``.
    """
    # Only `input_data` is handed to the encoder, so the complete feature CSV
    # is not loaded here: reading it on every call was pure overhead and made
    # the function fail whenever that file was unavailable.
    # NOTE(review): if the encoding must know every category of the full
    # dataset, `one_hot_encoding` needs to receive that data explicitly —
    # confirm against its implementation.
    return data_preprocessing.one_hot_encoding(input_data)

# function to load the input data
def load_input_data(resultspath: Path, it: int = 0, test: bool = False, config=config) -> pd.DataFrame:
    """Load a stored train or test split from the results directory.

    Parameters
    ----------
    resultspath
        Directory that contains the ``data`` folder with the stored splits.
    it
        Number used in the file name (``..._set_<it>.csv``).
    test
        If true, ``test_set_<it>.csv`` is read, otherwise
        ``train_set_<it>.csv``.
    config
        Passed to ``data_preprocessing.get_data_types`` to obtain the column
        dtypes. The default is the module-level ``config`` as it exists when
        this function is defined.

    Returns
    -------
    pd.DataFrame
        The CSV content, with the first CSV column used as the index.
    """
    split = "test" if test else "train"
    dtype = data_preprocessing.get_data_types(config)
    csv_path = resultspath / 'data' / f'{split}_set_{it}.csv'
    return pd.read_csv(csv_path, index_col=0, dtype=dtype)

# function to make the predictions
def make_prediction(estimator, X, config):
    """Predict the target for ``X`` with an already loaded estimator.

    The column named ``config.target`` is removed from ``X`` when present,
    the remaining columns are one-hot encoded with ``ohe`` and passed to
    ``estimator.model.predict``.

    Returns a ``pd.Series`` named ``'predicted_' + config.target`` that
    carries the index of the encoded feature table.
    """
    target_column = config.target
    # the target is removed by name, wherever it sits in the table
    features = X.drop(target_column, axis=1) if target_column in X.columns else X
    features = ohe(features, config)
    predictions = estimator.model.predict(features)
    return pd.Series(
        predictions,
        index=features.index,
        name='predicted_' + target_column,
    )

# Sanity check over all estimators: reload each one (iteration 0), recompute
# its predictions on the stored test split and compare them with the `y_pred`
# attribute of the loaded estimator.
for estimator_name, estimator_cls in estimators.items():
    print(estimator_name)
    # instantiate first so that `output_path` is available, then replace the
    # instance with whatever `load` returns for iteration 0
    estimator = estimator_cls(outpath=resultspath)
    estimator = estimator.load(path=estimator.output_path, it=0)
    X = load_input_data(resultspath, it=0, test=True, config=config)    
    y_pred = make_prediction(estimator, X, config)

    # prints True if at least one recomputed prediction differs from
    # estimator.y_pred, False if all of them are equal
    print((y_pred != estimator.y_pred).any())



In [None]:
# Predict on the complete dataset, using only the estimators whose name
# contains 'Hist' (referred to as HGB in this notebook)
full_dataset = data_preprocessing.load_all_data(config=config)

for estimator_name, estimator_cls in estimators.items():
    # every other estimator is skipped
    if 'Hist' not in estimator_name:
        continue
    print(estimator_name)
    estimator = estimator_cls(outpath=resultspath)
    estimator = estimator.load(path=estimator.output_path, it=0)
    y_pred = make_prediction(estimator, full_dataset, config)

In [11]:
# Add the predictions to the full dataset and save the combined table.
# The join result goes into a new variable instead of overwriting
# `full_dataset`, so this cell can be re-run without trying to add the
# prediction column a second time, and the saved file contains the
# predictions on a fresh kernel as well.
# NOTE(review): `y_pred` is whatever the last prediction cell assigned;
# rows of `full_dataset` without a matching index entry get NaN.
it = 0
full_dataset_with_predictions = full_dataset.join(y_pred)
full_dataset_with_predictions.to_csv(resultspath / f'{target}_{it}_predictions.csv')