In [1]:
import pandas as pd
import logging
from pathlib import Path
import sys
import numpy as np
import matplotlib.pyplot as plt

sys.path.append(Path('.').absolute().parent.resolve().as_posix())
from data_loader import config_loader, data_preprocessing
from logging_util.logger import get_logger
from models import estimators

2024-12-19 15:10:38,995 - INFO - __init__.py:19 - Loading RandomForestRegressor


In [2]:
# Project configuration and a notebook-level logger.
config = config_loader.load_config()
logger = get_logger(__name__)

# Raise the threshold of chatty third-party loggers so only warnings and
# errors from them reach the notebook output.
for noisy_logger in ('shap', 'matplotlib'):
    logging.getLogger(noisy_logger).setLevel(logging.WARNING)

In [3]:
# Targets processed by this notebook; each one is looked up in `model_selection`.
targets = [
    'DTIC_mass2',
]

# Per target: [short model code, name of the results sub-folder].
# The sub-folder is the output of target_prediction.ipynb for the chosen model.
model_selection = {
    'DSIC_mass2': [
        'RF',
        'Round_[4, 5, 6, 7, 8, 9, 10]_without_db_december2024',
    ],
    'DTIC_mass2': [
        'RF',
        'december2024',
    ],
}

In [4]:
# Number of folds in the outer loop; the processing loop iterates fold
# indices 0..k-1 and reads `test_set_{i}.csv` plus the saved model of fold i.
k = 10

In [5]:
def get_dataset(target, inpath, i):
    """Load the saved test split of fold ``i`` and wrap it in a ``DataSet``.

    Reads ``inpath / 'kfoldcv' / 'data' / f'test_set_{i}.csv'`` (the first CSV
    column becomes the index) and separates the target column from all
    remaining columns.

    Parameters
    ----------
    target : str
        Name of the target column in the CSV.
    inpath : pathlib.Path
        Results directory that contains the ``kfoldcv/data`` folder.
    i : int
        Fold index used in the CSV file name.

    Returns
    -------
    data_preprocessing.DataSet
        ``X_test`` holds every column except ``target`` and ``y_test`` holds
        the ``target`` column. ``X_train`` / ``y_train`` are empty
        placeholders because only the test split is loaded here.

    Raises
    ------
    KeyError
        If ``target`` is not a column of the CSV file.
    """
    csv_path = inpath / 'kfoldcv' / 'data' / f'test_set_{i}.csv'
    test_df = pd.read_csv(csv_path, index_col=0)

    # Fail with a message that names the file; otherwise the feature list would
    # silently contain every column and the lookup below would raise a bare
    # KeyError without context.
    if target not in test_df.columns:
        raise KeyError(f"target column '{target}' not found in {csv_path}")

    feature_cols = [col for col in test_df.columns if col != target]

    return data_preprocessing.DataSet(
        X_test=test_df[feature_cols],
        X_train=pd.DataFrame(),
        y_test=test_df[target],
        y_train=pd.Series(dtype=float),
    )

In [6]:
# Maps the short model codes used in `model_selection` to the keys of `estimators`.
full_name = {'XGB': 'XgboostRegressor', 'RF': 'RandomForestRegressor'}

for target in targets:
    print(target)

    # These values depend only on the target, so they are resolved once per
    # target instead of once per fold.
    model_code = model_selection[target][0]
    run_name = model_selection[target][1]
    inpath = Path().resolve().parent / 'results' / f'target_{target}' / run_name
    model_dir = inpath / 'kfoldcv' / model_code
    estimator_key = full_name[model_code]

    for i in range(k):
        print('fold', i)

        # load data and process
        df_i = get_dataset(target, inpath, i)
        # NOTE(review): the scaler only receives this fold's test features here;
        # confirm this matches how the data was scaled when the model was
        # trained in target_prediction.ipynb.
        df_i.X_test = data_preprocessing.scaler([df_i.X_test])[0]

        # load the model saved for this fold (the registry lookup is kept inside
        # the loop so every fold gets its own lookup result, as before)
        estimator_i = estimators[estimator_key].load(model_dir, i)

        # calculate shap values and save in folder
        # NOTE(review): the recorded output shows sklearn's "X does not have valid
        # feature names" warning during this step; presumably the column names
        # are lost before predict is called (in the scaler or inside
        # get_feature_importance) - TODO confirm.
        _ = estimator_i.get_feature_importance(
            dataset_enc=df_i,
            it=i,
            plot_=False,
            shap_=True,
            per_subset=True,
            output_path=model_dir,
        )

DTIC_mass2
fold 0


X does not have valid feature names, but RandomForestRegressor was fitted with feature names


  0%|          | 0/73 [00:00<?, ?it/s]

X does not have valid feature names, but RandomForestRegressor was fitted with feature names


  0%|          | 0/16 [00:00<?, ?it/s]

X does not have valid feature names, but RandomForestRegressor was fitted with feature names


  0%|          | 0/57 [00:00<?, ?it/s]

fold 1


X does not have valid feature names, but RandomForestRegressor was fitted with feature names


  0%|          | 0/68 [00:00<?, ?it/s]

X does not have valid feature names, but RandomForestRegressor was fitted with feature names


  0%|          | 0/12 [00:00<?, ?it/s]

X does not have valid feature names, but RandomForestRegressor was fitted with feature names


  0%|          | 0/56 [00:00<?, ?it/s]

fold 2


X does not have valid feature names, but RandomForestRegressor was fitted with feature names


  0%|          | 0/71 [00:00<?, ?it/s]

KeyboardInterrupt: 