In [1]:
import warnings
warnings.filterwarnings('ignore')

## Regression-Continuous


In [2]:
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Load the full regression data set from the project's data directory
# (path is relative to the notebook's working directory).
df_full = pd.read_csv(filepath_or_buffer='../data/regression_lufu_2021-08-03.csv')

In [4]:
# Columns of the raw table that are used by the regression experiments.
selected = [
    'id', 'gender', 'apgar_5min',
    'respiration_inv_days',
    'respiration_non_inv_days',
    'early_onset_infection', 'steroids', 'bpd_severity',
    'gest_age', 'birth_weight_g', 'body_size_cm',
    'volume', 'volume_ratio', 'oxygen_demand_days',
]

# Numeric encoding applied to the 'gender' column.
gender_dict = {'f': 0, 'm': 1}

# Build the modelling frame in one non-mutating chain. `assign` and
# `replace` each return a new DataFrame, so `df_full` is never touched and
# no column is written onto a frame derived from another one (the pattern
# that triggers pandas' SettingWithCopyWarning).
df_regression = (
    df_full[selected]
    .assign(
        # Target: sum of the invasive and non-invasive respiration day counts.
        resp_support_days=lambda frame: (
            frame['respiration_inv_days'] + frame['respiration_non_inv_days']
        )
    )
    .replace({"gender": gender_dict})
)

In [5]:
# Volume normalised by birth weight. `birth_weight_g` is presumably in
# grams (per its name), so the factor 1000 expresses the ratio per kilogram.
df_regression['volume_per_kg'] = (
    df_regression['volume'].mul(1000).div(df_regression['birth_weight_g'])
)

In [6]:
def nested_cross_validation(X, y, depth_array, random_states, n_splits_outer=5, n_splits_inner=3):
    """Run repeated nested cross-validation of a random-forest regressor.

    For every seed in ``random_states`` the data are split into
    ``n_splits_outer`` outer folds. On each outer training split a grid
    search over ``max_depth`` (values from ``depth_array``) is run with
    ``n_splits_inner`` inner folds; a forest with the selected depth is then
    fitted on the outer training split and scored on the outer test split.

    Parameters
    ----------
    X : pandas.DataFrame
        Explanatory variables, one row per sample.
    y : pandas.Series
        Target values, aligned row-by-row with ``X``.
    depth_array : list of int
        Candidate values for the forest's ``max_depth``.
    random_states : sequence of int
        One seed per repetition; used for the fold shuffling and the forests.
    n_splits_outer : int, default 5
        Number of outer (evaluation) folds.
    n_splits_inner : int, default 3
        Number of inner (hyper-parameter selection) folds.

    Returns
    -------
    pandas.DataFrame
        One row per (repetition, outer fold) with the columns
        ``repetition_n``, ``outer_fold``, ``mse``, ``mae``, ``model_coefs``
        (the fitted forest's ``feature_importances_``), ``best_max_depth``
        (depth chosen by the inner search) and ``grid_best_score``
        (``best_score_`` of the inner search).
    """
    # The search space is the same for every repetition, so build it once.
    p_grid = {"max_depth": depth_array}

    repetition_n = []
    outer_fold = []
    mse = []
    mae = []
    model_weights = []
    grid_best_ps = []
    grid_best_scores = []

    for j, random_state_i in enumerate(random_states):
        model_grid = RandomForestRegressor(random_state=random_state_i)

        # OUTER FOLD
        # NOTE(review): StratifiedKFold stratifies on `y` as if it were a
        # class label; for a regression target KFold is the usual choice.
        # Kept as-is so previously reported numbers stay reproducible.
        outer_cv = StratifiedKFold(
            n_splits=n_splits_outer, shuffle=True, random_state=random_state_i
        )

        for i, (train, test) in enumerate(outer_cv.split(X, y)):
            # split() yields *positional* indices, so select with .iloc.
            # Label-based .loc only gave the same rows when the index was a
            # default 0..n-1 RangeIndex.
            x_train, y_train = X.iloc[train], y.iloc[train]
            x_test, y_test = X.iloc[test], y.iloc[test]

            # Inner Fold => Find the best parameter
            inner_cv = StratifiedKFold(
                n_splits=n_splits_inner, shuffle=True, random_state=random_state_i
            )
            grid = GridSearchCV(estimator=model_grid, param_grid=p_grid, cv=inner_cv)
            grid.fit(x_train, y_train)
            best_depth = grid.best_estimator_.max_depth

            # Use the best parameter of the inner fold on the test
            model_rf = RandomForestRegressor(
                random_state=random_state_i,
                max_depth=best_depth,
            ).fit(x_train, y_train)

            predictions = model_rf.predict(x_test)

            repetition_n.append(j)
            outer_fold.append(i)
            mse.append(metrics.mean_squared_error(y_test, predictions))
            mae.append(metrics.mean_absolute_error(y_test, predictions))
            model_weights.append(model_rf.feature_importances_)
            grid_best_ps.append(best_depth)
            grid_best_scores.append(grid.best_score_)

    df_results = pd.DataFrame.from_dict({
        'repetition_n': repetition_n,
        'outer_fold': outer_fold,
        'mse': mse,
        'mae': mae,
        'model_coefs': model_weights,
        # Inner-search diagnostics: previously collected but discarded.
        'best_max_depth': grid_best_ps,
        'grid_best_score': grid_best_scores,
    })

    return df_results

# Nested Cross Validation for RF Regression

In [7]:
# One seed per repetition of the nested cross-validation.
random_array = [
    1, 50, 234, 3, 5,
    789, 45, 658, 9, 429,
]
# Candidate max_depth values for the forest: 3 through 10 inclusive.
tree_depth = list(range(3, 11))

## Only Volume

In [8]:
# Feature set: the two volume-derived columns only.
explanatory = ['volume_per_kg', 'volume_ratio']

X = df_regression[explanatory]
y = df_regression['resp_support_days']

df_all_vars_poisson = nested_cross_validation(
    X,
    y,
    depth_array=tree_depth,
    random_states=random_array,
)

# Mean and standard deviation of the MAE over all outer folds and
# repetitions, one value per line.
print(
    df_all_vars_poisson['mae'].mean(),
    df_all_vars_poisson['mae'].std(),
    sep='\n',
)

14.152260808934988
2.126290784233166


## Volume + Patient Variables

In [9]:
# Feature set: patient descriptors plus the two volume-derived columns.
explanatory = [
    'gender',
    'gest_age',
    'birth_weight_g',
    'body_size_cm',
    'volume_per_kg',
    'volume_ratio',
]

X = df_regression[explanatory]
y = df_regression['resp_support_days']

df_all_vars_poisson = nested_cross_validation(
    X,
    y,
    depth_array=tree_depth,
    random_states=random_array,
)

# Mean and standard deviation of the MAE over all outer folds and
# repetitions, one value per line.
print(
    df_all_vars_poisson['mae'].mean(),
    df_all_vars_poisson['mae'].std(),
    sep='\n',
)

10.878333738742024
1.6100444930769828


## All Variables

In [10]:
# Feature set: every explanatory column used in this notebook.
explanatory = [
    'gender',
    'apgar_5min',
    'early_onset_infection',
    'steroids',
    'gest_age',
    'birth_weight_g',
    'body_size_cm',
    'volume_per_kg',
    'volume_ratio',
]

X = df_regression[explanatory]
y = df_regression['resp_support_days']

df_all_vars_poisson = nested_cross_validation(
    X,
    y,
    depth_array=tree_depth,
    random_states=random_array,
)

# Mean and standard deviation of the MAE over all outer folds and
# repetitions, one value per line.
print(
    df_all_vars_poisson['mae'].mean(),
    df_all_vars_poisson['mae'].std(),
    sep='\n',
)

10.78533353234225
1.7574624256318352
