In [1]:
import numpy as np
import pandas as pd
import sys, os, base64, io, time
import scipy.sparse as ss
import functools
import string

from sklearn.model_selection import train_test_split as sklearn_train_test_split
from sklearn.datasets import make_classification as sklearn_make_classification
from sklearn.model_selection import GridSearchCV as sklearn_GridSearchCV
from sklearn.linear_model import LogisticRegression as sklearn_Logistic

import dask.array as da
import dask.dataframe as ddf
from dask_ml.model_selection import train_test_split as dask_train_test_split
from dask_ml.datasets import make_classification as dask_make_classification
from dask_ml.model_selection import GridSearchCV as dask_GridSearchCV
from dask_ml.linear_model import LogisticRegression as dask_Logistic

from GridSearchThresholdCV import GridSearchThresholdCV as GSTCV

In a future release, Dask DataFrame will use new implementation that
contains several improvements including a logical query planning.
The user-facing DataFrame API will remain unchanged.

The new implementation is already available and can be enabled by
installing the dask-expr library:

    $ pip install dask-expr

and turning the query planning option on:

    >>> import dask
    >>> dask.config.set({'dataframe.query-planning': True})
    >>> import dask.dataframe as dd

API documentation for the new implementation is available at
https://docs.dask.org/en/stable/dask-expr-api.html

Any feedback can be reported on the Dask issue tracker
https://github.com/dask/dask/issues 

  import dask.dataframe as ddf


In [2]:
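# Methods under test, grouped by the arguments they take.
#
# Methods that take both X and y:
# score(X, y)


# Methods that take X only:
# decision_function(X)
# inverse_transform(Xt)
# predict(X)
# predict_log_proba(X)
# predict_proba(X)
# score_samples(X)
# transform(X)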

In [3]:
# Synthetic fixtures shared by the experiment cells below.
#   good_*          -> the data the search estimators are fitted on
#   bad_*_X         -> X with twice as many columns as the good X
#   bad_features_*  -> y with two columns instead of one
#   bad_classes_*   -> y drawn from {0, 1, 2} instead of a binary target
# Every fixture exists as numpy array, dask array, pandas DataFrame and dask DataFrame.

_rows, _cols = 1000, 10
_bad_cols = 2*_cols
_chunk = _rows//10      # rows per dask partition
_seed = 0               # fixed so that Restart & Run All rebuilds identical fixtures
GOOD_COLUMNS = list(string.ascii_lowercase[:_cols])
BAD_COLUMNS = list(string.ascii_lowercase[:_bad_cols])


good_np_X, good_np_y = sklearn_make_classification(
    n_samples=_rows, n_features=_cols, n_informative=_cols, n_redundant=0, random_state=_seed
)
good_da_X, good_da_y = da.array(good_np_X), da.array(good_np_y)

good_pd_X, good_pd_y = pd.DataFrame(good_np_X, columns=GOOD_COLUMNS), pd.DataFrame(good_np_y, columns=['y'])
good_ddf_X, good_ddf_y = ddf.from_pandas(good_pd_X, chunksize=_chunk), ddf.from_pandas(good_pd_y, chunksize=_chunk)


bad_np_X, bad_features_np_y = sklearn_make_classification(
    n_samples=_rows, n_features=_bad_cols, n_informative=_bad_cols, n_redundant=0, random_state=_seed
)
# Duplicate y side by side as two columns. The previous vstack(...).reshape((-1, 2))
# paired consecutive elements of the flattened (2, n) array instead, so neither
# column was actually a copy of y.
bad_features_np_y = np.column_stack((bad_features_np_y, bad_features_np_y))
bad_da_X, bad_features_da_y = da.array(bad_np_X), da.array(bad_features_np_y)

bad_pd_X, bad_features_pd_y = pd.DataFrame(bad_np_X, columns=BAD_COLUMNS), pd.DataFrame(bad_features_np_y, columns=['y1','y2'])
bad_ddf_X, bad_features_ddf_y = ddf.from_pandas(bad_pd_X, chunksize=_chunk), ddf.from_pandas(bad_features_pd_y, chunksize=_chunk)


bad_classes_np_y = np.random.RandomState(_seed).randint(0, 3, _rows)
bad_classes_da_y = da.array(bad_classes_np_y)
bad_classes_pd_y = pd.DataFrame(bad_classes_np_y, columns=['y'])
bad_classes_ddf_y = ddf.from_pandas(bad_classes_pd_y, chunksize=_chunk)

In [4]:
# Labels that are joined with '_' to name each trial in the experiment cells.
GOOD_OR_BAD_X = [
    'good_X',           # X as used for fitting
    'bad_X',            # X with the wrong number of columns
]
GOOD_OR_BAD_y = [
    'good_y',           # y as used for fitting
    'bad_features_y',   # y with two columns
    'bad_classes_y',    # y with three possible class labels
]
DTYPES = [
    'array',
    'dataframe',
]
TYPES = [
    'sklearn',
    'dask',
    'gstcv_sklearn',
    'gstcv_dask',
]

In [5]:
# Row labels of every per-trial results table, in display order.
METHOD_NAMES = [
    # fitted attributes
    'n_features_in_',
    'feature_names_in_',
    'classes_',
    # methods called with X only
    'decision_function',
    'inverse_transform',
    'predict',
    'predict_log_proba',
    'predict_proba',
    'score_samples',
    'transform',
    # method called with X and y
    'score',
]

In [6]:
# Scorers handed to every search object; the best estimator is refit on __refit.
__refit = 'balanced_accuracy'
__scoring = [__refit, 'accuracy']


# Base estimator for the sklearn-backed searches.
# C is intentionally not set here: init_gscv sweeps it through its param_grid.
SklearnLogistic = sklearn_Logistic(
    penalty='l2',
    dual=False,
    tol=1e-6,
    fit_intercept=False,
    intercept_scaling=1,
    class_weight=None,
    random_state=None,
    solver='lbfgs',
    max_iter=10000,
    multi_class='auto',
    verbose=0,
    warm_start=False,
    n_jobs=-1,
    l1_ratio=None,
)


# Base estimator for the dask-backed searches; C is likewise left to the grid.
DaskLogistic = dask_Logistic(
    penalty='l2',
    dual=False,
    tol=1e-6,
    fit_intercept=False,
    intercept_scaling=1.0,
    class_weight=None,
    random_state=None,
    solver='lbfgs',
    max_iter=10000,
    multi_class='ovr',
    verbose=0,
    warm_start=False,
    n_jobs=-1,
    solver_kwargs=None,
)

def init_gscv(_sk_est, _da_est, _type):
    """Build an unfitted grid-search object of the requested flavour.

    Parameters
    ----------
    _sk_est : estimator used when `_type` is 'sklearn' or 'gstcv_sklearn'.
    _da_est : estimator used when `_type` is 'dask' or 'gstcv_dask'.
    _type : str
        One of 'sklearn', 'dask', 'gstcv_sklearn', 'gstcv_dask'.

    Returns
    -------
    A sklearn GridSearchCV, dask_ml GridSearchCV or GSTCV instance configured
    with the same C grid, scorers, refit metric and 5-fold CV.

    Raises
    ------
    ValueError
        If `_type` is not one of the four recognised labels. Previously this
        fell through every branch and failed with UnboundLocalError on return.
    """

    _valid_types = ('sklearn', 'dask', 'gstcv_sklearn', 'gstcv_dask')
    if _type not in _valid_types:
        raise ValueError(f"_type must be one of {_valid_types}, got {_type!r}")

    # Pick the search class and the estimator it wraps; everything else is shared.
    if _type == 'sklearn':
        _search_cls, _estimator = sklearn_GridSearchCV, _sk_est
    elif _type == 'dask':
        _search_cls, _estimator = dask_GridSearchCV, _da_est
    elif _type == 'gstcv_sklearn':
        _search_cls, _estimator = GSTCV, _sk_est
    else:   # 'gstcv_dask'
        _search_cls, _estimator = GSTCV, _da_est

    return _search_cls(
        estimator=_estimator,
        param_grid={'C': np.logspace(-3,3,7)},
        scoring=__scoring,
        refit=__refit,
        cv=5,
        error_score=np.nan,
        return_train_score=True,
        n_jobs=-1
    )


In [7]:
def key_handler(_trial, _METHOD_ARRAY_DICT):
    """Raise ValueError unless `_trial` is already a key of `_METHOD_ARRAY_DICT`."""
    if _trial in _METHOD_ARRAY_DICT:
        return
    raise ValueError(f"trying to modify key {_trial} in METHOD_ARRAY_DICT but key doesnt exist")


def method_output_try_handler(_trial, method_name, _method_output, _METHOD_ARRAY_DICT):
    """Store a successful result in the 'OUTPUT' cell of row `method_name`
    in the table for `_trial`, then return the (mutated) dictionary."""
    key_handler(_trial, _METHOD_ARRAY_DICT)
    _trial_table = _METHOD_ARRAY_DICT[_trial]
    _trial_table.loc[method_name, 'OUTPUT'] = _method_output
    return _METHOD_ARRAY_DICT


def method_output_except_handler(_trial, method_name, _exc_info, _METHOD_ARRAY_DICT):
    """Store the exception raised by a probe in the 'OUTPUT' cell of row
    `method_name` in the table for `_trial`, then return the (mutated) dictionary."""
    key_handler(_trial, _METHOD_ARRAY_DICT)
    _trial_table = _METHOD_ARRAY_DICT[_trial]
    _trial_table.loc[method_name, 'OUTPUT'] = _exc_info
    return _METHOD_ARRAY_DICT



# Experiment 1 -- fit every search flavour on the good data, then call each
# X-only method with either the good or the bad X and record what comes back
# (a result or the exception). Fitted attributes are recorded as well.

COMBINATIONS = [f'{c}_{b}_{a}' for c in GOOD_OR_BAD_X for b in DTYPES for a in TYPES]
METHOD_ARRAY_DICT = {k:pd.DataFrame(index=METHOD_NAMES, columns=['OUTPUT'], dtype=object) for k in COMBINATIONS}

# (container, backend) -> data used for that kind of trial.
_BAD_X_DATA_BY_KIND = {
    ('array', 'dask'):        {'fit_y': good_da_y, 'good_X': good_da_X,  'bad_X': bad_da_X},
    ('array', 'sklearn'):     {'fit_y': good_np_y, 'good_X': good_np_X,  'bad_X': bad_np_X},
    # y stays a numpy array on purpose: the original note reports unpredictable
    # behavior when y is passed as a dask DataFrame to the dask grid search.
    ('dataframe', 'dask'):    {'fit_y': good_np_y, 'good_X': good_ddf_X, 'bad_X': bad_ddf_X},
    ('dataframe', 'sklearn'): {'fit_y': good_pd_y, 'good_X': good_pd_X,  'bad_X': bad_pd_X},
}

# Probes, in the order they are executed.
_X_ONLY_METHODS = ['decision_function', 'inverse_transform', 'predict', 'predict_log_proba',
                   'predict_proba', 'score_samples', 'transform']
_FITTED_ATTRS = ['n_features_in_', 'feature_names_in_', 'classes_']

ctr = 0
for good_or_bad_x in GOOD_OR_BAD_X:
    for _dtype in DTYPES:
        for _gscv_type in TYPES:

            ctr += 1
            trial = f'{good_or_bad_x}_{_dtype}_{_gscv_type}'

            print(f'Running {ctr} of {len(COMBINATIONS)}... {trial}')

            test_cls = init_gscv(SklearnLogistic, DaskLogistic, _gscv_type)

            # 'gstcv_dask' / 'gstcv_sklearn' resolve to their underlying backend
            if 'dask' in _gscv_type: _backend = 'dask'
            elif 'sklearn' in _gscv_type: _backend = 'sklearn'
            else: raise Exception("_gscv_type logic is failing")

            if (_dtype, _backend) not in _BAD_X_DATA_BY_KIND:
                raise Exception("_dtype logic is failing")
            _data = _BAD_X_DATA_BY_KIND[(_dtype, _backend)]

            # always fit on the good data; only the probe X varies
            base_X, base_y = _data['good_X'], _data['fit_y']
            if 'good' in good_or_bad_x: _X = _data['good_X']
            elif 'bad' in good_or_bad_x: _X = _data['bad_X']
            else: raise Exception("good_or_bad_x logic is failing")

            print(f'trial = {trial}')

            try:
                test_cls.fit(base_X, base_y)
            except TypeError as e:
                # fit itself rejected the container: record that for every row and move on
                for _m in METHOD_NAMES:
                    METHOD_ARRAY_DICT[trial].loc[_m, 'OUTPUT'] = e
                continue
            except Exception:
                print("\033[91mExcepted for a reason other than dask.dataframe into dask logistic TypeError\033[0m")
                raise   # keep the original exception type and traceback

            # `except Exception` (not bare `except`) so Ctrl-C / SystemExit still
            # stop the run instead of being written into the results table.
            for _name in _X_ONLY_METHODS:
                try:
                    __ = getattr(test_cls, _name)(_X)
                    METHOD_ARRAY_DICT = method_output_try_handler(trial, _name, __, METHOD_ARRAY_DICT)
                except Exception as _exc:
                    METHOD_ARRAY_DICT = method_output_except_handler(trial, _name, _exc, METHOD_ARRAY_DICT)

            # REFERENCE ATTRS #################################################################################################
            for _name in _FITTED_ATTRS:
                try:
                    __ = getattr(test_cls, _name)
                    METHOD_ARRAY_DICT = method_output_try_handler(trial, _name, __, METHOD_ARRAY_DICT)
                except Exception as _exc:
                    METHOD_ARRAY_DICT = method_output_except_handler(trial, _name, _exc, METHOD_ARRAY_DICT)


# One column per trial, one row per probe. 'score' is never probed in this
# experiment, so its row is dropped.
SINGLE_DF = pd.DataFrame(
    {_key: DATA_DF['OUTPUT'] for _key, DATA_DF in METHOD_ARRAY_DICT.items()},
    index=METHOD_NAMES,
)
SINGLE_DF = SINGLE_DF.drop(['score'])

# Write next to the user's Desktop when it exists, otherwise into the working
# directory. The file is CSV text, so it is named .csv on every platform.
_dump_dir = os.path.join(os.path.expanduser('~'), 'Desktop')
if not os.path.isdir(_dump_dir):
    _dump_dir = os.getcwd()
method_path = os.path.join(_dump_dir, 'gscv_bad_X_comparison_dump__all_except_score.csv')

SINGLE_DF.to_csv(method_path, index=True)

Running 1 of 16... good_X_array_sklearn
trial = good_X_array_sklearn
Running 2 of 16... good_X_array_dask
trial = good_X_array_dask




Running 3 of 16... good_X_array_gstcv_sklearn
trial = good_X_array_gstcv_sklearn
Running 4 of 16... good_X_array_gstcv_dask
trial = good_X_array_gstcv_dask




Running 5 of 16... good_X_dataframe_sklearn
trial = good_X_dataframe_sklearn


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Running 6 of 16... good_X_dataframe_dask
trial = good_X_dataframe_dask




Running 7 of 16... good_X_dataframe_gstcv_sklearn
trial = good_X_dataframe_gstcv_sklearn
Running 8 of 16... good_X_dataframe_gstcv_dask
trial = good_X_dataframe_gstcv_dask




Running 9 of 16... bad_X_array_sklearn
trial = bad_X_array_sklearn
Running 10 of 16... bad_X_array_dask
trial = bad_X_array_dask




Running 11 of 16... bad_X_array_gstcv_sklearn
trial = bad_X_array_gstcv_sklearn
Running 12 of 16... bad_X_array_gstcv_dask
trial = bad_X_array_gstcv_dask




Running 13 of 16... bad_X_dataframe_sklearn
trial = bad_X_dataframe_sklearn


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Running 14 of 16... bad_X_dataframe_dask
trial = bad_X_dataframe_dask




Running 15 of 16... bad_X_dataframe_gstcv_sklearn
trial = bad_X_dataframe_gstcv_sklearn
Running 16 of 16... bad_X_dataframe_gstcv_dask
trial = bad_X_dataframe_gstcv_dask




In [8]:
def key_handler(_trial, _METHOD_ARRAY_DICT):
    """Guard used before every write: `_trial` must already be a key of
    `_METHOD_ARRAY_DICT`, otherwise ValueError is raised."""
    _known = _trial in _METHOD_ARRAY_DICT
    if not _known:
        raise ValueError(f"trying to modify key {_trial} in METHOD_ARRAY_DICT but key doesnt exist")


def method_output_try_handler(_trial, method_name, _method_output, _METHOD_ARRAY_DICT):
    """Write `_method_output` into the 'OUTPUT' column at row `method_name`
    of the table stored under `_trial`; return the same dictionary."""
    key_handler(_trial, _METHOD_ARRAY_DICT)
    _table = _METHOD_ARRAY_DICT[_trial]
    _table.loc[method_name, 'OUTPUT'] = _method_output
    return _METHOD_ARRAY_DICT


def method_output_except_handler(_trial, method_name, _exc_info, _METHOD_ARRAY_DICT):
    """Write the caught exception `_exc_info` into the 'OUTPUT' column at row
    `method_name` of the table stored under `_trial`; return the same dictionary."""
    key_handler(_trial, _METHOD_ARRAY_DICT)
    _table = _METHOD_ARRAY_DICT[_trial]
    _table.loc[method_name, 'OUTPUT'] = _exc_info
    return _METHOD_ARRAY_DICT


# Experiment 2 -- fit every search flavour on the good data, then call
# score(X, y) with every combination of good/bad X and good/bad y and record
# what comes back (a result or the exception). Fitted attributes are recorded too.

COMBINATIONS = [f'{d}_{c}_{b}_{a}' for d in GOOD_OR_BAD_X for c in GOOD_OR_BAD_y for b in DTYPES for a in TYPES]
METHOD_ARRAY_DICT = {k:pd.DataFrame(index=METHOD_NAMES, columns=['OUTPUT'], dtype=object) for k in COMBINATIONS}

# (container, backend) -> data used for that kind of trial.
_SCORE_DATA_BY_KIND = {
    ('array', 'dask'): {
        'fit_y': good_da_y, 'good_X': good_da_X, 'bad_X': bad_da_X,
        'good_y': good_da_y, 'bad_features_y': bad_features_da_y, 'bad_classes_y': bad_classes_da_y,
    },
    ('array', 'sklearn'): {
        'fit_y': good_np_y, 'good_X': good_np_X, 'bad_X': bad_np_X,
        'good_y': good_np_y, 'bad_features_y': bad_features_np_y, 'bad_classes_y': bad_classes_np_y,
    },
    # fit_y stays a numpy array on purpose: the original note reports unpredictable
    # behavior when y is passed as a dask DataFrame to dask Logistic/GridSearch.
    ('dataframe', 'dask'): {
        'fit_y': good_np_y, 'good_X': good_ddf_X, 'bad_X': bad_ddf_X,
        'good_y': good_ddf_y, 'bad_features_y': bad_features_ddf_y, 'bad_classes_y': bad_classes_ddf_y,
    },
    ('dataframe', 'sklearn'): {
        'fit_y': good_pd_y, 'good_X': good_pd_X, 'bad_X': bad_pd_X,
        'good_y': good_pd_y, 'bad_features_y': bad_features_pd_y, 'bad_classes_y': bad_classes_pd_y,
    },
}

_SCORE_FITTED_ATTRS = ['n_features_in_', 'feature_names_in_', 'classes_']

ctr = 0
for good_or_bad_x in GOOD_OR_BAD_X:
    for good_or_bad_y in GOOD_OR_BAD_y:
        for _dtype in DTYPES:
            for _gscv_type in TYPES:
                ctr += 1
                trial = f'{good_or_bad_x}_{good_or_bad_y}_{_dtype}_{_gscv_type}'

                print(f'Running {ctr} of {len(COMBINATIONS)}... {trial}')

                test_cls = init_gscv(SklearnLogistic, DaskLogistic, _gscv_type)

                # 'gstcv_dask' / 'gstcv_sklearn' resolve to their underlying backend
                if 'dask' in _gscv_type: _backend = 'dask'
                elif 'sklearn' in _gscv_type: _backend = 'sklearn'
                else: raise Exception("_gscv_type logic is failing")

                if (_dtype, _backend) not in _SCORE_DATA_BY_KIND:
                    raise Exception("_dtype logic is failing")
                _data = _SCORE_DATA_BY_KIND[(_dtype, _backend)]

                # always fit on the good data; only the score() inputs vary
                base_X, base_y = _data['good_X'], _data['fit_y']

                if 'good' in good_or_bad_y: _y = _data['good_y']
                elif 'bad_features' in good_or_bad_y: _y = _data['bad_features_y']
                elif 'bad_classes' in good_or_bad_y: _y = _data['bad_classes_y']
                else: raise Exception("good_or_bad_y logic is failing")

                if 'good' in good_or_bad_x: _X = _data['good_X']
                elif 'bad' in good_or_bad_x: _X = _data['bad_X']
                else: raise Exception("good_or_bad_x logic is failing")

                try:
                    test_cls.fit(base_X, base_y)
                except TypeError as e:
                    # fit itself rejected the container: record that for every row and move on
                    for _m in METHOD_NAMES:
                        METHOD_ARRAY_DICT[trial].loc[_m, 'OUTPUT'] = e
                    continue
                except Exception:
                    print("\033[91mExcepted for a reason other than dask.dataframe into dask logistic TypeError\033[0m")
                    raise   # keep the original exception type and traceback

                # `except Exception` (not bare `except`) so Ctrl-C / SystemExit still
                # stop the run instead of being written into the results table.
                try:
                    __ = test_cls.score(_X, _y)
                    METHOD_ARRAY_DICT = method_output_try_handler(trial, 'score', __, METHOD_ARRAY_DICT)
                except Exception as _exc:
                    METHOD_ARRAY_DICT = method_output_except_handler(trial, 'score', _exc, METHOD_ARRAY_DICT)

                # REFERENCE ATTRS #################################################################################################
                for _name in _SCORE_FITTED_ATTRS:
                    try:
                        __ = getattr(test_cls, _name)
                        METHOD_ARRAY_DICT = method_output_try_handler(trial, _name, __, METHOD_ARRAY_DICT)
                    except Exception as _exc:
                        METHOD_ARRAY_DICT = method_output_except_handler(trial, _name, _exc, METHOD_ARRAY_DICT)


# One column per trial, one row per probe; the X-only methods are not probed
# in this experiment, so their rows are dropped. Transposed so that each trial
# becomes a row of the dump.
SINGLE_DF = pd.DataFrame(
    {_key: DATA_DF['OUTPUT'] for _key, DATA_DF in METHOD_ARRAY_DICT.items()},
    index=METHOD_NAMES,
)
SINGLE_DF = SINGLE_DF.drop(['decision_function','inverse_transform','predict','predict_log_proba','predict_proba',
                            'score_samples','transform'])

SINGLE_DF = SINGLE_DF.T

# Write next to the user's Desktop when it exists, otherwise into the working
# directory. The file is CSV text, so it is named .csv on every platform.
_dump_dir = os.path.join(os.path.expanduser('~'), 'Desktop')
if not os.path.isdir(_dump_dir):
    _dump_dir = os.getcwd()
method_path = os.path.join(_dump_dir, 'gscv_bad_X_bad_y_comparison_dump__score.csv')

SINGLE_DF.to_csv(method_path, index=True)

Running 1 of 48... good_X_good_y_array_sklearn
Running 2 of 48... good_X_good_y_array_dask
Running 3 of 48... good_X_good_y_array_gstcv_sklearn
Running 4 of 48... good_X_good_y_array_gstcv_dask




Running 5 of 48... good_X_good_y_dataframe_sklearn


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Running 6 of 48... good_X_good_y_dataframe_dask
Running 7 of 48... good_X_good_y_dataframe_gstcv_sklearn
Running 8 of 48... good_X_good_y_dataframe_gstcv_dask




Running 9 of 48... good_X_bad_features_y_array_sklearn
Running 10 of 48... good_X_bad_features_y_array_dask
Running 11 of 48... good_X_bad_features_y_array_gstcv_sklearn
Running 12 of 48... good_X_bad_features_y_array_gstcv_dask




Running 13 of 48... good_X_bad_features_y_dataframe_sklearn


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Running 14 of 48... good_X_bad_features_y_dataframe_dask
Running 15 of 48... good_X_bad_features_y_dataframe_gstcv_sklearn
Running 16 of 48... good_X_bad_features_y_dataframe_gstcv_dask




Running 17 of 48... good_X_bad_classes_y_array_sklearn
Running 18 of 48... good_X_bad_classes_y_array_dask
Running 19 of 48... good_X_bad_classes_y_array_gstcv_sklearn
Running 20 of 48... good_X_bad_classes_y_array_gstcv_dask




Running 21 of 48... good_X_bad_classes_y_dataframe_sklearn


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Running 22 of 48... good_X_bad_classes_y_dataframe_dask
Running 23 of 48... good_X_bad_classes_y_dataframe_gstcv_sklearn
Running 24 of 48... good_X_bad_classes_y_dataframe_gstcv_dask




Running 25 of 48... bad_X_good_y_array_sklearn
Running 26 of 48... bad_X_good_y_array_dask
Running 27 of 48... bad_X_good_y_array_gstcv_sklearn
Running 28 of 48... bad_X_good_y_array_gstcv_dask




Running 29 of 48... bad_X_good_y_dataframe_sklearn


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Running 30 of 48... bad_X_good_y_dataframe_dask
Running 31 of 48... bad_X_good_y_dataframe_gstcv_sklearn
Running 32 of 48... bad_X_good_y_dataframe_gstcv_dask




Running 33 of 48... bad_X_bad_features_y_array_sklearn
Running 34 of 48... bad_X_bad_features_y_array_dask
Running 35 of 48... bad_X_bad_features_y_array_gstcv_sklearn
Running 36 of 48... bad_X_bad_features_y_array_gstcv_dask




Running 37 of 48... bad_X_bad_features_y_dataframe_sklearn


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Running 38 of 48... bad_X_bad_features_y_dataframe_dask
Running 39 of 48... bad_X_bad_features_y_dataframe_gstcv_sklearn
Running 40 of 48... bad_X_bad_features_y_dataframe_gstcv_dask




Running 41 of 48... bad_X_bad_classes_y_array_sklearn
Running 42 of 48... bad_X_bad_classes_y_array_dask
Running 43 of 48... bad_X_bad_classes_y_array_gstcv_sklearn
Running 44 of 48... bad_X_bad_classes_y_array_gstcv_dask




Running 45 of 48... bad_X_bad_classes_y_dataframe_sklearn


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Running 46 of 48... bad_X_bad_classes_y_dataframe_dask
Running 47 of 48... bad_X_bad_classes_y_dataframe_gstcv_sklearn
Running 48 of 48... bad_X_bad_classes_y_dataframe_gstcv_dask




In [9]:
# DONE

In [10]:
# Small toy binary-classification problem, built once and then wrapped in
# every container type under test (numpy, pandas, dask array, dask dataframe).
# NOTE: this rebinds the notebook-level _rows / _cols set in an earlier cell.
_rows, _cols = 20, 5

# Seeded generator so a fresh Restart & Run All reproduces the same data.
# Both upper bounds are exclusive, matching np.random.randint semantics.
_seed = 0
_rng = np.random.default_rng(_seed)
DATA = _rng.integers(0, 10, (_rows, _cols))
Y = _rng.integers(0, 2, _rows)

# AS ARRAY
sk_arr_X = DATA
# bad_sk_arr_X = np.random.randint(0,10,(_rows,2*_cols))
sk_arr_y = Y

# AS DF
# X keeps the default integer column labels; y is deliberately a one-column
# DataFrame (not a Series) so the estimators see a column-vector target.
sk_df_X = pd.DataFrame(data=DATA)#, columns=list(string.ascii_lowercase[:_cols]))
sk_df_y = pd.Series(Y).to_frame()


# AS ARRAY
da_arr_X = da.array(sk_arr_X)   # chunks=(_rows//2, _cols)
# bad_da_arr_X = da.array(bad_sk_arr_X)
da_arr_y = da.array(Y)

# AS DF
# chunksize=_rows on a _rows-long frame should give a single partition, same
# as the explicit npartitions=1 used for y -- TODO confirm.
da_df_X = ddf.from_pandas(sk_df_X, chunksize=_rows)
da_df_y = ddf.from_pandas(sk_df_y, npartitions=1)

In [11]:
# Baseline probe: a default scikit-learn LogisticRegression fit directly on
# the pandas containers (X as DataFrame, y as one-column DataFrame).
clf = sklearn_Logistic()

# The fitted estimator is the cell's displayed value.
clf.fit(X=sk_df_X, y=sk_df_y)


  y = column_or_1d(y, warn=True)


In [12]:
# clf.feature_names_in_

In [13]:
# Multi-metric scikit-learn grid search over C (seven log-spaced values from
# 1e-3 to 1e3), refit on balanced accuracy. error_score='raise' makes any
# failing fit surface immediately instead of being scored as NaN.
SklearnLogistic = sklearn_Logistic(fit_intercept=True)

TEST_NP_GSCV = sklearn_GridSearchCV(
    estimator=SklearnLogistic,
    param_grid={'C': np.logspace(-3, 3, 7)},
    scoring=['balanced_accuracy', 'accuracy'],
    refit='balanced_accuracy',
    error_score='raise',
)

# Fit on the pandas containers; the fitted search is the cell's displayed value.
TEST_NP_GSCV.fit(X=sk_df_X, y=sk_df_y)
# TEST_NP_GSCV.predict_proba(sk_df_X)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [14]:
# TEST_NP_GSCV.feature_names_in_

In [15]:
# Probe: dask_ml's LogisticRegression is handed dask DataFrames for X and y.
# It rejects them with a TypeError; that rejection is what this cell
# demonstrates. Catch and print it so Restart & Run All can continue to the
# cells below instead of stopping here.
clf = dask_Logistic(fit_intercept=False)

try:
    clf.fit(da_df_X, da_df_y)
except TypeError as exc:
    # Report the message without aborting the notebook run.
    print(f"{type(exc).__name__}: {exc}")

TypeError: This estimator does not support dask dataframes. This might be resolved with one of

    1. ddf.to_dask_array(lengths=True)
    2. ddf.to_dask_array()  # may cause other issues because of unknown chunk sizes

In [None]:
# clf.feature_names_in_

In [None]:
# dask_ml counterpart of the scikit-learn grid search: same C grid, same two
# metrics, refit on balanced accuracy, failures raised rather than scored.
DaskLogistic = dask_Logistic(fit_intercept=False)

TEST_DA_GSCV = dask_GridSearchCV(
    estimator=DaskLogistic,
    param_grid={'C': np.logspace(-3, 3, 7)},
    scoring=['balanced_accuracy', 'accuracy'],
    refit='balanced_accuracy',
    error_score='raise',
)

# Fit on the dask arrays, then show class probabilities for the training data.
# .compute() is called on the result, so it is presumably a lazy dask
# collection -- TODO confirm.
TEST_DA_GSCV.fit(da_arr_X, da_arr_y)
TEST_DA_GSCV.predict_proba(da_arr_X).compute()

In [None]:
# TEST_DA_GSCV.feature_names_in_

In [None]:

# Score the fitted dask grid search on the same dask arrays it was fit on.
TEST_DA_GSCV.score(X=da_arr_X, y=da_arr_y)