In [3]:
import numpy as np
import pandas as pd
import string

from sklearn.model_selection import train_test_split as sklearn_train_test_split
from sklearn.datasets import make_classification as sklearn_make_classification
from sklearn.model_selection import GridSearchCV as sklearn_GridSearchCV
from sklearn.linear_model import LogisticRegression as sklearn_Logistic
from sklearn.metrics import balanced_accuracy_score

import dask.array as da
import dask.dataframe as ddf
from dask_ml.model_selection import train_test_split as dask_train_test_split
from dask_ml.datasets import make_classification as dask_make_classification
from dask_ml.model_selection import GridSearchCV as dask_GridSearchCV
from dask_ml.linear_model import LogisticRegression as dask_Logistic

In [2]:
%%time

_rows = 2_000_000
_columns = 10
COLUMNS = list(string.ascii_lowercase[:_columns])

sk_np_X = np.random.randint(0,10,(_rows,_columns))
sk_np_y = np.random.randint(0,2,(_rows,))

sk_df_X = pd.DataFrame(data=sk_np_X, columns=COLUMNS)
sk_df_y = pd.DataFrame(data=sk_np_y, columns=['Y'])

da_da_X = da.random.randint(0,10,(_rows,_columns)).rechunk((_rows//10, _columns))
da_da_y = da.random.randint(0,2,(_rows,)).rechunk((_rows//10,))

da_df_X = ddf.from_array(da_da_X, columns=COLUMNS, chunksize=(_rows//10,))
da_df_y = ddf.from_array(da_da_y, columns=['Y'], chunksize=(_rows//10,))

CPU times: user 475 ms, sys: 35.4 ms, total: 511 ms
Wall time: 549 ms


In [3]:
# BEARS LOOK AT NP ARRAYS ######################################################

In [4]:
%%time
sk_np_X1, sk_np_X_test, sk_np_y1, sk_np_y_test = sklearn_train_test_split(sk_np_X, sk_np_y, test_size=0.2)
sk_np_X_train, sk_np_X_val, sk_np_y_train, sk_np_y_val = sklearn_train_test_split(sk_np_X1, sk_np_y1, test_size=0.25)

CPU times: user 527 ms, sys: 99.4 ms, total: 626 ms
Wall time: 627 ms


In [5]:
%%time
sk_logistic_np = sklearn_Logistic(max_iter=10_000, tol=1e-6)

CPU times: user 24 µs, sys: 5 µs, total: 29 µs
Wall time: 34.6 µs


In [6]:
# VERIFY DTYPES
sk_np_X_train

array([[0, 0, 1, ..., 5, 1, 1],
       [6, 0, 1, ..., 4, 3, 0],
       [6, 2, 5, ..., 0, 5, 9],
       ...,
       [0, 3, 6, ..., 2, 1, 7],
       [7, 5, 3, ..., 5, 0, 0],
       [7, 4, 1, ..., 7, 7, 2]])

In [7]:
sk_np_y_train

array([1, 1, 1, ..., 0, 0, 0])

In [8]:
%%time
sk_logistic_np.fit(sk_np_X_train, sk_np_y_train)

CPU times: user 7.59 s, sys: 5.91 s, total: 13.5 s
Wall time: 3.99 s


In [9]:
# END BEARS LOOK AT NP ARRAYS ######################################################

In [10]:
### BEAR TRIES TO SPEED UP DASK ARRAYS ###############################################################################

In [11]:
%%time
da_da_X1, da_da_X_test, da_da_y1, da_y_test = dask_train_test_split(da_da_X, da_da_y, test_size=0.2)
da_da_X_train, da_da_X_val, da_da_y_train, da_da_y_val = dask_train_test_split(da_da_X1, da_da_y1, test_size=0.25)

CPU times: user 128 ms, sys: 4.16 ms, total: 132 ms
Wall time: 131 ms


In [12]:
%%time
da_da_X_train = da_da_X_train.rechunk(da_da_X_train.shape)
da_da_y_train = da_da_y_train.rechunk(da_da_y_train.shape)

CPU times: user 1.58 ms, sys: 239 µs, total: 1.82 ms
Wall time: 1.77 ms


In [13]:
%%time
da_logistic_da = dask_Logistic(max_iter=10_000, tol=1e-6)

CPU times: user 23 µs, sys: 11 µs, total: 34 µs
Wall time: 42 µs


In [14]:
# VERIFY DTYPES
da_da_X_train

Unnamed: 0,Array,Chunk
Bytes,91.55 MiB,91.55 MiB
Shape,"(1200000, 10)","(1200000, 10)"
Dask graph,1 chunks in 34 graph layers,1 chunks in 34 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 91.55 MiB 91.55 MiB Shape (1200000, 10) (1200000, 10) Dask graph 1 chunks in 34 graph layers Data type int64 numpy.ndarray",10  1200000,

Unnamed: 0,Array,Chunk
Bytes,91.55 MiB,91.55 MiB
Shape,"(1200000, 10)","(1200000, 10)"
Dask graph,1 chunks in 34 graph layers,1 chunks in 34 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray


In [15]:
da_da_y_train

Unnamed: 0,Array,Chunk
Bytes,9.16 MiB,9.16 MiB
Shape,"(1200000,)","(1200000,)"
Dask graph,1 chunks in 34 graph layers,1 chunks in 34 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 9.16 MiB 9.16 MiB Shape (1200000,) (1200000,) Dask graph 1 chunks in 34 graph layers Data type int64 numpy.ndarray",1200000  1,

Unnamed: 0,Array,Chunk
Bytes,9.16 MiB,9.16 MiB
Shape,"(1200000,)","(1200000,)"
Dask graph,1 chunks in 34 graph layers,1 chunks in 34 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray


In [16]:
%%time
da_logistic_da.fit(da_da_X_train, da_da_y_train)

CPU times: user 26.4 s, sys: 14.5 s, total: 40.9 s
Wall time: 14.1 s


In [17]:
### END BEAR TRIES TO SPEED UP DASK ARRAYS ###########################################################################

In [18]:
# BEARS LOOK AT SK DATAFRAMES ######################################################################################

In [19]:
sk_df_X = pd.DataFrame(data=np.random.randint(0,10,(_rows,_columns)), columns=COLUMNS)
sk_df_y = pd.DataFrame(data=np.random.randint(0,2,(_rows,)), columns=['Y'])

In [20]:
%%time
sk_df_X1, sk_df_X_test, sk_df_y1, sk_df_y_test = sklearn_train_test_split(sk_df_X, sk_df_y, test_size=0.2)
sk_df_X_train, sk_df_X_val, sk_df_y_train, sk_df_y_val = sklearn_train_test_split(sk_df_X1, sk_df_y1, test_size=0.25)

CPU times: user 503 ms, sys: 591 ms, total: 1.09 s
Wall time: 1.68 s


In [21]:
%%time
sk_logistic_df = sklearn_Logistic(max_iter=10_000, tol=1e-6)

CPU times: user 16 µs, sys: 9 µs, total: 25 µs
Wall time: 29.8 µs


In [22]:
# VERIFY DTYPES
sk_df_X_train

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
1008276,6,0,9,7,0,7,4,7,1,1
871586,1,7,7,5,0,6,2,4,9,5
114886,7,4,8,5,3,3,3,0,1,8
1173749,5,8,6,8,7,3,4,6,3,9
1819543,4,6,1,1,0,2,1,5,6,5
...,...,...,...,...,...,...,...,...,...,...
795888,4,8,5,9,7,4,4,8,4,0
1546960,5,8,2,6,4,4,7,9,0,7
443964,8,6,2,3,1,2,7,3,4,8
868626,8,7,3,0,8,0,2,8,1,7


In [23]:
sk_df_y_train

Unnamed: 0,Y
1008276,1
871586,1
114886,0
1173749,1
1819543,0
...,...
795888,1
1546960,1
443964,0
868626,0


In [24]:
%%time
sk_logistic_df.fit(sk_df_X_train, sk_df_y_train)

  y = column_or_1d(y, warn=True)


CPU times: user 5.17 s, sys: 3.95 s, total: 9.12 s
Wall time: 2.7 s


In [25]:
# BEARS LOOK AT SK DATAFRAMES ######################################################################################

In [26]:
### BEAR TRIES TO SPEED UP DASK DATAFRAMES ###############################################################################

In [None]:
# dask_Logistic CANT TAKE DDFs

In [33]:
### END BEAR TRIES TO SPEED UP DASK DATAFRAMES ###############################################################################

In [177]:
X = np.random.randint(0,10,(500,5))
y = np.random.randint(0,2,(500,))

In [178]:
tater = sklearn_GridSearchCV(
                                estimator=sklearn_Logistic(),
                                param_grid={'C':[100]},
                                scoring=['balanced_accuracy','accuracy'],
                                refit='balanced_accuracy',
                                return_train_score=True
)

In [179]:
tater.fit(X,y)

In [180]:
tater.predict_proba(X)

array([[0.6630659 , 0.3369341 ],
       [0.54025772, 0.45974228],
       [0.59861686, 0.40138314],
       [0.61227439, 0.38772561],
       [0.50342398, 0.49657602],
       [0.48583341, 0.51416659],
       [0.57533049, 0.42466951],
       [0.54567711, 0.45432289],
       [0.4888645 , 0.5111355 ],
       [0.57215832, 0.42784168],
       [0.58451924, 0.41548076],
       [0.60915883, 0.39084117],
       [0.45054874, 0.54945126],
       [0.65281796, 0.34718204],
       [0.62294853, 0.37705147],
       [0.60348188, 0.39651812],
       [0.59424478, 0.40575522],
       [0.4895843 , 0.5104157 ],
       [0.64756021, 0.35243979],
       [0.57774264, 0.42225736],
       [0.46901238, 0.53098762],
       [0.57575246, 0.42424754],
       [0.49587469, 0.50412531],
       [0.55095078, 0.44904922],
       [0.42689819, 0.57310181],
       [0.53081173, 0.46918827],
       [0.58377094, 0.41622906],
       [0.51800603, 0.48199397],
       [0.53825363, 0.46174637],
       [0.59097856, 0.40902144],
       [0.

In [181]:
tater.score(X, y)

0.5548095168588527

In [182]:
# SCORE BY balanced_accuracy_score
balanced_accuracy_score(y, tater.predict(X))

0.5548095168588527

In [183]:
DUM = pd.DataFrame(tater.cv_results_)
for _ in DUM:
    print(f"{_}".ljust(30) + f"{DUM[_].to_frame().to_numpy()[0][0]}")

mean_fit_time                 0.011661815643310546
std_fit_time                  0.002771497366633309
mean_score_time               0.004921436309814453
std_score_time                0.0010317684265991757
param_C                       100
params                        {'C': 100}
split0_test_balanced_accuracy 0.5202922077922079
split1_test_balanced_accuracy 0.5560064935064934
split2_test_balanced_accuracy 0.5048701298701299
split3_test_balanced_accuracy 0.46185064935064934
split4_test_balanced_accuracy 0.5515151515151515
mean_test_balanced_accuracy   0.5189069264069264
std_test_balanced_accuracy    0.03433378405226676
rank_test_balanced_accuracy   1
split0_train_balanced_accuracy0.5612855007473841
split1_train_balanced_accuracy0.5572065567125232
split2_train_balanced_accuracy0.5200400293886651
split3_train_balanced_accuracy0.5754984672291049
split4_train_balanced_accuracy0.5428165584415584
mean_train_balanced_accuracy  0.5513694225038471
std_train_balanced_accuracy   0.01881113446599720

In [184]:
  data=[[0.5 , 0.5 ],
        [0.25, 0.25],
        [0.5 , 0.5 ],
        [0.25, 0.25],
        [0.25, 0.25]],

In [185]:
new_tater = tater.set_params(estimator__C=10)

In [186]:
new_tater.predict_proba(X)

array([[0.6630659 , 0.3369341 ],
       [0.54025772, 0.45974228],
       [0.59861686, 0.40138314],
       [0.61227439, 0.38772561],
       [0.50342398, 0.49657602],
       [0.48583341, 0.51416659],
       [0.57533049, 0.42466951],
       [0.54567711, 0.45432289],
       [0.4888645 , 0.5111355 ],
       [0.57215832, 0.42784168],
       [0.58451924, 0.41548076],
       [0.60915883, 0.39084117],
       [0.45054874, 0.54945126],
       [0.65281796, 0.34718204],
       [0.62294853, 0.37705147],
       [0.60348188, 0.39651812],
       [0.59424478, 0.40575522],
       [0.4895843 , 0.5104157 ],
       [0.64756021, 0.35243979],
       [0.57774264, 0.42225736],
       [0.46901238, 0.53098762],
       [0.57575246, 0.42424754],
       [0.49587469, 0.50412531],
       [0.55095078, 0.44904922],
       [0.42689819, 0.57310181],
       [0.53081173, 0.46918827],
       [0.58377094, 0.41622906],
       [0.51800603, 0.48199397],
       [0.53825363, 0.46174637],
       [0.59097856, 0.40902144],
       [0.

In [187]:
new_tater.score(X, y)

0.5548095168588527

In [188]:
from GridSearchThresholdCV import GridSearchThresholdCV

In [189]:
test_gstcv = GridSearchThresholdCV(
                                    estimator=sklearn_Logistic(),
                                    param_grid={'C':[100]},
                                    scoring=['balanced_accuracy','accuracy'],
                                    refit='balanced_accuracy',
                                    thresholds=np.linspace(0,1,21),
                                    return_train_score=True
)

In [190]:
test_gstcv.fit(X,y)

In [191]:
test_gstcv.predict_proba(X)

array([[0.6630659 , 0.3369341 ],
       [0.54025772, 0.45974228],
       [0.59861686, 0.40138314],
       [0.61227439, 0.38772561],
       [0.50342398, 0.49657602],
       [0.48583341, 0.51416659],
       [0.57533049, 0.42466951],
       [0.54567711, 0.45432289],
       [0.4888645 , 0.5111355 ],
       [0.57215832, 0.42784168],
       [0.58451924, 0.41548076],
       [0.60915883, 0.39084117],
       [0.45054874, 0.54945126],
       [0.65281796, 0.34718204],
       [0.62294853, 0.37705147],
       [0.60348188, 0.39651812],
       [0.59424478, 0.40575522],
       [0.4895843 , 0.5104157 ],
       [0.64756021, 0.35243979],
       [0.57774264, 0.42225736],
       [0.46901238, 0.53098762],
       [0.57575246, 0.42424754],
       [0.49587469, 0.50412531],
       [0.55095078, 0.44904922],
       [0.42689819, 0.57310181],
       [0.53081173, 0.46918827],
       [0.58377094, 0.41622906],
       [0.51800603, 0.48199397],
       [0.53825363, 0.46174637],
       [0.59097856, 0.40902144],
       [0.

In [192]:
test_gstcv.score(X,y)

0.5589938208534034

In [193]:
test_gstcv.best_index_

0

In [194]:
test_gstcv.best_threshold_

0.45

In [195]:
# DUMP_DF = pd.DataFrame(test_gstcv.cv_results_)
# DUMP_DF.to_csv(r'/home/bear/Desktop/GSTCV_TEST_CV_RESULTS.ods')

In [196]:
tftssm = test_gstcv._TEST_FOLD_x_THRESHOLD_x_SCORER__SCORE_MATRIX
tftssm[:, 7, :]

masked_array(
  data=[[0.45      , 0.50162338],
        [0.47      , 0.51948052],
        [0.44      , 0.5       ],
        [0.44      , 0.47077922],
        [0.45      , 0.48787879]],
  mask=False,
  fill_value=1e+20)

In [197]:
tftssm.mean(axis=0)

masked_array(
  data=[[0.442     , 0.5       ],
        [0.442     , 0.5       ],
        [0.442     , 0.5       ],
        [0.442     , 0.5       ],
        [0.442     , 0.5       ],
        [0.442     , 0.49951299],
        [0.442     , 0.49716089],
        [0.45      , 0.49595238],
        [0.502     , 0.52735931],
        [0.558     , 0.55218254],
        [0.554     , 0.51890693],
        [0.546     , 0.49632756],
        [0.556     , 0.49915584],
        [0.56      , 0.50227273],
        [0.558     , 0.5       ],
        [0.558     , 0.5       ],
        [0.558     , 0.5       ],
        [0.558     , 0.5       ],
        [0.558     , 0.5       ],
        [0.558     , 0.5       ],
        [0.558     , 0.5       ]],
  mask=False,
  fill_value=1e+20)

In [198]:
new_test_gstcv = test_gstcv.set_params(estimator__C=10)

In [199]:
# for _thresh in np.linspace(0,1,21):
#     new_test_gstcv.best_threshold_ = _thresh
#     print(f'{_thresh}: {new_test_gstcv.score(X,y)}')

In [200]:
new_test_gstcv.best_threshold_

0.45