In [532]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [533]:
import sys
sys.path.insert(0, r"C:/projects/cobra")

In [534]:
import pandas as pd
import numpy as np
import random
from cobra.preprocessing import PreProcessor

# custom imports
from cobra.preprocessing import CategoricalDataProcessor
from cobra.preprocessing import KBinsDiscretizer
from cobra.preprocessing import TargetEncoder
import json


### 1. Generate data

In [535]:
size = 5000

# Create datetime column
dates = pd.date_range('2022-01-01', periods=size, freq='D')

# Create categorical variables
category_values = ['Category A', 'Category B', 'Category C']
cat_var1 = pd.Series(np.random.choice(category_values, size=size), dtype='category')
cat_var2 = pd.Series(np.random.choice(category_values, size=size), dtype='category')
cat_var3 = pd.Series(np.random.choice(category_values, size=size), dtype='category')

# Create continuous variables with different scales and distributions
cont_var1 = pd.Series(np.random.normal(loc=0, scale=1, size=size), name='cont_var1')
cont_var2 = pd.Series(np.random.uniform(low=0, high=10, size=size), name='cont_var2')
cont_var3 = pd.Series(np.random.exponential(scale=1, size=size), name='cont_var3')

# Create target variable
target = pd.Series(np.random.randint(2, size=size))

# Combine into a DataFrame
df = pd.DataFrame({'DateTime': dates, 'CategoryVar1': cat_var1,
                   'CategoryVar2': cat_var2, 'CategoryVar3': cat_var3,
                   'cont_var1': cont_var1, 'cont_var2': cont_var2, 'cont_var3': cont_var3,
                   'target': target})

In [536]:
df.dtypes

DateTime        datetime64[ns]
CategoryVar1          category
CategoryVar2          category
CategoryVar3          category
cont_var1              float64
cont_var2              float64
cont_var3              float64
target                   int32
dtype: object

In [537]:
df.head()

Unnamed: 0,DateTime,CategoryVar1,CategoryVar2,CategoryVar3,cont_var1,cont_var2,cont_var3,target
0,2022-01-01,Category B,Category B,Category C,-0.247175,8.258259,0.039901,1
1,2022-01-02,Category B,Category B,Category C,0.247006,1.234493,1.336691,1
2,2022-01-03,Category C,Category A,Category B,0.076415,5.059058,1.323273,1
3,2022-01-04,Category C,Category B,Category A,-0.306355,8.316857,0.077718,0
4,2022-01-05,Category C,Category C,Category B,-1.133514,8.773722,0.356009,1


In [538]:
df.columns

Index(['DateTime', 'CategoryVar1', 'CategoryVar2', 'CategoryVar3', 'cont_var1',
       'cont_var2', 'cont_var3', 'target'],
      dtype='object')

### 2. Fit preprocessor

In [539]:
continuous_vars = ['cont_var2', 'cont_var3', 'cont_var1']
discrete_vars= ['CategoryVar1', 'CategoryVar2', 'CategoryVar3'] #, 'DateTime'] [] 
target_col = "target"

In [540]:
model_type = "classification"

# using all Cobra's default parameters for preprocessing here
preprocessor = PreProcessor.from_params(
    model_type=model_type
)

random.seed(1212)
basetable = preprocessor.train_selection_validation_split(data=df,
                                                          train_prop=0.6,
                                                          selection_prop=0.25,
                                                          validation_prop=0.15)

The target encoder's additive smoothing weight is set to 0. This disables smoothing and may make the encoding prone to overfitting. Increase the weight if needed.


In [541]:
preprocessor.fit(basetable[basetable["split"]=="train"],
                 continuous_vars=continuous_vars,
                 discrete_vars = discrete_vars,
                 target_column_name=target_col)

Starting to fit pipeline
Computing discretization bins...: 100%|█████████████████████████████████████████████████| 3/3 [00:00<00:00, 251.21it/s]
Fitting KBinsDiscretizer took 0.012943267822265625 seconds
Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 193.52it/s]
Fitting category regrouping...: 100%|████████████████████████████████████████████████████| 3/3 [00:00<00:00, 26.85it/s]
Fitting categorical_data_processor class took 0.11171197891235352 seconds
Fitting target encoding...: 100%|███████████████████████████████████████████████████████| 6/6 [00:00<00:00, 564.66it/s]
Fitting TargetEncoder took 0.015709400177001953 seconds
Fitting pipeline took 0.1843581199645996 seconds


In [542]:
basetable_transformed_orig = preprocessor.transform(basetable,
                                   continuous_vars=continuous_vars,
                                   discrete_vars=discrete_vars)
basetable_transformed_orig.head()

Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 130.81it/s]
Applying target encoding...: 100%|██████████████████████████████████████████████████████| 6/6 [00:00<00:00, 517.58it/s]
Transforming data took 0.06473207473754883 seconds


Unnamed: 0,DateTime,CategoryVar1,CategoryVar2,CategoryVar3,cont_var1,cont_var2,cont_var3,target,split,cont_var2_bin,...,cont_var1_bin,CategoryVar1_processed,CategoryVar2_processed,CategoryVar3_processed,CategoryVar1_enc,CategoryVar2_enc,CategoryVar3_enc,cont_var2_enc,cont_var3_enc,cont_var1_enc
0,2022-01-01,Category B,Category B,Category C,-0.247175,8.258259,0.039901,1,selection,8.0 - 9.0,...,-0.3 - 0.0,Category B,Category B,Category C,0.505584,0.530256,0.51773,0.516447,0.514851,0.494083
1,2022-01-02,Category B,Category B,Category C,0.247006,1.234493,1.336691,1,train,1.0 - 2.0,...,0.0 - 0.3,Category B,Category B,Category C,0.505584,0.530256,0.51773,0.521311,0.517986,0.529086
2,2022-01-03,Category C,Category A,Category B,0.076415,5.059058,1.323273,1,train,5.0 - 6.0,...,0.0 - 0.3,Category C,Category A,Category B,0.494939,0.461386,0.487052,0.510903,0.517986,0.529086
3,2022-01-04,Category C,Category B,Category A,-0.306355,8.316857,0.077718,0,selection,8.0 - 9.0,...,-0.5 - -0.3,Category C,Category B,Category A,0.494939,0.530256,0.488603,0.516447,0.514851,0.534884
4,2022-01-05,Category C,Category C,Category B,-1.133514,8.773722,0.356009,1,train,8.0 - 9.0,...,-1.3 - -0.8,Category C,Category C,Category B,0.494939,0.502463,0.487052,0.516447,0.484634,0.461078


In [543]:
#preprocessor._discretizer #._bins_by_column
#preprocessor._target_encoder.attributes_to_dict()
#preprocessor._discretizer.attributes_to_dict()
#preprocessor._target_encoder.attributes_to_dict()

### 3. Serialize the preprocessor

In [544]:
pipeline_serialized = preprocessor.serialize_pipeline()

with open(r"./model_json.json", "w") as file:
    file.write(json.dumps(pipeline_serialized, indent=4))
    
#pipeline_serialized

In [545]:
# Look into properties of preprocessors
#pipeline_serialized["target_encoder"] #._bins_by_column

### 4. De-serialize pipeline

In [547]:
# Read serialized pipeline from json
with open(r"./model_json.json", "r") as file:
    json_pipeline_serialized = json.load(file)

# Create new preprocessor object from serialized pipeline
new_preprocessor = PreProcessor.from_pipeline(json_pipeline_serialized)
#new_preprocessor = PreProcessor.from_pipeline(pipeline_serialized)

The target encoder's additive smoothing weight is set to 0. This disables smoothing and may make the encoding prone to overfitting. Increase the weight if needed.


In [529]:
# Look into properties of preprocessors if needed
#new_preprocessor._discretizer.attributes_to_dict()

In [548]:
basetable_transformed = new_preprocessor.transform(basetable,
                                   continuous_vars=continuous_vars,
                                   discrete_vars=discrete_vars)
basetable_transformed.head()

Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 119.08it/s]
Applying target encoding...: 100%|█████████████████████████████████████████████████████| 6/6 [00:00<00:00, 1009.14it/s]
Transforming data took 0.06331968307495117 seconds


Unnamed: 0,DateTime,CategoryVar1,CategoryVar2,CategoryVar3,cont_var1,cont_var2,cont_var3,target,split,cont_var2_bin,...,cont_var1_bin,CategoryVar1_processed,CategoryVar2_processed,CategoryVar3_processed,CategoryVar1_enc,CategoryVar2_enc,CategoryVar3_enc,cont_var2_enc,cont_var3_enc,cont_var1_enc
0,2022-01-01,Category B,Category B,Category C,-0.247175,8.258259,0.039901,1,selection,8.0 - 9.0,...,-0.3 - 0.0,Category B,Category B,Category C,0.505584,0.530256,0.51773,0.516447,0.514851,0.494083
1,2022-01-02,Category B,Category B,Category C,0.247006,1.234493,1.336691,1,train,1.0 - 2.0,...,0.0 - 0.3,Category B,Category B,Category C,0.505584,0.530256,0.51773,0.521311,0.517986,0.529086
2,2022-01-03,Category C,Category A,Category B,0.076415,5.059058,1.323273,1,train,5.0 - 6.0,...,0.0 - 0.3,Category C,Category A,Category B,0.494939,0.461386,0.487052,0.510903,0.517986,0.529086
3,2022-01-04,Category C,Category B,Category A,-0.306355,8.316857,0.077718,0,selection,8.0 - 9.0,...,-0.5 - -0.3,Category C,Category B,Category A,0.494939,0.530256,0.488603,0.516447,0.514851,0.534884
4,2022-01-05,Category C,Category C,Category B,-1.133514,8.773722,0.356009,1,train,8.0 - 9.0,...,-1.3 - -0.8,Category C,Category C,Category B,0.494939,0.502463,0.487052,0.516447,0.484634,0.461078


In [549]:
# Double check transformed basetable is the same
basetable_transformed_orig == basetable_transformed

Unnamed: 0,DateTime,CategoryVar1,CategoryVar2,CategoryVar3,cont_var1,cont_var2,cont_var3,target,split,cont_var2_bin,...,cont_var1_bin,CategoryVar1_processed,CategoryVar2_processed,CategoryVar3_processed,CategoryVar1_enc,CategoryVar2_enc,CategoryVar3_enc,cont_var2_enc,cont_var3_enc,cont_var1_enc
0,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
4996,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
4997,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
4998,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
