## This notebook demonstrates how to use the feature handler through a series of examples.

## Example 1. Create some dummy numerical features, add them to the handler, and view the scaled (using standard z-score scaling) and unscaled features.

In [1]:
import pandas as pd

from cyclops.processors.feature_handler import FeatureHandler

# Build a frame with row labels 0, 1, 2, 4 and three feature columns.
# Only rows 0-2 are filled in below; row 4 is never assigned, so it stays NaN.
feature_names = ["feature1", "feature2", "feature3"]
example_features = pd.DataFrame(columns=feature_names, index=[0, 1, 2, 4])

row_values = {
    0: [10, 9.1, 3],
    1: [14, 99.9, 0],
    2: [13, 50, 0.8],
}
for row_label, values in row_values.items():
    example_features.loc[row_label] = values

# Handler configured with the "standard" normalization method.
feature_handler = FeatureHandler(normalization_method="standard")
feature_handler.add_features(example_features)

# Print the stored features, then the unscaled and scaled views, in that order.
for view_name in ("features", "unscaled", "scaled"):
    print(getattr(feature_handler, view_name))

2022-03-31 11:27:14,180 [1;37mINFO[0m cyclops.processors.feature_handler - Log file is /home/amritk/projects/cyclops/log.log


   feature1  feature2  feature3
0      10.0       9.1       3.0
1      14.0      99.9       0.0
2      13.0      50.0       0.8
4       NaN       NaN       NaN
   feature1  feature2  feature3
0      10.0       9.1       3.0
1      14.0      99.9       0.0
2      13.0      50.0       0.8
4       NaN       NaN       NaN
   feature1  feature2  feature3
0 -1.372813 -1.182345  1.366530
1  0.980581  1.263143 -0.998618
2  0.392232 -0.080798 -0.367912
4       NaN       NaN       NaN


## Example 2. Add the same example features to the handler and view the scaled (using min-max scaling) and unscaled features.

In [2]:
# Fresh handler using the "min-max" normalization method, fed the same
# `example_features` frame that was built in the previous cell.
feature_handler = FeatureHandler(normalization_method="min-max")
feature_handler.add_features(example_features)

# Print the stored features, then the unscaled and scaled views, in that order.
for view_name in ("features", "unscaled", "scaled"):
    print(getattr(feature_handler, view_name))

   feature1  feature2  feature3
0      10.0       9.1       3.0
1      14.0      99.9       0.0
2      13.0      50.0       0.8
4       NaN       NaN       NaN
   feature1  feature2  feature3
0      10.0       9.1       3.0
1      14.0      99.9       0.0
2      13.0      50.0       0.8
4       NaN       NaN       NaN
   feature1  feature2  feature3
0      0.00  0.000000  1.000000
1      1.00  1.000000  0.000000
2      0.75  0.450441  0.266667
4       NaN       NaN       NaN


## Example 3. Add 2 numerical features and 1 categorical feature, inspect names and types of features.

#### Note that categorical features must be encoded as alphabetic strings; otherwise they will be treated as numerical features. Missing values (NaNs) in a categorical feature are automatically converted to a new missing category.

#### Categorical features with more than 2 categories are automatically one-hot encoded to binary variables.

In [3]:
# feature1 holds string categories; feature2 and feature3 hold numbers.
# Row 4 is never assigned, so it stays NaN. Row 3 is not in the initial index,
# so assigning to .loc[3] appends it after row 4.
feature_names = ["feature1", "feature2", "feature3"]
example_features = pd.DataFrame(columns=feature_names, index=[0, 1, 2, 4])

row_values = {
    0: ["A", 9.1, 2],
    1: ["B", 99.9, 1],
    2: ["C", 50, 0],
    3: ["A", 50, 3],
}
for row_label, values in row_values.items():
    example_features.loc[row_label] = values

# Handler created with its default settings.
feature_handler = FeatureHandler()
feature_handler.add_features(example_features)

# Print the stored features, the scaled view, and the feature names and types.
for attribute_name in ("features", "scaled", "names", "types"):
    print(getattr(feature_handler, attribute_name))

# Print which features the handler reports as categorical and as numerical.
print(feature_handler.get_categorical_features())
print(feature_handler.get_numerical_features())

   feature1-A  feature1-B  feature1-C  feature1-MISSING_CATEGORY  feature2  \
0           1           0           0                          0       9.1   
1           0           1           0                          0      99.9   
2           0           0           1                          0      50.0   
4           0           0           0                          1       NaN   
3           1           0           0                          0      50.0   

   feature3  
0       2.0  
1       1.0  
2       0.0  
4       NaN  
3       3.0  
   feature1-A  feature1-B  feature1-C  feature1-MISSING_CATEGORY  feature2  \
0           1           0           0                          0 -1.340837   
1           0           1           0                          0  1.480669   
2           0           0           1                          0 -0.069916   
4           0           0           0                          1       NaN   
3           1           0           0                    

## Example 4. Add 2 numerical features, drop one of them.

In [4]:
def show_numerical_state(handler):
    """Print the handler's features followed by its numerical feature names."""
    print(handler.features)
    print(handler.get_numerical_features())


# Two numeric columns. Row 4 is never assigned, so it stays NaN; row 3 is not
# in the initial index, so assigning to .loc[3] appends it after row 4.
feature_names = ["feature1", "feature2"]
example_features = pd.DataFrame(columns=feature_names, index=[0, 1, 2, 4])

row_values = {
    0: [9.1, 2],
    1: [99.9, 1],
    2: [50, 0],
    3: [50, 3],
}
for row_label, values in row_values.items():
    example_features.loc[row_label] = values

feature_handler = FeatureHandler()
feature_handler.add_features(example_features)

# State before and after dropping "feature1".
show_numerical_state(feature_handler)
feature_handler.drop_features("feature1")
show_numerical_state(feature_handler)

   feature1  feature2
0       9.1       2.0
1      99.9       1.0
2      50.0       0.0
4       NaN       NaN
3      50.0       3.0
['feature1', 'feature2']
   feature2
0       2.0
1       1.0
2       0.0
4       NaN
3       3.0
['feature2']


## Example 5. Add 2 numerical features, make one of them a target variable.

In [5]:
# Two numeric columns. Row 4 is never assigned, so it stays NaN; row 3 is not
# in the initial index, so assigning to .loc[3] appends it after row 4.
feature_names = ["feature1", "feature2"]
example_features = pd.DataFrame(columns=feature_names, index=[0, 1, 2, 4])

row_values = {
    0: [9.1, 2],
    1: [99.9, 1],
    2: [50, 0],
    3: [50, 3],
}
for row_label, values in row_values.items():
    example_features.loc[row_label] = values

feature_handler = FeatureHandler()
feature_handler.add_features(example_features)

# Print the features and the numerical feature names before marking a target.
print(feature_handler.features)
print(feature_handler.get_numerical_features())

# Mark "feature1" as a target and print the handler's targets.
feature_handler.set_targets("feature1")
print(feature_handler.targets)

   feature1  feature2
0       9.1       2.0
1      99.9       1.0
2      50.0       0.0
4       NaN       NaN
3      50.0       3.0
['feature1', 'feature2']
['feature1']
