# Introduction

This notebook walks you through the __xverse__ package in detail. It includes the code for every step, so you can use it as a template to apply to your own data.

# Import data

In [34]:
import numpy as np
import pandas as pd

df = pd.read_csv("./data/bank.csv", sep="|")

In [35]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,target
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,0
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,0
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,0
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,0
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,0


# 1. Feature Subset

This option is used to select a subset of features from the dataset. Provide the list of features you want to keep.

In [36]:
from xverse.feature_subset import FeatureSubset

# Use the public `select_dtypes` API instead of the private `_get_numeric_data` helper.
# NOTE(review): the private helper may also treat bool columns as numeric, whereas
# include="number" does not — confirm if your data contains bool columns.
numerical_features = list(df.select_dtypes(include="number").columns)
# Every column that is not numeric is treated as categorical.
categorical_features = list(df.columns.difference(numerical_features))
print(numerical_features)

clf = FeatureSubset(numerical_features)  # select only numeric features
df = clf.fit_transform(df)  # returns the dataframe with selected features

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous', 'target']


In [37]:
df.head()


Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,target
0,30,1787,19,79,1,-1,0,0
1,33,4789,11,220,1,339,4,0
2,35,1350,16,185,1,330,1,0
3,30,1476,3,199,4,-1,0,0
4,59,0,5,226,1,-1,0,0


# 2. Split X and Y

This option is used to split the dataset into features (X) and target (y). All it needs is the target column as a list. We reload the original dataframe for this exercise.

In [38]:
df = pd.read_csv("./data/bank.csv", sep="|")

In [39]:
from xverse.feature_subset import SplitXY

# SplitXY receives the target column name(s) as a list and separates them from the features.
clf = SplitXY(["target"])  # configure the splitter with the target column
X, y = clf.fit_transform(
    df
)  # X: features dataset (target removed); y: target values as a numpy array
(X, y)  # display both pieces together; the next cells inspect X and y individually


(      age  balance  campaign   contact  day default  duration  education  \
 0      30     1787         1  cellular   19      no        79    primary   
 1      33     4789         1  cellular   11      no       220  secondary   
 2      35     1350         1  cellular   16      no       185   tertiary   
 3      30     1476         4   unknown    3      no       199   tertiary   
 4      59        0         1   unknown    5      no       226  secondary   
 ...   ...      ...       ...       ...  ...     ...       ...        ...   
 4516   33     -333         5  cellular   30      no       329  secondary   
 4517   57    -3313         1   unknown    9     yes       153   tertiary   
 4518   57      295        11  cellular   19      no       151  secondary   
 4519   28     1137         4  cellular    6      no       129  secondary   
 4520   44     1136         2  cellular    3      no       345   tertiary   
 
      housing            job loan  marital month  pdays poutcome  previous

In [40]:
X.head()

Unnamed: 0,age,balance,campaign,contact,day,default,duration,education,housing,job,loan,marital,month,pdays,poutcome,previous
0,30,1787,1,cellular,19,no,79,primary,no,unemployed,no,married,oct,-1,unknown,0
1,33,4789,1,cellular,11,no,220,secondary,yes,services,yes,married,may,339,failure,4
2,35,1350,1,cellular,16,no,185,tertiary,yes,management,no,single,apr,330,failure,1
3,30,1476,4,unknown,3,no,199,tertiary,yes,management,yes,married,jun,-1,unknown,0
4,59,0,1,unknown,5,no,226,secondary,yes,blue-collar,no,married,may,-1,unknown,0


In [41]:
y

array([0, 0, 0, ..., 0, 0, 0], shape=(4521,))

Going forward, we will run this step repeatedly to demonstrate each of the features, so I wrapped it in a `prep_dataset` function, which is shown below.

In [42]:
def prep_dataset(path="./data/bank.csv", sep="|", target="target"):
    """Read the dataset from disk and split it into features and target.

    Parameters
    ----------
    path : str, default "./data/bank.csv"
        Location of the delimited file to read.
    sep : str, default "|"
        Field delimiter passed to ``pd.read_csv``.
    target : str or list of str, default "target"
        Name(s) of the target column(s) handed to ``SplitXY``.

    Returns
    -------
    X, y
        The pair produced by ``SplitXY.fit_transform``: the features
        dataset (X) and the target (y) as a numpy array.
    """
    # Local import keeps the helper usable even if it is run before the
    # cell that imports SplitXY at notebook level.
    from xverse.feature_subset import SplitXY

    # Accept a single column name as well as a list of names.
    target_columns = [target] if isinstance(target, str) else list(target)

    # The file is re-read on every call, so each demo section starts from
    # an unmodified copy of the data.
    data = pd.read_csv(path, sep=sep)

    splitter = SplitXY(target_columns)  # Split the dataset into X and y
    X, y = splitter.fit_transform(data)

    return X, y

# 3. Monotonic Binning for numerical variables

Monotonically bin numeric variables based on the target. The binning operation starts with the "max_bins" option and iterates by reducing the number of bins until it finds bins with a monotonic relationship (either increasing or decreasing) between X and y. If the module is unable to find a monotonic relationship, it forcefully creates bins using the "force_bins" option.

In [43]:
X, y = prep_dataset()

In [48]:
from xverse.transformer import MonotonicBinning

# Learn monotonic bin edges for the features against the target, using the default options.
clf = MonotonicBinning()
clf.fit(X, y)
# NOTE(review): in the recorded output below, the run ends with KeyError: 'duration' and the
# fitted bins are keyed by position ('0', '1', ...) rather than by column name — presumably
# a mismatch between the installed xverse and pandas versions; TODO confirm and re-run.
clf.transform(X)

initiated monotonic binning 
['age', 'balance', 'campaign', 'day', 'duration', 'pdays', 'previous']
bins - [19. 35. 45. 87.] & mapped to {'0': array([19., 35., 45., 87.])}


  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.group

bins - [-3313.           174.           979.33333333 71188.        ] & mapped to {'1': array([-3313.        ,   174.        ,   979.33333333, 71188.        ])}
bins - [ 1.  3. 50.] & mapped to {'2': array([ 1.,  3., 50.])}
bins - [ 1. 12. 20. 31.] & mapped to {'3': array([ 1., 12., 20., 31.])}
bins - [   4.          128.          261.33333333 3025.        ] & mapped to {'4': array([   4.        ,  128.        ,  261.33333333, 3025.        ])}
bins - [-1.00e+00 -5.00e-01  1.00e+00  8.71e+02] & mapped to {'5': array([-1.00e+00, -5.00e-01,  1.00e+00,  8.71e+02])}
bins - [ 0.  1. 25.] & mapped to {'6': array([ 0.,  1., 25.])}


  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.groupby("Bins", as_index=True)
  bins_X_grouped = bins_X.group

KeyError: 'duration'

In [45]:
clf.bins

{'0': array([19., 35., 45., 87.]),
 '1': array([-3313.        ,   174.        ,   979.33333333, 71188.        ]),
 '2': array([ 1.,  3., 50.]),
 '3': array([ 1., 12., 20., 31.]),
 '4': array([   4.        ,  128.        ,  261.33333333, 3025.        ]),
 '5': array([-1.00e+00, -5.00e-01,  1.00e+00,  8.71e+02]),
 '6': array([ 0.,  1., 25.])}

In [46]:
output_bins = clf.bins  # will be used later in this exercise

In [47]:
out_X = clf.transform(X)

KeyError: 'duration'

In [13]:
out_X.head()

NameError: name 'out_X' is not defined

## 3.1 Available options in the package for Monotonic binning

    Parameters
    ----------
    feature_names: 'all' or list (default='all')
        list of features to perform monotonic binning operation. 
        - 'all' (default): All features in the dataset will be used
        - list of features: ['age', 'income',......]
    
    max_bins: int (default=20)
        Maximum number of bins that can be created for any given variable. The final number of bins created will be less than or equal to this number.
        
    force_bins: int (default=3)
        It forces the module to create bins for a variable, when it cannot find monotonic relationship using "max_bins" option. The final number of bins created will be equal to the number specified.
        
    cardinality_cutoff: int (default=5)
        Cutoff to determine whether a variable is eligible for the monotonic binning operation. Any variable with fewer unique levels than this number is treated as a character variable. No binning operation is performed on such a variable; its unique levels are returned as its bins.
    
    prefix: string (default=None)
        Variable prefix to be used for the column created by monotonic binning. 
        
    custom_binning: dict (default=None)
        Dictionary structure - {'feature_name': float list}
        Example - {'age': [0., 1., 2., 3.]}
        Using this parameter, the user can perform custom binning on variables. This parameter is also used to apply previously computed bins for each feature (Score new data). 

## 3.2 Using the custom binning option in the future to score new data - Monotonic binning

If you want to apply the bins to new data, simply use the transform function with the custom binning option.

In [16]:
X, y = prep_dataset()

In [None]:
clf = MonotonicBinning(custom_binning=output_bins)  # output_bins was created earlier

out_X = clf.transform(X)
out_X.head()

## 3.3 What happens if my data has missing values?

In [18]:
X, y = prep_dataset()

In [19]:
# Draw the mask from a seeded generator so the same ~10% of cells are blanked on every run.
X = X.mask(np.random.default_rng(42).random(X.shape) < 0.1)  # introduce some missing values randomly

In [None]:
out_X = clf.transform(X)
out_X.head()

As you can see above, missing values (for example, in the 'balance' feature) are not considered by the binning operation, so the output dataset will still contain missing values. It is advised to impute missing values before you use this operation.

# 4. Weight of Evidence

In [21]:
X, y = prep_dataset()

In [None]:
from xverse.transformer import WOE

clf = WOE()
clf.fit(X, y)

In [None]:
clf.woe_df.head(10)

In [None]:
clf.iv_df

In [25]:
output_woe_bins = clf.woe_bins  # future transformation

In [None]:
output_woe_bins

In [27]:
output_mono_bins = clf.mono_custom_binning  # future transformation

In [None]:
output_mono_bins

In [None]:
clf.transform(X).head()

## 4.1 Available options in the package for WOE

    Parameters
    ----------
    feature_names: 'all' or list (default='all')
        list of features to perform WOE transformation. 
        - 'all' (default): All categorical features in the dataset will be used
        - list of features: ['age', 'income',......]
    
    exclude_features: list (default=None)
        list of features to be excluded from WOE transformation.
        - Example - ['age', 'income', .......]
        
    woe_prefix: string (default=None)
        Variable prefix to be used for the column created by WOE transformer. The default value is set 'None'.  
        
    treat_missing: {'separate', 'mode', 'least_frequent'} (default='separate')
        This parameter setting is used to handle missing values in the dataset.
        'separate' - Missing values are treated as their own group (category)
        'mode' - Missing values are combined with the most frequent item in the dataset
        'least_frequent' - Missing values are combined with the least frequent item in the dataset
    
    woe_bins: dict of dicts(default=None)
        This feature is added as part of future WOE transformations or scoring. If this value is set, then WOE values provided for each of the features here will be used for transformation. Applicable only in the transform method. 
        Dictionary structure - {'feature_name': {'level': WOE value}}
        Example - {'education': {'primary': 0.1, 'tertiary': 0.5, 'secondary': 0.7}}
    
    monotonic_binning: bool (default=True)
        This parameter is used to perform monotonic binning on numeric variables. If set to False, numeric variables would be ignored.
    
    mono_feature_names: 'all' or list (default='all')
        list of features to perform monotonic binning operation. 
        - 'all' (default): All features in the dataset will be used
        - list of features: ['age', 'income',......]
    
    mono_max_bins: int (default=20)
        Maximum number of bins that can be created for any given variable. The final number of bins created will be less than or equal to this number.
        
    mono_force_bins: int (default=3)
        It forces the module to create bins for a variable, when it cannot find monotonic relationship using "max_bins" option. The final number of bins created will be equal to the number specified.
        
    mono_cardinality_cutoff: int (default=5)
        Cutoff to determine whether a variable is eligible for the monotonic binning operation. Any variable with fewer unique levels than this number is treated as a character variable. No binning operation is performed on such a variable; its unique levels are returned as its bins.
    
    mono_prefix: string (default=None)
        Variable prefix to be used for the column created by monotonic binning. 
        
    mono_custom_binning: dict (default=None)
        Using this parameter, the user can perform custom binning on variables. This parameter is also used to apply previously computed bins for each feature (Score new data).
        Dictionary structure - {'feature_name': float list}
        Example - {'age': [0., 1., 2., 3.]}

## 4.2 Using the custom binning option in the future to score new data - WOE

If you want to apply the bins to new data, simply use the transform function with the binning options available in WOE.

In [30]:
X, y = prep_dataset()

In [None]:
# Build a WOE transformer from the mappings saved earlier; no fit call is made here.
clf = WOE(
    woe_bins=output_woe_bins, mono_custom_binning=output_mono_bins
)  # output_woe_bins and output_mono_bins were saved from the fitted WOE object in section 4

out_X = clf.transform(X)
out_X.head()

## 4.3 What happens if my data has missing values?

Use one of the options below to handle missing values

    Parameters
    ----------
    treat_missing: {'separate', 'mode', 'least_frequent'} (default='separate')
        This parameter setting is used to handle missing values in the dataset.
        'separate' - Missing values are treated as their own group (category)
        'mode' - Missing values are combined with the most frequent item in the dataset
        'least_frequent' - Missing values are combined with the least frequent item in the dataset

In [32]:
X, y = prep_dataset()

In [33]:
clf = WOE(treat_missing="mode")

In [None]:
clf.fit(X, y)

In [None]:
clf.transform(X).head()

In [36]:
clf.woe_prefix = "woe"  # use this if you want to create a new column instead of replacing the existing column

In [None]:
clf.transform(X).head()

## 4.4 Graph chart

In [38]:
from xverse.graph import BarCharts

In [39]:
woe_df = clf.woe_df

In [45]:
clf = BarCharts(bar_type="v")

In [None]:
# %matplotlib inline
clf.plot(woe_df)

# 5. VotingSelector

Select the input features for a binary prediction model using a voting technique. Multiple feature selection techniques (linear and non-linear) are applied to the dataset, and the votes secured by each input feature for the given binary target are counted.

In [47]:
X, y = prep_dataset()

In [None]:
from xverse.ensemble import VotingSelector

clf = VotingSelector()
clf.fit(X, y)

In [None]:
clf.available_techniques

In [None]:
clf.feature_importances_

In [None]:
clf.feature_votes_

In [None]:
clf.transform(X).head()

## 5.1 Available options in the package for VotingSelector

    Parameters
    ----------
    
    feature_names: 'all' or list (default='all')
        list of features to perform WOE transformation. 
        'all' (default) - All categorical features in the dataset will be used
        list of features - ['age', 'income',......]
    
    exclude_features: list (default=None)
        list of features to be excluded from WOE transformation.
        - Example - ['age', 'income', .......]
    
    selection_techniques: 'all', 'quick' or list(default='all')
        List of selection techniques to be applied on the data. Available techniques - Weight of evidence ('WOE'), Random Forest ('RF'), Recursive Feature Elimination ('RFE'), Extra Trees Classifier ('ETC'), Chi Square ('CS'), L1 feature selection ('L_ONE').
        
        'all' - Apply all selection techniques ['WOE', 'RF', 'RFE', 'ETC', 'CS', 'L_ONE']
        'quick' - ['WOE','RF','ETC']
        list - user provided list of feature selection techniques from available techniques 
    
    no_of_featues: 'auto', 'sqrt' or int(default='auto')
        Number of features to be selected by each selection technique.
        'auto' - len(features)/2
        'sqrt' - sqrt(len(features)) rounded to the lowest number
        int - user provided number in integer format
    
    handle_category= 'woe' or 'le' (default='woe')
        Handle category values transformation using Label encoder or Weight of Evidence option. Takes care of missing values too. It treats missing values as separate level.
        'woe' - use weight of evidence transformation
        'le' - use label encoder transformation
    
    numerical_missing_values= 'median', 'mean' or 0 (default='median')
        Handle numerical variable missing values.
        'median' - use median of the column
        'mean' - use mean of the column
        0 - use 0 to impute the missing values
    
    minimum_votes = int (default=0)
        Minimum number of votes needed to select a variable after feature selection. Only used in the transform process. Default value is set to 0 to select all variables.

## 5.2 Future transformation and select variables with minimum number of votes

In [53]:
clf.minimum_votes = 3  # keep only variables that received at least 3 votes (applied at transform time)

In [None]:
clf.transform(X).head()

## 5.3 Subset feature selection option

In [None]:
clf = VotingSelector(selection_techniques=["WOE", "RF", "RFE", "ETC"])
clf.fit(X, y)

In [None]:
clf.feature_votes_

# 6. Pipeline feature

In [57]:
from sklearn.pipeline import Pipeline

# Chain the two xverse steps: first split off the target column,
# then run the voting-based feature selector.
clf = Pipeline(
    steps=[
        ("split_x_y", SplitXY(["target"])),
        ("feature_votes", VotingSelector()),
    ]
)

In [None]:
clf.fit(df, df["target"])

In [None]:
clf.transform(df).head()