In [13]:
import numpy as np
import pandas as pd

In [14]:
def information_value(target, feature, zero_adjust=0.5):
    """
    Calculate the Weight of Evidence (WOE) table and the Information Value (IV)
    of a discrete variable against a binary target.

    For every distinct value (bin) of ``feature`` the function counts the
    positive ("good", target == 1) and negative ("bad", target == 0) rows,
    converts the counts to shares of the overall good / bad totals and computes
    ``WOE = ln(good_percent / bad_percent)`` and
    ``IV = (good_percent - bad_percent) * WOE``.

    Note: rows whose feature value is missing are left out of the per-bin table
    by pandas' default ``groupby`` behaviour, while the good / bad totals are
    still computed over every element of ``target``.

    :param target: array-like, actual values where 1 represents positive case and 0 represents negative case
    :param feature: array-like, discrete variable to calculate IV
    :param zero_adjust: float, pseudo-count used in place of a good or bad count that is
        exactly zero when computing the percentages, so that WOE stays finite. Bins that
        contain both classes are not affected. Pass 0 to disable the adjustment.
    :return: tuple ``(iv_value, iv_table)`` where ``iv_value`` is the float Information Value
        of the feature and ``iv_table`` is a DataFrame indexed by feature value with the
        columns ``bad_count``, ``bad_percent``, ``good_count``, ``good_percent``, ``WOE``, ``IV``.
        The count columns hold the raw counts; the percent columns hold the values actually
        used for WOE (i.e. after the zero adjustment, if any was needed).
    :raises ValueError: if ``zero_adjust`` is negative, or if ``target`` does not contain at
        least one positive and one negative case (IV is undefined in that situation).
    """
    if zero_adjust < 0:
        raise ValueError('zero_adjust must be non-negative')

    observations = pd.DataFrame({'feature': feature, 'y': target})

    tot_good = np.sum(target)
    tot_bad = len(target) - tot_good
    if tot_good <= 0 or tot_bad <= 0:
        # Dividing by a zero total would silently turn every percentage into inf/NaN.
        raise ValueError('target must contain at least one positive (1) and one negative (0) case')

    per_bin = observations.groupby('feature')['y']
    good_count = per_bin.sum()
    # size() counts every row of the bin, so size - sum is the number of negatives.
    bad_count = per_bin.size() - good_count

    # Only cells that are exactly zero receive the pseudo-count; everything else is untouched,
    # so features without empty cells yield the same numbers as the unadjusted formula.
    bad_for_ratio = bad_count.astype(float).where(bad_count != 0, zero_adjust)
    good_for_ratio = good_count.astype(float).where(good_count != 0, zero_adjust)

    iv_table = pd.DataFrame({'bad_count': bad_count, 'good_count': good_count})
    iv_table['bad_percent'] = bad_for_ratio / tot_bad
    iv_table['good_percent'] = good_for_ratio / tot_good
    iv_table['WOE'] = np.log(iv_table['good_percent'] / iv_table['bad_percent'])
    iv_table['IV'] = (iv_table['good_percent'] - iv_table['bad_percent']) * iv_table['WOE']
    iv_value = float(iv_table['IV'].sum())

    return iv_value, iv_table[['bad_count', 'bad_percent', 'good_count', 'good_percent', 'WOE', 'IV']]

In [15]:
# Load the Titanic passenger data from a path relative to the notebook and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the CSV was saved
# together with its index; consider reading it with index_col=0 — confirm against the file.
titanic = pd.read_csv('data/titanic.csv')
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [16]:
# Measure how strongly the passenger class separates survivors from non-survivors.
target = titanic['Survived']
feature = titanic['Pclass']

iv_value, iv_table = information_value(target=target, feature=feature)

print(iv_table)
print('information value:', iv_value)

         bad_count  bad_percent  good_count  good_percent       WOE        IV
feature                                                                      
1               80     0.145719         136      0.397661  1.003916  0.252928
2               97     0.176685          87      0.254386  0.364485  0.028321
3              372     0.677596         119      0.347953 -0.666483  0.219701
information value: 0.500949737583947
