In [15]:
import itertools as it

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics

In [16]:
def gini_impurity(y):
  '''
  Given a Pandas Series, it calculates the Gini Impurity.

  y: Pandas Series with the variable to calculate the Gini Impurity of.

  Returns 1 - sum(p_i ** 2), where p_i is the count of each distinct value
  in y divided by the length of y.
  Raises TypeError if y is not a Pandas Series.

  Note: value_counts() leaves out NaN by default while the denominator is
  the full length of y, so with missing values the p_i sum to less than 1.
  '''
  if not isinstance(y, pd.Series):
    # A bare string cannot be raised: Python would replace it with
    # "exceptions must derive from BaseException" and hide this message.
    raise TypeError('Object must be a Pandas Series.')

  p = y.value_counts()/y.shape[0]
  return 1 - np.sum(p**2)

In [17]:
def entropy(y):
  '''
  Given a Pandas Series, it calculates the entropy.

  y: Pandas Series with the variable to calculate the entropy of.

  Returns sum(-p_i * log2(p_i + 1e-9)), where p_i is the count of each
  distinct value in y divided by the length of y.
  Raises TypeError if y is not a Pandas Series.
  '''
  if not isinstance(y, pd.Series):
    # A bare string cannot be raised: Python would replace it with
    # "exceptions must derive from BaseException" and hide this message.
    raise TypeError('Object must be a Pandas Series.')

  a = y.value_counts()/y.shape[0]
  # The small constant keeps log2 away from an exact zero argument.
  return np.sum(-a*np.log2(a+1e-9))

In [18]:
def categorical_options(a):
  '''
  Creates all possible combinations from a Pandas Series.

  a: Pandas Series from where to get all possible combinations.

  Returns a list of lists: every subset of the distinct values of a except
  the empty subset and the subset holding all values, ordered by subset
  size. A Series with fewer than two distinct values yields an empty list.

  Requires itertools to be imported as `it` at the top of the notebook.
  '''
  a = a.unique()

  # Sizes 1 .. len(a)-1 leave out the empty subset and the full set up
  # front, so nothing has to be sliced off the result afterwards.
  return [
    list(subset)
    for size in range(1, len(a))
    for subset in it.combinations(a, size)
  ]


In [19]:
def variance(y):
  '''
  Function to help calculate the variance avoiding nan.

  y: variable to calculate variance to. It should be a Pandas Series.

  Returns 0 when y has fewer than two elements, otherwise y.var().
  '''
  # y.var() is NaN for a single element and also for an empty Series, so
  # both cases are short-circuited to 0.
  if len(y) < 2:
    return 0
  return y.var()

In [20]:
def information_gain(y, mask, func=gini_impurity):
  '''
  It returns the Information Gain of a variable given a loss function.

  y: target variable.
  mask: split choice, a boolean mask with one entry per element of y.
        True entries form one side of the split, False entries the other.
  func: function to be used to calculate Information Gain in case of
        classification. It is called on y and on each side of the split.

  Returns func(y) minus the size-weighted func of both sides, or 0 when
  the mask puts every element on the same side.
  '''
  a = mask.sum()           # elements on the True side
  b = mask.shape[0] - a    # elements on the False side

  # A split that leaves one side empty separates nothing.
  if a == 0 or b == 0:
    return 0

  # `~` is the logical-not for boolean masks; unary minus is rejected by
  # NumPy boolean arrays with a TypeError.
  return func(y) - a/(a+b)*func(y[mask]) - b/(a+b)*func(y[~mask])

In [21]:
def max_information_gain_split(x, y, func=gini_impurity):
  '''
  Finds the split of a predictor that yields the highest information gain.

  x: predictor variable as Pandas Series.
  y: target variable as Pandas Series.
  func: function handed to information_gain to score each candidate split.

  Returns a 4-tuple (best gain, best split, is-numeric flag, True).
  The best split is a threshold (rows with x < threshold form one side)
  for numeric predictors, or a list of values (rows whose x is in the list
  form one side) for object-dtype predictors.
  When there is no candidate split it returns (None, None, None, False).
  '''
  # Every dtype other than object is handled as numeric.
  numeric_variable = bool(x.dtypes != 'O')

  if numeric_variable:
    # The smallest value is skipped: nothing is strictly below it.
    candidates = x.sort_values().unique()[1:]
  else:
    candidates = categorical_options(x)

  # Score every candidate, keeping gains and candidates aligned by position.
  scored = [
    (information_gain(y, x < cand if numeric_variable else x.isin(cand), func), cand)
    for cand in candidates
  ]

  # No candidate means no split is possible for this predictor.
  if not scored:
    return (None, None, None, False)

  gains = [gain for gain, _ in scored]
  top_gain = max(gains)
  # index() picks the first candidate that reaches the highest gain.
  top_position = gains.index(top_gain)
  return (top_gain, scored[top_position][1], numeric_variable, True)

In [22]:
# Load the BMI dataset from the local CSV file and preview its first rows.
data = pd.read_csv('BMIdata.csv')
data.head()

Unnamed: 0,Gender,Height,Weight,Index
0,Male,174,96,4
1,Male,189,87,2
2,Female,185,110,4
3,Female,195,104,3
4,Male,149,61,3


In [23]:
# Build the binary target 'obese' (1 when Index is 4 or higher, else 0) and
# discard the raw Index column. The guard makes the cell safe to re-run:
# once Index has been dropped there is nothing left to convert.
if 'Index' in data.columns:
  data = (
    data
    .assign(obese=(data['Index'] >= 4).astype('int'))
    .drop(columns='Index')
  )

In [24]:
# Gini impurity of the 'obese' target over the whole dataset.
gini_impurity(data['obese'])

0.45132799999999995

In [25]:
# Entropy of the 'obese' target over the whole dataset.
entropy(data['obese'])

0.9285950911297123

In [26]:
# Best Weight split when candidate splits are scored with Gini impurity
weight_ig, weight_split, _, _ = max_information_gain_split(
  x=data['Weight'],
  y=data['obese'],
  func=gini_impurity,
)

print(
  "The best split for Weight is when the variable is less than ",
  weight_split,
  "\nInformation Gain for that split is:",
  weight_ig,
)

The best split for Weight is when the variable is less than  103 
Information Gain for that split is: 0.21261503330701417


In [27]:
# Best Weight split when candidate splits are scored with entropy
weight_ig, weight_split, _, _ = max_information_gain_split(
  x=data['Weight'],
  y=data['obese'],
  func=entropy,
)
print(
  "The best split for Weight is when the variable is less than ",
  weight_split,
  "\nInformation Gain for that split is:",
  weight_ig,
)

The best split for Weight is when the variable is less than  103 
Information Gain for that split is: 0.3824541370911895


In [28]:
# Best Weight split when candidate splits are scored with the variance helper
weight_ig, weight_split, _, _ = max_information_gain_split(
  x=data['Weight'],
  y=data['obese'],
  func=variance,
)

print(
  "The best split for Weight is when the variable is less than ",
  weight_split,
  "\nInformation Gain for that split is:",
  weight_ig,
)

The best split for Weight is when the variable is less than  103 
Information Gain for that split is: 0.10625190497954967
