In [None]:
import pandas as pd
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate

In [None]:
# Source: Curran Kelleher's iris.csv gist. The first line of the file is the
# header row (sepal_length, sepal_width, petal_length, petal_width, species).
IRIS_CSV_URL = 'https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv'

# Let pandas consume the header row (the default, header=0). Loading with
# header=None turns the header into a data row, which forces every column to
# strings and injects a bogus 'sepal_length'-style value into each feature.
iris = pd.read_csv(IRIS_CSV_URL)
iris.head()

Unnamed: 0,0,1,2,3,4
0,sepal_length,sepal_width,petal_length,petal_width,species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa


In [None]:
class Discretization:
    ''' A process that transforms quantitative data into qualitative data '''

    def __init__(self):
        print('Data discretization process started')

    def get_new_intervals(self, intervals, chi, min_chi):
        ''' Merge the first pair of adjacent intervals whose chi square equals min_chi.

            intervals : list of [low, high] pairs, ordered by value
            chi       : chi square values; chi[i] belongs to the pair
                        (intervals[i], intervals[i+1])
            min_chi   : the chi square value that selects the pair to merge

            Returns a new list in which the selected pair is replaced by a single
            [low, high] interval spanning both; the input list is not modified.
            Raises IndexError when min_chi does not occur in chi. '''

        # np.asarray makes the comparison element-wise even when chi is a plain
        # list of Python floats (list == float would be a single False).
        matches = np.flatnonzero(np.asarray(chi) == min_chi)
        if matches.size == 0:
            raise IndexError('min_chi does not occur in chi')
        merge_at = int(matches[0])

        bounds = intervals[merge_at] + intervals[merge_at + 1]
        merged = [min(bounds), max(bounds)]
        return list(intervals[:merge_at]) + [merged] + list(intervals[merge_at + 2:])

    @staticmethod
    def _chi_square(count_0, count_1):
        ''' Chi square statistic of the 2 x n_classes contingency table formed by
            the per-class counts of two adjacent intervals. '''

        class_totals = count_0 + count_1
        total = class_totals.sum()

        # A class absent from both intervals has an expected count of zero and
        # contributes nothing; skipping it avoids a 0/0 division.
        seen = class_totals > 0

        # Expected Value -> ( Row Sum * Column Sum ) / Total Sum
        expected_0 = class_totals[seen] * count_0.sum() / total
        expected_1 = class_totals[seen] * count_1.sum() / total
        terms = ((count_0[seen] - expected_0) ** 2 / expected_0
                 + (count_1[seen] - expected_1) ** 2 / expected_1)
        return float(terms.sum())

    def get_chimerge_intervals(self, data, colName, label, max_intervals):
        ''' 1. Compute the chi square value for each pair of adjacent intervals
            2. Merge the pair of adjacent intervals with the lowest chi square value
            3. Repeat steps 1 and 2 until at most max_intervals intervals remain

            data          : DataFrame holding the column to discretize and the class column
            colName       : name of the column to discretize
            label         : name of the class column
            max_intervals : largest number of intervals to return (must be >= 1)

            Returns a list of [low, high] intervals ordered by value. Every distinct
            value starts as its own interval, so fewer than max_intervals intervals
            are returned when the column has fewer distinct values. Rows whose
            colName value is missing are ignored. The chi square of the last merge
            is printed when at least one merge was performed.
            Raises ValueError when max_intervals is smaller than 1. '''

        if max_intervals < 1:
            raise ValueError('max_intervals must be at least 1')

        # Missing values cannot be ordered into intervals, so leave them out
        known = data[colName].notna().to_numpy()
        values = data[colName].to_numpy()[known]
        classes = data[label].to_numpy()[known]

        # Sorted distinct values / classes plus, for every row, its position in them
        distinct_vals, value_pos = np.unique(values, return_inverse=True)
        labels, class_pos = np.unique(classes, return_inverse=True)

        # Contingency table built once: one row per distinct value, one column per class
        counts = np.zeros((len(distinct_vals), len(labels)), dtype=np.int64)
        np.add.at(counts, (value_pos, class_pos), 1)

        intervals = [[value, value] for value in distinct_vals]
        interval_counts = list(counts)
        min_chi = None

        # max_intervals >= 1, so at least two intervals exist inside the loop
        while len(intervals) > max_intervals:
            chi = [self._chi_square(interval_counts[i], interval_counts[i + 1])
                   for i in range(len(intervals) - 1)]

            # argmin and get_new_intervals both pick the first minimum, so the
            # interval list and the count rows stay aligned.
            merge_at = int(np.argmin(chi))
            min_chi = chi[merge_at]

            intervals = self.get_new_intervals(intervals, chi, min_chi)
            interval_counts[merge_at:merge_at + 2] = [
                interval_counts[merge_at] + interval_counts[merge_at + 1]
            ]

        # Nothing to report when the column needed no merging at all
        if min_chi is not None:
            print(' Min chi square value is ' + str(min_chi))
        return intervals

In [None]:
if __name__ == '__main__':
    # Discretize every feature column against the class column (the last one)
    max_intervals = 6
    obj = Discretization()
    label_column = iris.columns[-1]
    for colName in iris.columns[:-1]:
        print('\n Interval for', colName)
        intervals = obj.get_chimerge_intervals(
            iris, colName, label_column, max_intervals
        )
        # One table cell containing the whole interval list
        table = tabulate([[intervals]], tablefmt='fancy_grid')
        print(table)

Data discretization process started

 Interval for 0
 Min chi square value is 5.172413793103449
╒════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╕
│ [['4.3', '4.8'], ['4.9', '5.4'], ['5.5', '5.7'], ['5.8', '7.0'], ['7.1', '7.9'], ['sepal_length', 'sepal_length']] │
╘════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╛

 Interval for 1
 Min chi square value is 2.357142857142857
╒══════════════════════════════════════════════════════════════════════════════════════════════════════════════════╕
│ [['2.0', '2.4'], ['2.5', '2.8'], ['2.9', '2.9'], ['3.0', '3.3'], ['3.4', '4.4'], ['sepal_width', 'sepal_width']] │
╘══════════════════════════════════════════════════════════════════════════════════════════════════════════════════╛

 Interval for 2
 Min chi square value is 1.8536931818181814
╒══════════════════════════════════════════════════════════════════════════