# Binning in Python and Pandas  - Bernd Klein


Original source:  https://www.python-course.eu/pandas_python_binning.php

### Python approach

**Copied from source**

In [77]:
def create_bins(lower_bound, width, quantity):
    """Build an ascending list of adjacent, equal-width intervals.

    Each interval is a ``(low, low + width)`` tuple. The first interval
    starts at ``lower_bound`` and every following interval starts where
    the previous one ends.

    Note: the stop value of the underlying ``range`` lies one step beyond
    ``quantity`` widths, so for a positive ``width`` and a non-negative
    ``quantity`` the result holds ``quantity + 1`` intervals
    (e.g. ``quantity=5`` yields six tuples).
    """
    upper_limit = lower_bound + quantity * width + 1
    return [(start, start + width)
            for start in range(lower_bound, upper_limit, width)]

In [14]:
# Alternative call, kept for reference:
# bins = create_bins(lower_bound=10, width=20, quantity=5)
bins = create_bins(lower_bound=7, width=11, quantity=5)

bins

[(7, 18), (18, 29), (29, 40), (40, 51), (51, 62), (62, 73)]

**My twist**

In [15]:
def createBins(start, binWidth, numberOfBins):
    """Return adjacent ``(low, low + binWidth)`` tuples beginning at ``start``.

    Compact variant of ``create_bins``: the lower edges come from a
    ``range`` stepping by ``binWidth`` whose stop value lies one past
    ``numberOfBins`` widths, so for a positive ``binWidth`` the list holds
    ``numberOfBins + 1`` tuples.
    """
    stop = start + binWidth * numberOfBins + 1
    lower_edges = range(start, stop, binWidth)
    return [(edge, edge + binWidth) for edge in lower_edges]

In [21]:
# Same arguments as the earlier create_bins call; the result is identical.
bins = createBins(start=7, binWidth=11, numberOfBins=5)
bins

[(7, 18), (18, 29), (29, 40), (40, 51), (51, 62), (62, 73)]

**Copied from source**

In [17]:
def find_bin(value, bins):
    """Return the index of the first interval in ``bins`` that contains ``value``.

    ``bins`` is a list of ``(low, high)`` tuples such as
    ``[(0, 20), (20, 40), (40, 60)]``. Intervals are treated as half-open:
    an interval matches when ``low <= value < high``.

    Returns ``-1`` when no interval matches. Since ``-1`` is also a valid
    list index, callers should check for it before indexing into ``bins``.
    """
    for position, interval in enumerate(bins):
        if interval[0] <= value < interval[1]:
            return position
    return -1

In [22]:
bins

[(7, 18), (18, 29), (29, 40), (40, 51), (51, 62), (62, 73)]

In [23]:
find_bin(34,bins)  # index of the bin that contains 34, i.e. (29, 40) at position 2

2

**My twist**

In [None]:
# It is good as is; rewording would only make it more complex, and no one likes complex things :)

**Copied from source**

In [33]:
from collections import Counter

# 4-wide bins starting at 50; create_bins returns quantity + 1 of them,
# so the last one is (90, 94).
bins = create_bins(lower_bound=50, width=4, quantity=10)
print(bins)

weights_of_persons = [73.4, 69.3, 64.9, 75.6, 74.9, 80.3,
                      78.6, 84.1, 88.9, 90.3, 83.4, 69.3,
                      52.4, 58.3, 67.4, 74.0, 89.3, 63.4]

# Look up the bin index of every weight first ...
binned_weights = [find_bin(weight, bins) for weight in weights_of_persons]

# ... then report each weight together with its bin index and interval.
for value, bin_index in zip(weights_of_persons, binned_weights):
    print(value, bin_index, bins[bin_index])

# Tally how many weights landed in each bin index.
frequencies = Counter(binned_weights)
print(frequencies)

[(50, 54), (54, 58), (58, 62), (62, 66), (66, 70), (70, 74), (74, 78), (78, 82), (82, 86), (86, 90), (90, 94)]
73.4 5 (70, 74)
69.3 4 (66, 70)
64.9 3 (62, 66)
75.6 6 (74, 78)
74.9 6 (74, 78)
80.3 7 (78, 82)
78.6 7 (78, 82)
84.1 8 (82, 86)
88.9 9 (86, 90)
90.3 10 (90, 94)
83.4 8 (82, 86)
69.3 4 (66, 70)
52.4 0 (50, 54)
58.3 2 (58, 62)
67.4 4 (66, 70)
74.0 6 (74, 78)
89.3 9 (86, 90)
63.4 3 (62, 66)
Counter({4: 3, 6: 3, 3: 2, 7: 2, 8: 2, 9: 2, 5: 1, 10: 1, 0: 1, 2: 1})


**My twist**

In [None]:
# It is good as is; rewording would only make it more complex, and no one likes complex things :)

### Pandas approach

In [34]:
import pandas as pd

# Turn the (low, high) tuples into a pandas IntervalIndex. Without a
# `closed` argument the intervals are right-closed, (low, high].
bins2 = pd.IntervalIndex.from_tuples(bins)

In [35]:
bins2

IntervalIndex([(50, 54], (54, 58], (58, 62], (62, 66], (66, 70] ... (74, 78], (78, 82], (82, 86], (86, 90], (90, 94]],
              closed='right',
              dtype='interval[int64]')

In [36]:
# bins3= pd.IntervalIndex.from_arrays([1,3,4,5,5,6,6,7,5,4], closed='right')

In [37]:
# Assign every weight to the interval of bins2 that contains it.
categorical_object = pd.cut(weights_of_persons, bins2)

In [38]:
print(categorical_object)

[(70, 74], (66, 70], (62, 66], (74, 78], (74, 78], ..., (58, 62], (66, 70], (70, 74], (86, 90], (62, 66]]
Length: 18
Categories (11, interval[int64]): [(50, 54] < (54, 58] < (58, 62] < (62, 66] ... (78, 82] < (82, 86] < (86, 90] < (90, 94]]


In [39]:

# Rebuild the intervals as left-closed, [low, high), which is the same
# convention find_bin uses (low <= value < high). Note the effect on the
# weight 74.0: it now falls into [74, 78) instead of (70, 74].
bins2 = pd.IntervalIndex.from_tuples(bins, closed="left")
categorical_object = pd.cut(weights_of_persons, bins2)
print(categorical_object)

[[70, 74), [66, 70), [62, 66), [74, 78), [74, 78), ..., [58, 62), [66, 70), [74, 78), [86, 90), [62, 66)]
Length: 18
Categories (11, interval[int64]): [[50, 54) < [54, 58) < [58, 62) < [62, 66) ... [78, 82) < [82, 86) < [86, 90) < [90, 94)]


In [40]:
# An integer instead of explicit bins asks pd.cut for that many
# equal-width bins spanning the range of the data.
categorical_object = pd.cut(weights_of_persons, 18)

print(categorical_object)

[(71.35, 73.456], (69.244, 71.35], (62.928, 65.033], (75.561, 77.667], (73.456, 75.561], ..., (56.611, 58.717], (67.139, 69.244], (73.456, 75.561], (88.194, 90.3], (62.928, 65.033]]
Length: 18
Categories (18, interval[float64]): [(52.362, 54.506] < (54.506, 56.611] < (56.611, 58.717] < (58.717, 60.822] ... (81.878, 83.983] < (83.983, 86.089] < (86.089, 88.194] < (88.194, 90.3]]


In [43]:
# Five equal-width bins over the range of the data. The printed result
# shows the interval each value falls into, followed by the full list of
# categories. The categories are ordered (note the "<" between them), so
# the result is effectively an ordinal categorical.
categorical_object = pd.cut(weights_of_persons, 5)

print(categorical_object)

[(67.56, 75.14], (67.56, 75.14], (59.98, 67.56], (75.14, 82.72], (67.56, 75.14], ..., (52.362, 59.98], (59.98, 67.56], (67.56, 75.14], (82.72, 90.3], (59.98, 67.56]]
Length: 18
Categories (5, interval[float64]): [(52.362, 59.98] < (59.98, 67.56] < (67.56, 75.14] < (75.14, 82.72] < (82.72, 90.3]]


In [46]:
bins

[(50, 54),
 (54, 58),
 (58, 62),
 (62, 66),
 (66, 70),
 (70, 74),
 (74, 78),
 (78, 82),
 (82, 86),
 (86, 90),
 (90, 94)]

In [44]:
# Collect the lower edge of every bin.
sequence_of_scalars = [interval[0] for interval in bins]

In [45]:
sequence_of_scalars

[50, 54, 58, 62, 66, 70, 74, 78, 82, 86, 90]

In [47]:
sequence_of_scalars.append(bins[-1][1])  # add the upper edge of the last bin so every bin edge is listed

In [48]:
sequence_of_scalars

[50, 54, 58, 62, 66, 70, 74, 78, 82, 86, 90, 94]

In [49]:
# A plain list of bin edges works as well. right=False makes every
# interval left-closed, [low, high), so the result matches the one
# obtained from the closed="left" IntervalIndex.
categorical_object = pd.cut(weights_of_persons, 
                            sequence_of_scalars,
                            right=False)
print(categorical_object)

[[70, 74), [66, 70), [62, 66), [74, 78), [74, 78), ..., [58, 62), [66, 70), [74, 78), [86, 90), [62, 66)]
Length: 18
Categories (11, interval[int64]): [[50, 54) < [54, 58) < [58, 62) < [62, 66) ... [78, 82) < [82, 86) < [86, 90) < [90, 94)]


**Bin counts and value counts  - Copied from source**

In [50]:
# The top-level pd.value_counts() is deprecated (pandas 2.1+). Wrapping the
# categorical in a Series and calling Series.value_counts() gives the same
# result: every category (including empty ones) sorted by descending count.
pd.Series(categorical_object).value_counts()

[74, 78)    3
[66, 70)    3
[86, 90)    2
[82, 86)    2
[78, 82)    2
[62, 66)    2
[90, 94)    1
[70, 74)    1
[58, 62)    1
[50, 54)    1
[54, 58)    0
dtype: int64

In [69]:
# .codes holds, for each weight, the integer position of its category
# within categorical_object.categories.
labels = categorical_object.codes
labels

array([ 5,  4,  3,  6,  6,  7,  7,  8,  9, 10,  8,  4,  0,  2,  4,  6,  9,
        3], dtype=int8)

In [70]:
categorical_object

[[70, 74), [66, 70), [62, 66), [74, 78), [74, 78), ..., [58, 62), [66, 70), [74, 78), [86, 90), [62, 66)]
Length: 18
Categories (11, interval[int64]): [[50, 54) < [54, 58) < [58, 62) < [62, 66) ... [78, 82) < [82, 86) < [86, 90) < [90, 94)]

In [71]:
print(weights_of_persons)

[73.4, 69.3, 64.9, 75.6, 74.9, 80.3, 78.6, 84.1, 88.9, 90.3, 83.4, 69.3, 52.4, 58.3, 67.4, 74.0, 89.3, 63.4]


In [72]:
# The categories themselves: an IntervalIndex with one interval per bin.
categories = categorical_object.categories
categories

IntervalIndex([[50, 54), [54, 58), [58, 62), [62, 66), [66, 70) ... [74, 78), [78, 82), [82, 86), [86, 90), [90, 94)],
              closed='left',
              dtype='interval[int64]')

In [73]:
# Print each weight next to its category code and the matching interval.
for index, weight in enumerate(weights_of_persons):
    label_index = labels[index]
    print(weight, "\t", label_index, "\t", categories[label_index])

73.4 	 5 	 [70, 74)
69.3 	 4 	 [66, 70)
64.9 	 3 	 [62, 66)
75.6 	 6 	 [74, 78)
74.9 	 6 	 [74, 78)
80.3 	 7 	 [78, 82)
78.6 	 7 	 [78, 82)
84.1 	 8 	 [82, 86)
88.9 	 9 	 [86, 90)
90.3 	 10 	 [90, 94)
83.4 	 8 	 [82, 86)
69.3 	 4 	 [66, 70)
52.4 	 0 	 [50, 54)
58.3 	 2 	 [58, 62)
67.4 	 4 	 [66, 70)
74.0 	 6 	 [74, 78)
89.3 	 9 	 [86, 90)
63.4 	 3 	 [62, 66)


**Naming bins - Copied from source**
- "summa cum laude" requires a GPA above 3.9
- "magna cum laude" if the GPA is above 3.8
- "cum laude" if the GPA is 3.6 or above

In [74]:
# One label per interval, in ascending order of the bin edges used below.
degrees = ["none", "cum laude", "magna cum laude", "summa cum laude"]
student_results = [3.93, 3.24, 2.80, 2.83, 3.91, 3.698, 3.731, 3.25, 3.24, 3.82, 3.22]

# NOTE: pd.cut uses right-closed intervals by default, so the bins are
# (0, 3.6], (3.6, 3.8], (3.8, 3.9] and (3.9, 4.0]. A GPA of exactly 3.6 is
# therefore labelled "none", although the rule listed above grants
# "cum laude" for a GPA of 3.6 or above.
student_results_degrees = pd.cut(student_results, [0, 3.6, 3.8, 3.9, 4.0], labels=degrees)

In [75]:
student_results_degrees

['summa cum laude', 'none', 'none', 'none', 'summa cum laude', ..., 'cum laude', 'none', 'none', 'magna cum laude', 'none']
Length: 11
Categories (4, object): ['none' < 'cum laude' < 'magna cum laude' < 'summa cum laude']

In [76]:
# Category codes (one per student) and the label each code stands for.
labels = student_results_degrees.codes
categories = student_results_degrees.categories

# Print each GPA next to its category code and the resulting degree label.
for index, result in enumerate(student_results):
    label_index = labels[index]
    print(result, "\t", label_index, "\t", categories[label_index])

3.93 	 3 	 summa cum laude
3.24 	 0 	 none
2.8 	 0 	 none
2.83 	 0 	 none
3.91 	 3 	 summa cum laude
3.698 	 1 	 cum laude
3.731 	 1 	 cum laude
3.25 	 0 	 none
3.24 	 0 	 none
3.82 	 2 	 magna cum laude
3.22 	 0 	 none
