In [1]:
from sklearn import datasets
import pandas as pd

In [2]:
# Load the Iris dataset bundled with scikit-learn; later cells read its
# .data (feature matrix) and .target (class labels) attributes.
iris = datasets.load_iris()

In [3]:
# Wrap the raw feature matrix in a DataFrame with short column names
# (presumably sepal length/width and petal length/width -- confirm against
# iris.feature_names).
df = pd.DataFrame(iris.data, columns=["sl", "sw", "pl", "pw"])

In [4]:
def abc(k, *val):
    """Binary threshold indicator.

    Returns 0 when ``k`` is strictly below the first extra positional
    argument and 1 in every other case. Only ``val[0]`` is read; any
    further extra arguments are ignored.
    """
    threshold = val[0]
    return 0 if k < threshold else 1

In [5]:
# Demo: Series.apply forwards `args` as extra positional arguments, so each
# value v in the column is evaluated as abc(v, 5).
df["sl"].apply(abc, args=(5,)).head()

0    1
1    0
2    0
3    0
4    1
Name: sl, dtype: int64

In [6]:
def label(val, *boundaries):
    """Map a number onto one of the bucket names 'a', 'b', 'c' or 'd'.

    The first three entries of ``boundaries`` act as exclusive upper limits
    and are checked in order: below the first gives 'a', below the second
    gives 'b', below the third gives 'c'. A value that passes none of the
    three checks is labelled 'd'.
    """
    for position, bucket in enumerate(('a', 'b', 'c')):
        # Boundaries are looked up one at a time, only when that check is reached.
        if val < boundaries[position]:
            return bucket
    return 'd'

def toLabel(df, old_feature_name):
    """Bucket one column of ``df`` into the labels produced by ``label``.

    Three cut points are derived from the column itself: its mean, the
    midpoint between its minimum and the mean, and the midpoint between its
    maximum and the mean. Every value in the column is then passed to
    ``label`` together with those cut points (in ascending order), and the
    resulting Series is returned.
    """
    column = df[old_feature_name]
    centre = column.mean()
    lower_mid = (column.min() + centre) / 2
    upper_mid = (column.max() + centre) / 2
    cut_points = (lower_mid, centre, upper_mid)
    return column.apply(label, args=cut_points)

In [7]:
# Add a "<name>_labeled" column for every raw feature, in the same order as
# the raw columns, instead of repeating the toLabel call once per feature.
for feature_name in ["sl", "sw", "pl", "pw"]:
    df[feature_name + "_labeled"] = toLabel(df, feature_name)
df.head()

Unnamed: 0,sl,sw,pl,pw,sl_labeled,sw_labeled,pl_labeled,pw_labeled
0,5.1,3.5,1.4,0.2,b,c,a,a
1,4.9,3.0,1.4,0.2,a,b,a,a
2,4.7,3.2,1.3,0.2,a,c,a,a
3,4.6,3.1,1.5,0.2,a,c,a,a
4,5.0,3.6,1.4,0.2,a,c,a,a


In [8]:
# Remove the four raw measurement columns so only the *_labeled ones remain.
df.drop(columns=['sl', 'sw', 'pl', 'pw'], inplace=True)

In [9]:
# List the distinct labels that occur in the sl_labeled column.
set(df['sl_labeled'])

{'a', 'b', 'c', 'd'}

In [10]:
# Append the class labels as the last column; fit() reads the final column
# of the frame as the target and every column before it as a feature.
df["output"] = iris.target

In [11]:
# Preview the first rows of the frame that will be passed to fit().
df.head()

Unnamed: 0,sl_labeled,sw_labeled,pl_labeled,pw_labeled,output
0,b,c,a,a,0
1,a,b,a,a,0
2,a,c,a,a,0
3,a,c,a,a,0
4,a,c,a,a,0


In [12]:
def fit(data):
    """Tabulate per-class frequencies from a labelled frame.

    The last column of ``data`` is treated as the class column and every
    other column as a feature. The result is a nested dict keyed by class
    value. Each entry stores the number of rows in that class under
    ``"total_count"`` and, for every feature name, a dict mapping each value
    seen within that class to the number of rows of the class carrying it.
    Values that never occur within a class get no entry for that class.
    """
    target_name = data.columns[-1]
    feature_names = data.columns[:-1]

    def tally(rows):
        # Summarise the rows belonging to a single class.
        summary = {"total_count": len(rows)}
        for name in feature_names:
            summary[name] = {
                seen: len(rows[rows[name] == seen])
                for seen in set(rows[name])
            }
        return summary

    return {
        cls: tally(data[data[target_name] == cls])
        for cls in set(data[target_name])
    }

In [13]:
# Build and display the per-class count table for the labelled frame.
fit(df)

{0: {'total_count': 50,
  'sl_labeled': {'a': 28, 'b': 22},
  'sw_labeled': {'a': 1, 'b': 7, 'd': 10, 'c': 32},
  'pl_labeled': {'a': 50},
  'pw_labeled': {'a': 50}},
 1: {'total_count': 50,
  'sl_labeled': {'a': 3, 'b': 21, 'd': 2, 'c': 24},
  'sw_labeled': {'a': 13, 'b': 29, 'c': 8},
  'pl_labeled': {'b': 7, 'c': 43},
  'pw_labeled': {'b': 10, 'c': 40}},
 2: {'total_count': 50,
  'sl_labeled': {'a': 1, 'b': 5, 'd': 15, 'c': 29},
  'sw_labeled': {'a': 5, 'b': 28, 'd': 2, 'c': 15},
  'pl_labeled': {'d': 30, 'c': 20},
  'pw_labeled': {'d': 34, 'c': 16}}}