In [78]:
# !pip install frozendict

Collecting frozendict
  Downloading frozendict-2.3.4-cp39-cp39-win_amd64.whl (35 kB)
Installing collected packages: frozendict
Successfully installed frozendict-2.3.4


In [549]:
import pandas as pd
import numpy as np
from frozendict import frozendict
import random

In [428]:
df = pd.read_csv('breast_w/clean_breast_w.csv')

In [417]:
df.head()

Unnamed: 0,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhension,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [224]:
df['class'].value_counts()

2    444
4    239
Name: class, dtype: int64

In [4]:
variables = df.columns[:-1]
target_col = df.columns[-1]
class_labels = df['class'].unique()

In [250]:
target_col

'class'

In [5]:
class_labels

array([2, 4], dtype=int64)

# Rule Generator (contains duplicate conditions and no pruning)

In [117]:
def convert_to_rule_items(x, variables, class_label, min_sup, len_D):
#     print(x)
    condset = tuple([(variable, x[variable]) for variable in variables])
    
    class_count = {}
#     When label don't exist in group
    for label in class_label:
        if label not in x:
            class_count[label] = 0
        else:
            class_count[label] = x[label]
#     print(class_count)
#     class_count = x[class_label].to_dict()
    condsup_count = sum(class_count.values())
    # Removing non frequent itemset
    if (condsup_count/float(len_D))<min_sup:
#         print('Pruning')
        return {}
#     print({condset: [condsup_count, class_count]})
    return {condset: (condsup_count, class_count)}

In [118]:
# RG Line 2 genRules(F_k)

def gen_rules(F_k, len_D, min_conf):
    
    rules = []
    
    for condset, (condsup_count, class_count) in F_k.items():
        major_class = max(class_count, key=lambda x: class_count[x])
        
        conf = class_count[major_class] / float(condsup_count)
        
        if conf>min_conf:
            # Checking if the support are the same for all classes
            if set(class_count.values())==1:
                # Choose a random class
                major_class = random.choice(class_count.keys())
                
            rules.append((condset,major_class))
            
    return rules

In [119]:
def gen_itemset(F, df, target_col, variables):
    F_new = {}
    variables = set(variables)
    for condset in F.keys():
#         print(f'New Set: {condset}')
        temp = df.copy()
        cols = []
        for item in condset:
            value = item[1]
            col = item[0]
            temp = temp[temp[col]==value]
            
            # Ensure that subsequent itemset generation do not generate same itemset
            # This 1 line cuts down a lot of permutations of the same itemset
#             df = df[df[col]!=value]
            cols.append(col)
        remaining = variables - set(cols)
        # TODO: Currently there is both candidate generation
        for variable in remaining:
            # Candidate generation
            groupby = cols+[variable, target_col]
            
            # Calculating frequency
            group = temp.groupby(by=groupby).size().unstack(level=-1).reset_index()
            group = group.fillna(0)
            
            # Converting candidates in frequent itemset
            group.apply(lambda x: F_new.update(convert_to_rule_items(x, groupby[:-1], class_labels, min_sup, len_D)), axis=1)


    return F_new

In [120]:
%%time
# RG Line 1
# Format of F_k: 
# key = itemset
# value = [no. of appearance of itemset, dictionary with key as class_label and value as no. of class_label in itemset]

min_sup=0.2
min_conf=0.7
len_D = len(df)

k=1
F_1 = {}
for variable in variables:
    temp = df.groupby(by=[variable, target_col]).size().unstack(level=1).reset_index()
    temp = temp.fillna(0)
#     print(temp)
    temp.apply(lambda x: F_1.update(convert_to_rule_items(x, [variable], class_labels, min_sup, len_D)), axis=1)

CARS = {}
CARS[1] = gen_rules(F_1, len_D, min_conf)

F = F_1
print(F)
# print(F_1.keys())
# print(frequent_variables)
while len(F)!=0:
    k+=1
    # Includes generating candidate itemset and finding frequent itemset
    F = gen_itemset(F, df, target_col, variables)
    print(F)
    # Build rules from frequent itemset
    CARS[k] = gen_rules(F, len_D, min_conf)
    

{(('clump_thickness', 1.0),): (139.0, {2: 136.0, 4: 3.0}), (('uniformity_of_cell_size', 1.0),): (373.0, {2: 369.0, 4: 4.0}), (('uniformity_of_cell_shape', 1.0),): (346.0, {2: 344.0, 4: 2.0}), (('marginal_adhension', 1.0),): (393.0, {2: 363.0, 4: 30.0}), (('single_epithelial_cell_size', 2.0),): (376.0, {2: 355.0, 4: 21.0}), (('bare_nuclei', 1.0),): (402.0, {2: 387.0, 4: 15.0}), (('bland_chromatin', 1.0),): (150.0, {2: 148.0, 4: 2.0}), (('bland_chromatin', 2.0),): (160.0, {2: 153.0, 4: 7.0}), (('bland_chromatin', 3.0),): (161.0, {2: 125.0, 4: 36.0}), (('normal_nucleoli', 1.0),): (432.0, {2: 391.0, 4: 41.0}), (('mitoses', 1.0),): (563.0, {2: 431.0, 4: 132.0})}
{(('uniformity_of_cell_size', 1.0), ('normal_nucleoli', 1.0)): (345.0, {2: 344.0, 4: 1.0}), (('uniformity_of_cell_size', 1.0), ('uniformity_of_cell_shape', 1.0)): (324.0, {2: 322.0, 4: 2.0}), (('uniformity_of_cell_size', 1.0), ('bare_nuclei', 1.0)): (336.0, {2: 336.0, 4: 0.0}), (('uniformity_of_cell_size', 1.0), ('single_epithelial_

{(('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('mitoses', 1)): (311, {2: 311, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('marginal_adhension', 1)): (277, {2: 277, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('single_epithelial_cell_size', 2)): (270, {2: 270, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('uniformity_of_cell_shape', 1)): (274, {2: 274, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('marginal_adhension', 1), ('bare_nuclei', 1)): (277, {2: 277, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('marginal_adhension', 1), ('mitoses', 1)): (294, {2: 294, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('marginal_adhension', 1), ('single_epithelial_cell_size', 2)): (256, {2: 256, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('marginal_adhension', 1), ('u

{(('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('mitoses', 1), ('marginal_adhension', 1)): (272, {2: 272, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('mitoses', 1), ('single_epithelial_cell_size', 2)): (265, {2: 265, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('mitoses', 1), ('uniformity_of_cell_shape', 1)): (269, {2: 269, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('marginal_adhension', 1), ('mitoses', 1)): (272, {2: 272, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('marginal_adhension', 1), ('single_epithelial_cell_size', 2)): (238, {2: 238, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('marginal_adhension', 1), ('uniformity_of_cell_shape', 1)): (242, {2: 242, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('sing

{(('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('mitoses', 1), ('marginal_adhension', 1), ('uniformity_of_cell_shape', 1)): (237, {2: 237, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('mitoses', 1), ('marginal_adhension', 1), ('single_epithelial_cell_size', 2)): (233, {2: 233, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('mitoses', 1), ('single_epithelial_cell_size', 2), ('uniformity_of_cell_shape', 1)): (224, {2: 224, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('mitoses', 1), ('single_epithelial_cell_size', 2), ('marginal_adhension', 1)): (233, {2: 233, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('mitoses', 1), ('uniformity_of_cell_shape', 1), ('single_epithelial_cell_size', 2)): (224, {2: 224, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('mitoses', 1

{(('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('mitoses', 1), ('marginal_adhension', 1), ('uniformity_of_cell_shape', 1), ('single_epithelial_cell_size', 2)): (199, {2: 199, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('mitoses', 1), ('marginal_adhension', 1), ('single_epithelial_cell_size', 2), ('uniformity_of_cell_shape', 1)): (199, {2: 199, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('mitoses', 1), ('single_epithelial_cell_size', 2), ('uniformity_of_cell_shape', 1), ('marginal_adhension', 1)): (199, {2: 199, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('mitoses', 1), ('single_epithelial_cell_size', 2), ('marginal_adhension', 1), ('uniformity_of_cell_shape', 1)): (199, {2: 199, 4: 0}), (('uniformity_of_cell_size', 1), ('normal_nucleoli', 1), ('bare_nuclei', 1), ('mitoses', 1), ('uniformity_of_cell_shape', 1), ('single_epithelial_c

{}
CPU times: total: 2min 34s
Wall time: 2min 34s


In [121]:
CARS

{1: [((('clump_thickness', 1.0),), 2),
  ((('uniformity_of_cell_size', 1.0),), 2),
  ((('uniformity_of_cell_shape', 1.0),), 2),
  ((('marginal_adhension', 1.0),), 2),
  ((('single_epithelial_cell_size', 2.0),), 2),
  ((('bare_nuclei', 1.0),), 2),
  ((('bland_chromatin', 1.0),), 2),
  ((('bland_chromatin', 2.0),), 2),
  ((('bland_chromatin', 3.0),), 2),
  ((('normal_nucleoli', 1.0),), 2),
  ((('mitoses', 1.0),), 2)],
 2: [((('uniformity_of_cell_size', 1.0), ('normal_nucleoli', 1.0)), 2),
  ((('uniformity_of_cell_size', 1.0), ('uniformity_of_cell_shape', 1.0)), 2),
  ((('uniformity_of_cell_size', 1.0), ('bare_nuclei', 1.0)), 2),
  ((('uniformity_of_cell_size', 1.0), ('single_epithelial_cell_size', 2.0)),
   2),
  ((('uniformity_of_cell_size', 1.0), ('marginal_adhension', 1.0)), 2),
  ((('uniformity_of_cell_size', 1.0), ('mitoses', 1.0)), 2),
  ((('uniformity_of_cell_shape', 1.0), ('normal_nucleoli', 1.0)), 2),
  ((('uniformity_of_cell_shape', 1.0), ('bare_nuclei', 1.0)), 2),
  ((('unifor

# Rule Generator (no duplicate set of conditions & includes pruning)

Weights can be included to balance classes

In [237]:
def convert_to_rule_items(x, variables, class_label, min_sup, len_D):
#     print(x)
    condset = {}
    for variable in variables:
        condset[variable] = x[variable]
        
    condset = frozendict(condset)
    class_count = {}
#     When label don't exist in group
    for label in class_label:
        if label not in x:
            class_count[label] = 0
        else:
            class_count[label] = x[label]
#     print(class_count)
#     class_count = x[class_label].to_dict()
    major_class = max(class_count, key=lambda x: class_count[x])
    
    # Removing non frequent itemset
    if (class_count[major_class]/float(len_D))<min_sup:
        return {}
#     print({condset: [condsup_count, class_count]})
    return {condset: (major_class, class_count)}

In [238]:
# RG Line 2 genRules(F_k)

def gen_rules(F_k, len_D, min_conf):
    
    rules = {}
    
    for condset, (major_class, class_count) in F_k.items():
        
        conf = class_count[major_class] / float(sum(class_count.values()))
        
        if conf>min_conf:
            # Checking if the support are the same for all classes
            if set(class_count.values())==1:
                # Choose a random class
                major_class = random.choice(class_count.keys())
                
            rules[condset] = (major_class, class_count)
            
    return rules

In [239]:
def gen_itemset(F, df, target_col, itemsets):
    F_new = {}
    itemsets = set(itemsets)
    
    condset = list(F.keys())
    
#     for cond in condset
    for i in range(len(condset)):
        cond = condset[i]
#         print(f'New Set: {condset}')
        temp = df.copy()
        cols = []
        for item in cond.items():
            value = item[1]
            col = item[0]
            temp = temp[temp[col]==value]
            cols.append(col)

#         for itemset in condset
        for j in range(i+1, len(condset)):
            itemset = condset[j]
            #Line 20 to 25 is the Apriori principle, where we merge 2 frequent superset into a candidate key
            # Checking if 2 itemsets differ only by 2 conditions
            itemset_keys = itemset.keys()
            cond_keys = cond.keys()
            if len(set(itemset_keys) - set(cond_keys))==1 and len(set(itemset.items())^set(cond.items()))==2:
                variable = (set(itemset_keys) - set(cond_keys)).pop()
#                 print(itemset)
                value = itemset.get(variable)
                    
                # Candidate generation
                temp_2 = temp[temp[variable]==value]
                groupby = cols+[variable, target_col]

                # Calculating frequency
                # TODO: Can be optimize further as we already filtered out the candidate, hence only need groupby class but this will affect convert_to_rule_items
                group = temp_2.groupby(by=groupby).size().unstack(level=-1).reset_index()
                group = group.fillna(0)
                # Converting candidates in frequent itemset
                group.apply(lambda x: F_new.update(convert_to_rule_items(x, groupby[:-1], class_labels, min_sup, len_D)), axis=1)


    return F_new

In [240]:
def prune_rules(r, r_):
    # r and r_ is a python dict
    to_remove = []
    for superset in r.keys():
        # superset is a frozendict
        superset_set = set(superset.items())
        superset_value = r[superset]
        
        for subset in r_.keys():
            # subset is a frozendict
            subset_set = set(subset.items())
            
            if subset_set<superset_set:
                subset_value = r_[subset]
                
                superset_class = superset_value[0]
                superset_error = superset_value[1][superset_class] / sum(superset_value[1].values())
                
                subset_class = subset_value[0]
                subset_error = subset_value[1][subset_class] / sum(subset_value[1].values())
                
                if superset_error>=subset_error:
                    to_remove.append(superset)
                    # break to as the superset has already been removed, no further testing is needed
                    break
#     print(r)
#     print(to_remove)
    for key in to_remove:
        r.pop(key)
    return r

In [241]:
%%time
# RG Line 1
# Format of F_k: 
# key = itemset
# value = [no. of appearance of itemset, dictionary with key as class_label and value as no. of class_label in itemset]

min_sup=0.2
min_conf=0.7
len_D = len(df)

k=1
F_1 = {}
for variable in variables:
    temp = df.groupby(by=[variable, target_col]).size().unstack(level=1).reset_index()
    temp = temp.fillna(0)
#     print(temp)
    temp.apply(lambda x: F_1.update(convert_to_rule_items(x, [variable], class_labels, min_sup, len_D)), axis=1)

CARS_1 = {}
CARS_1[1] = gen_rules(F_1, len_D, min_conf)

F = {}
F[1] = F_1
# print(F_1.keys())
frequent_variables = set([x for x in F_1.keys()])
# print(frequent_variables)
while len(F[k])!=0:
    k+=1
    # Includes generating candidate itemset and finding frequent itemset
    F[k] = gen_itemset(F[k-1], df, target_col, frequent_variables)
#     print(F)
    frequent_variables = set([x for x in F[k].keys()])
    
    # Build rules from frequent itemset
#     CARS_1.update(gen_rules(F, len_D, min_conf))
    CARS_1[k] = gen_rules(F[k], len_D, min_conf)
#     print(len(CARS_1[k]))
    # Pruning rules
    CARS_1[k] = prune_rules(CARS_1[k], CARS_1[k-1])
#     print(len(CARS_1[k]))

CPU times: total: 2.19 s
Wall time: 2.19 s


In [443]:
F

{1: {frozendict.frozendict({'uniformity_of_cell_size': 1.0}): (2,
   {2: 369.0, 4: 4.0}),
  frozendict.frozendict({'uniformity_of_cell_shape': 1.0}): (2,
   {2: 344.0, 4: 2.0}),
  frozendict.frozendict({'marginal_adhension': 1.0}): (2, {2: 363.0, 4: 30.0}),
  frozendict.frozendict({'single_epithelial_cell_size': 2.0}): (2,
   {2: 355.0, 4: 21.0}),
  frozendict.frozendict({'bare_nuclei': 1.0}): (2, {2: 387.0, 4: 15.0}),
  frozendict.frozendict({'bland_chromatin': 1.0}): (2, {2: 148.0, 4: 2.0}),
  frozendict.frozendict({'bland_chromatin': 2.0}): (2, {2: 153.0, 4: 7.0}),
  frozendict.frozendict({'normal_nucleoli': 1.0}): (2, {2: 391.0, 4: 41.0}),
  frozendict.frozendict({'mitoses': 1.0}): (2, {2: 431.0, 4: 132.0})},
 2: {frozendict.frozendict({'uniformity_of_cell_size': 1, 'uniformity_of_cell_shape': 1}): (2,
   {2: 322, 4: 2}),
  frozendict.frozendict({'uniformity_of_cell_size': 1, 'marginal_adhension': 1}): (2,
   {2: 320, 4: 2}),
  frozendict.frozendict({'uniformity_of_cell_size': 1, '

In [442]:
CARS_1

{1: {frozendict.frozendict({'uniformity_of_cell_size': 1.0}): (2,
   {2: 369.0, 4: 4.0}),
  frozendict.frozendict({'uniformity_of_cell_shape': 1.0}): (2,
   {2: 344.0, 4: 2.0}),
  frozendict.frozendict({'marginal_adhension': 1.0}): (2, {2: 363.0, 4: 30.0}),
  frozendict.frozendict({'single_epithelial_cell_size': 2.0}): (2,
   {2: 355.0, 4: 21.0}),
  frozendict.frozendict({'bare_nuclei': 1.0}): (2, {2: 387.0, 4: 15.0}),
  frozendict.frozendict({'bland_chromatin': 1.0}): (2, {2: 148.0, 4: 2.0}),
  frozendict.frozendict({'bland_chromatin': 2.0}): (2, {2: 153.0, 4: 7.0}),
  frozendict.frozendict({'normal_nucleoli': 1.0}): (2, {2: 391.0, 4: 41.0}),
  frozendict.frozendict({'mitoses': 1.0}): (2, {2: 431.0, 4: 132.0})},
 2: {},
 3: {frozendict.frozendict({'uniformity_of_cell_size': 1, 'uniformity_of_cell_shape': 1, 'marginal_adhension': 1}): (2,
   {2: 282, 4: 1}),
  frozendict.frozendict({'uniformity_of_cell_size': 1, 'uniformity_of_cell_shape': 1, 'single_epithelial_cell_size': 2}): (2,
   

# Classifier

In [285]:
def sort_rules(CARS, len_D):
    sorted_CARS = []
    for values in CARS.values():
        sorted_CARS.extend(list(values.items()))
#     print(temp)
    sorted_CARS = sorted(sorted_CARS, key=lambda x: (x[1][1][x[1][0]] / sum(x[1][1].values()), x[1][1][x[1][0]]/len_D, len(x[0])), reverse=True)
#     print(temp)
    return sorted_CARS

In [361]:
def build_classifier(df, CARS, len_D, target_col):
    sorted_CARS = sort_rules(CARS, len_D)
    temp_df = df
    rules = []
    for CARS in sorted_CARS:
        cond, result = CARS
        cond_df = temp_df.loc[(temp_df[list(cond)] == pd.Series(cond)).all(axis=1)]
        correct = cond_df[cond_df[target_col]==result[0]]

        if len(correct)!=0:
            temp_df = temp_df.drop(index=cond_df.index) 
            default_class = temp_df[target_col].value_counts().idxmax()
            total_error = (len(cond_df) - len(correct)) + len(temp_df[temp_df[target_col]!=default_class])
            error = {'default': len(temp_df[temp_df[target_col]!=default_class]), 'class':(len(cond_df) - len(correct))}
            rules.append([CARS, default_class, total_error, error])
            
    lowest_error_id = np.argmin([x[2] for x in rules])
    pruned_rules = rules[:lowest_error_id+1]
    return pruned_rules

In [363]:
pruned_rules = build_classifier(df, CARS_1, len_D, target_col)

In [353]:
def predict(df, rules):
    temp_df = df.copy()
    # Setting all to default class
    df['prediction'] = rules[-1][1]
    for rule in rules:
        cond, default_class, _, _ = rule
        cond, prediction = cond
        # Filtering rows that fulfil rule condition
        cond_df = temp_df.loc[(temp_df[list(cond)] == pd.Series(cond)).all(axis=1)]
        # Setting prediction
        df.loc[cond_df.index, 'prediction'] = prediction[0]
        # Removing rows that has been predicted
        temp_df = temp_df.drop(index=cond_df.index)
    
    return df

# Class based Implementation

In [452]:
# WIP
class Rule:
    
    def __init__(self, itemset, class_count, len_D):
        self.itemset = itemset
        self.class_count = class_count
        
        if len(set(class_count.values()))==1:
            self.result = random.choice(class_count.keys())
        else:
            self.result = max(class_count, key=lambda x: class_count[x])
        
        self.conf = float(class_count[self.result])/sum(class_count.values())
        self.sup = class_count[self.result] / float(len_D)

In [453]:
# WIP
class Rules:
    
    def __init__(self):
        self.rules = {}
        
    
    def get_rule(self, key):
        length = len(key)
        
        if length not in self.rules:
            return None
        else:
            return self.rules[length].get(key,None)
        
    def get_rules_by_length(self, length):
        if length not in self.rules:
            return []
        else:
            return self.rules[length].values()
    
    def get_itemset_by_length(self, length):
        if length not in self.rules:
            return set()
        else:
            return set(self.rules[length].keys())
    
    def get_itemset_rules_by_length(self, length):
        if length not in self.rules:
            return {}
        else:
            return self.rules[length]
    
    def add(self, rule):
        if rule==None:
            return
        
        length = len(rule.itemset)
        
        if length not in self.rules:
            self.rules[length] = rule
        else:
            self.rules[length][rule.itemset] = rule
    
    def remove(self, itemset):
        length = len(itemset)
        
        self.rules[length].pop(itemset)

In [488]:
class RuleGenerator:
    def __init__(self, min_sup=0.2, min_conf=0.6):
        self.min_sup = min_sup
        self.min_conf = min_conf
        self.CARS = None
    
    def generate_rules(self, df, target_col):
        len_D = len(df)
        class_labels = df[target_col].unique()
        variables = list(df.columns)
        variables.remove(target_col)
        k=1
        F_1 = {}
        for variable in variables:
            temp = df.groupby(by=[variable, target_col]).size().unstack(level=1).reset_index()
            temp = temp.fillna(0)
        #     print(temp)
            temp.apply(lambda x: F_1.update(self.convert_to_rule_items(x, [variable], class_labels, len_D)), axis=1)

        self.CARS = {}
        self.CARS[1] = self.gen_rules(F_1, len_D)

        F = {}
        F[1] = F_1
        # print(F_1.keys())
        frequent_variables = set([x for x in F_1.keys()])
        # print(frequent_variables)
        while len(F[k])!=0:
            k+=1
            # Includes generating candidate itemset and finding frequent itemset
            F[k] = self.gen_itemset(F[k-1], df, target_col, frequent_variables)
            frequent_variables = set([x for x in F[k].keys()])

            # Build rules from frequent itemset
            self.CARS[k] = self.gen_rules(F[k], len_D)
            # Pruning rules
            self.CARS[k] = self.prune_rules(self.CARS[k], self.CARS[k-1])
        
    def convert_to_rule_items(self, x, variables, class_label, len_D):
        condset = {}
        for variable in variables:
            condset[variable] = x[variable]

        condset = frozendict(condset)
        class_count = {}
        # When label don't exist in group
        for label in class_label:
            if label not in x:
                class_count[label] = 0
            else:
                class_count[label] = x[label]
        major_class = max(class_count, key=lambda x: class_count[x])

        # Removing non frequent itemset
        if (class_count[major_class]/float(len_D))<self.min_sup:
            return {}

        return {condset: (major_class, class_count)}

    def gen_rules(self, F_k, len_D):

        rules = {}

        for condset, (major_class, class_count) in F_k.items():

            conf = class_count[major_class] / float(sum(class_count.values()))

            if conf>self.min_conf:
                # Checking if the support are the same for all classes
                if set(class_count.values())==1:
                    # Choose a random class
                    major_class = random.choice(class_count.keys())

                rules[condset] = (major_class, class_count)

        return rules
    
    
    def gen_itemset(self, F, df, target_col, itemsets):
        F_new = {}
        itemsets = set(itemsets)

        condset = list(F.keys())

        for i in range(len(condset)):
            cond = condset[i]
            temp = df.copy()
            cols = []
            for item in cond.items():
                value = item[1]
                col = item[0]
                temp = temp[temp[col]==value]
                cols.append(col)

            for j in range(i+1, len(condset)):
                itemset = condset[j]
                #Line 20 to 25 is the Apriori principle, where we merge 2 frequent superset into a candidate key
                # Checking if 2 itemsets differ only by 2 conditions
                itemset_keys = itemset.keys()
                cond_keys = cond.keys()
                if len(set(itemset_keys) - set(cond_keys))==1 and len(set(itemset.items())^set(cond.items()))==2:
                    variable = (set(itemset_keys) - set(cond_keys)).pop()
                    value = itemset.get(variable)

                    # Candidate generation
                    temp_2 = temp[temp[variable]==value]
                    groupby = cols+[variable, target_col]

                    # Calculating frequency
                    # TODO: Can be optimize further as we already filtered out the candidate, hence only need groupby class but this will affect convert_to_rule_items
                    group = temp_2.groupby(by=groupby).size().unstack(level=-1).reset_index()
                    group = group.fillna(0)
                    # Converting candidates in frequent itemset
                    group.apply(lambda x: F_new.update(convert_to_rule_items(x, groupby[:-1], class_labels, min_sup, len_D)), axis=1)


        return F_new
    
    
    
    def prune_rules(self, r, r_):
        # r and r_ is a python dict
        to_remove = []
        for superset in r.keys():
            # superset is a frozendict
            superset_set = set(superset.items())
            superset_value = r[superset]

            for subset in r_.keys():
                # subset is a frozendict
                subset_set = set(subset.items())

                if subset_set<superset_set:
                    subset_value = r_[subset]

                    superset_class = superset_value[0]
                    superset_error = superset_value[1][superset_class] / sum(superset_value[1].values())

                    subset_class = subset_value[0]
                    subset_error = subset_value[1][subset_class] / sum(subset_value[1].values())

                    if superset_error>=subset_error:
                        to_remove.append(superset)
                        # break to as the superset has already been removed, no further testing is needed
                        break

        for key in to_remove:
            r.pop(key)
        return r

In [489]:
rule_gen = RuleGenerator()

In [490]:
%%time
rule_gen.generate_rules(df, 'class')

CPU times: total: 2.17 s
Wall time: 2.17 s


In [558]:
class Classifier:
    
    def __init__(self, rule_builder):
        self.rule_builder = rule_builder
        self.rules = None
        self.sorted_CARS = None
        
    def sort_rules(self, len_D):
        sorted_CARS = []
        for values in self.rule_builder.CARS.values():
            sorted_CARS.extend(list(values.items()))
        self.sorted_CARS = sorted(sorted_CARS, key=lambda x: (x[1][1][x[1][0]] / sum(x[1][1].values()), x[1][1][x[1][0]]/len_D, len(x[0])), reverse=True)
    
    def build_classifier(self, df, target_col):
        len_D = len(df)
        self.sort_rules(len_D)
        temp_df = df
        rules = []
        for CARS in self.sorted_CARS:
            cond, result = CARS
            cond_df = temp_df.loc[(temp_df[list(cond)] == pd.Series(cond)).all(axis=1)]
            correct = cond_df[cond_df[target_col]==result[0]]

            if len(correct)!=0:
                temp_df = temp_df.drop(index=cond_df.index) 
                if len(temp_df)==0:
                    default_class = random.choice(df[target_col].unique())
                else:
                    default_class = temp_df[target_col].value_counts().idxmax()
                total_error = (len(cond_df) - len(correct)) + len(temp_df[temp_df[target_col]!=default_class])
                error = {'default': len(temp_df[temp_df[target_col]!=default_class]), 'class':(len(cond_df) - len(correct))}
                rules.append([CARS, default_class, total_error, error])

        lowest_error_id = np.argmin([x[2] for x in rules])
        pruned_rules = rules[:lowest_error_id+1]
        self.rules = pruned_rules
        
    def predict(self, df):
        temp_df = df.copy()
        ans = df.copy()
        # Setting all to default class
        ans['prediction'] = rules[-1][1]
        for rule in self.rules:
            cond, default_class, _, _ = rule
            cond, prediction = cond
            # Filtering rows that fulfil rule condition
            cond_df = temp_df.loc[(temp_df[list(cond)] == pd.Series(cond)).all(axis=1)]
            # Setting prediction
            ans.loc[cond_df.index, 'prediction'] = prediction[0]
            # Removing rows that has been predicted
            temp_df = temp_df.drop(index=cond_df.index)

        return ans
        

In [492]:
classifier = Classifier(rule_gen)

In [493]:
classifier.build_classifier(df, 'class')

In [495]:
ans = classifier.predict(df)

In [496]:
ans['correct'] = (ans['class']==ans['prediction'])
ans['correct'].value_counts()

True     651
False     32
Name: correct, dtype: int64

# Weighted RuleGenerator

In [537]:
class RuleGenerator:
    def __init__(self, min_sup=0.2, min_conf=0.6, weighted=False):
        self.min_sup = min_sup
        self.min_conf = min_conf
        self.CARS = None
        self.weighted = weighted
    
    def generate_rules(self, df, target_col):
        len_D = len(df)
        class_labels = df[target_col].unique()
        variables = list(df.columns)
        variables.remove(target_col)
        k=1
        F_1 = {}
        for variable in variables:
            temp = df.groupby(by=[variable, target_col]).size().unstack(level=1).reset_index()
            temp = temp.fillna(0)
        #     print(temp)
            temp.apply(lambda x: F_1.update(self.convert_to_rule_items(x, [variable], class_labels, len_D)), axis=1)

        self.CARS = {}
        self.CARS[1] = self.gen_rules(F_1, len_D)

        F = {}
        F[1] = F_1
        # print(F_1.keys())
        frequent_variables = set([x for x in F_1.keys()])
        # print(frequent_variables)
        while len(F[k])!=0:
            k+=1
            # Includes generating candidate itemset and finding frequent itemset
            F[k] = self.gen_itemset(F[k-1], df, target_col, frequent_variables)
            frequent_variables = set([x for x in F[k].keys()])

            # Build rules from frequent itemset
            self.CARS[k] = self.gen_rules(F[k], len_D)
            # Pruning rules
            self.CARS[k] = self.prune_rules(self.CARS[k], self.CARS[k-1])
        
    def convert_to_rule_items(self, x, variables, class_label, len_D):
        condset = {}
        for variable in variables:
            condset[variable] = x[variable]

        condset = frozendict(condset)
        class_count = {}
        # When label don't exist in group
        for label in class_label:
            if label not in x:
                class_count[label] = 0
            else:
                class_count[label] = x[label]
        major_class = max(class_count, key=lambda x: class_count[x])

        # Removing non frequent itemset
#         if (class_count[major_class]/float(len_D))<self.min_sup:
#             return {}

        if self.weighted:
            for label in class_label:
                if (class_count[label] / float(len_D)) >= self.min_sup[label]:
                    return {condset: (major_class, class_count)}
        else:
            if (class_count[major_class]/float(len_D))>=self.min_sup:
                return {condset: (major_class, class_count)}

        return {}

    def gen_rules(self, F_k, len_D):

        rules = {}

        for condset, (major_class, class_count) in F_k.items():

            conf = class_count[major_class] / float(sum(class_count.values()))

            if conf>self.min_conf:
                # Checking if the support are the same for all classes
                if set(class_count.values())==1:
                    # Choose a random class
                    major_class = random.choice(class_count.keys())

                rules[condset] = (major_class, class_count)

        return rules
    
    
    def gen_itemset(self, F, df, target_col, itemsets):
        F_new = {}
        itemsets = set(itemsets)

        condset = list(F.keys())

        for i in range(len(condset)):
            cond = condset[i]
            temp = df.copy()
            cols = []
            for item in cond.items():
                value = item[1]
                col = item[0]
                temp = temp[temp[col]==value]
                cols.append(col)

            for j in range(i+1, len(condset)):
                itemset = condset[j]
                #Line 20 to 25 is the Apriori principle, where we merge 2 frequent superset into a candidate key
                # Checking if 2 itemsets differ only by 2 conditions
                itemset_keys = itemset.keys()
                cond_keys = cond.keys()
                if len(set(itemset_keys) - set(cond_keys))==1 and len(set(itemset.items())^set(cond.items()))==2:
                    variable = (set(itemset_keys) - set(cond_keys)).pop()
                    value = itemset.get(variable)

                    # Candidate generation
                    temp_2 = temp[temp[variable]==value]
                    groupby = cols+[variable, target_col]

                    # Calculating frequency
                    # TODO: Can be optimize further as we already filtered out the candidate, hence only need groupby class but this will affect convert_to_rule_items
                    group = temp_2.groupby(by=groupby).size().unstack(level=-1).reset_index()
                    group = group.fillna(0)
                    # Converting candidates in frequent itemset
                    group.apply(lambda x: F_new.update(convert_to_rule_items(x, groupby[:-1], class_labels, min_sup, len_D)), axis=1)


        return F_new
    
    
    
    def prune_rules(self, r, r_):
        # r and r_ is a python dict
        to_remove = []
        for superset in r.keys():
            # superset is a frozendict
            superset_set = set(superset.items())
            superset_value = r[superset]

            for subset in r_.keys():
                # subset is a frozendict
                subset_set = set(subset.items())

                if subset_set<superset_set:
                    subset_value = r_[subset]

                    superset_class = superset_value[0]
                    superset_error = superset_value[1][superset_class] / sum(superset_value[1].values())

                    subset_class = subset_value[0]
                    subset_error = subset_value[1][subset_class] / sum(subset_value[1].values())

                    if superset_error>=subset_error:
                        to_remove.append(superset)
                        # break to as the superset has already been removed, no further testing is needed
                        break

        for key in to_remove:
            r.pop(key)
        return r

In [565]:
rule_gen = RuleGenerator(min_sup={2:0.35, 4:0.04},min_conf=0.7, weighted=True)

In [566]:
%%time
rule_gen.generate_rules(df, 'class')

CPU times: total: 3 s
Wall time: 2.99 s


In [567]:
classifier = Classifier(rule_gen)

In [568]:
classifier.build_classifier(df, 'class')

In [569]:
ans = classifier.predict(df)

In [570]:
ans['correct'] = (ans['class']==ans['prediction'])
ans['correct'].value_counts()

True     659
False     24
Name: correct, dtype: int64

In [571]:
classifier.rules

[[(frozendict.frozendict({'uniformity_of_cell_size': 1, 'bare_nuclei': 1, 'mitoses': 1}),
   (2, {2: 331, 4: 0})),
  4,
  113,
  {'default': 113, 'class': 0}],
 [(frozendict.frozendict({'uniformity_of_cell_size': 1, 'bare_nuclei': 1, 'normal_nucleoli': 1}),
   (2, {2: 316, 4: 0})),
  4,
  108,
  {'default': 108, 'class': 0}],
 [(frozendict.frozendict({'single_epithelial_cell_size': 2, 'bare_nuclei': 1, 'mitoses': 1}),
   (2, {2: 313, 4: 0})),
  4,
  75,
  {'default': 75, 'class': 0}],
 [(frozendict.frozendict({'uniformity_of_cell_shape': 1, 'bare_nuclei': 1, 'mitoses': 1}),
   (2, {2: 307, 4: 0})),
  4,
  70,
  {'default': 70, 'class': 0}],
 [(frozendict.frozendict({'marginal_adhension': 1, 'bare_nuclei': 1, 'normal_nucleoli': 1}),
   (2, {2: 305, 4: 0})),
  4,
  65,
  {'default': 65, 'class': 0}],
 [(frozendict.frozendict({'uniformity_of_cell_size': 1, 'marginal_adhension': 1, 'normal_nucleoli': 1}),
   (2, {2: 299, 4: 0})),
  4,
  43,
  {'default': 43, 'class': 0}],
 [(frozendict.fro