In [78]:
# !pip install frozendict

Collecting frozendict
  Downloading frozendict-2.3.4-cp39-cp39-win_amd64.whl (35 kB)
Installing collected packages: frozendict
Successfully installed frozendict-2.3.4


In [9]:
import pandas as pd
import numpy as np
from frozendict import frozendict
import random
from sklearn.model_selection import train_test_split

In [10]:
df = pd.read_csv('data/breast_w/clean_breast_w.csv', index_col=0)

In [11]:
df.head()

Unnamed: 0,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhension,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [12]:
variables = df.columns[:-1]
target_col = df.columns[-1]
class_labels = df['class'].unique()

In [13]:
target_col

'class'

In [14]:
class_labels

array([2, 4], dtype=int64)

# Rule Generator (contains duplicate conditions and no pruning)

In [15]:
def convert_to_rule_items(x, variables, class_label, min_sup, len_D):
#     print(x)
    condset = tuple([(variable, x[variable]) for variable in variables])
    
    class_count = {}
#     When label don't exist in group
    for label in class_label:
        if label not in x:
            class_count[label] = 0
        else:
            class_count[label] = x[label]
#     print(class_count)
#     class_count = x[class_label].to_dict()
    condsup_count = sum(class_count.values())
    # Removing non frequent itemset
    if (condsup_count/float(len_D))<min_sup:
#         print('Pruning')
        return {}
#     print({condset: [condsup_count, class_count]})
    return {condset: (condsup_count, class_count)}

In [16]:
# RG Line 2 genRules(F_k)

def gen_rules(F_k, len_D, min_conf):
    
    rules = []
    
    for condset, (condsup_count, class_count) in F_k.items():
        major_class = max(class_count, key=lambda x: class_count[x])
        
        conf = class_count[major_class] / float(condsup_count)
        
        if conf>min_conf:
            # Checking if the support are the same for all classes
            if set(class_count.values())==1:
                # Choose a random class
                major_class = random.choice(class_count.keys())
                
            rules.append((condset,major_class))
            
    return rules

In [17]:
def gen_itemset(F, df, target_col, variables):
    F_new = {}
    variables = set(variables)
    for condset in F.keys():
#         print(f'New Set: {condset}')
        temp = df.copy()
        cols = []
        for item in condset:
            value = item[1]
            col = item[0]
            temp = temp[temp[col]==value]
            
            # Ensure that subsequent itemset generation do not generate same itemset
            # This 1 line cuts down a lot of permutations of the same itemset
#             df = df[df[col]!=value]
            cols.append(col)
        remaining = variables - set(cols)
        # TODO: Currently there is both candidate generation
        for variable in remaining:
            # Candidate generation
            groupby = cols+[variable, target_col]
            
            # Calculating frequency
            group = temp.groupby(by=groupby).size().unstack(level=-1).reset_index()
            group = group.fillna(0)
            
            # Converting candidates in frequent itemset
            group.apply(lambda x: F_new.update(convert_to_rule_items(x, groupby[:-1], class_labels, min_sup, len_D)), axis=1)


    return F_new

In [18]:
%%time
# RG Line 1
# Format of F_k: 
# key = itemset
# value = [no. of appearance of itemset, dictionary with key as class_label and value as no. of class_label in itemset]

min_sup=0.2
min_conf=0.7
len_D = len(df)

k=1
F_1 = {}
for variable in variables:
    temp = df.groupby(by=[variable, target_col]).size().unstack(level=1).reset_index()
    temp = temp.fillna(0)
#     print(temp)
    temp.apply(lambda x: F_1.update(convert_to_rule_items(x, [variable], class_labels, min_sup, len_D)), axis=1)

CARS = {}
CARS[1] = gen_rules(F_1, len_D, min_conf)

F = F_1
print(F)
# print(F_1.keys())
# print(frequent_variables)
while len(F)!=0:
    k+=1
    # Includes generating candidate itemset and finding frequent itemset
    F = gen_itemset(F, df, target_col, variables)
    print(F)
    # Build rules from frequent itemset
    CARS[k] = gen_rules(F, len_D, min_conf)
    

{(('clump_thickness', 1.0),): (139.0, {2: 136.0, 4: 3.0}), (('uniformity_of_cell_size', 1.0),): (373.0, {2: 369.0, 4: 4.0}), (('uniformity_of_cell_shape', 1.0),): (346.0, {2: 344.0, 4: 2.0}), (('marginal_adhension', 1.0),): (393.0, {2: 363.0, 4: 30.0}), (('single_epithelial_cell_size', 2.0),): (376.0, {2: 355.0, 4: 21.0}), (('bare_nuclei', 1.0),): (402.0, {2: 387.0, 4: 15.0}), (('bland_chromatin', 1.0),): (150.0, {2: 148.0, 4: 2.0}), (('bland_chromatin', 2.0),): (160.0, {2: 153.0, 4: 7.0}), (('bland_chromatin', 3.0),): (161.0, {2: 125.0, 4: 36.0}), (('normal_nucleoli', 1.0),): (432.0, {2: 391.0, 4: 41.0}), (('mitoses', 1.0),): (563.0, {2: 431.0, 4: 132.0})}
{(('uniformity_of_cell_size', 1.0), ('mitoses', 1.0)): (366.0, {2: 363.0, 4: 3.0}), (('uniformity_of_cell_size', 1.0), ('bare_nuclei', 1.0)): (336.0, {2: 336.0, 4: 0.0}), (('uniformity_of_cell_size', 1.0), ('marginal_adhension', 1.0)): (322.0, {2: 320.0, 4: 2.0}), (('uniformity_of_cell_size', 1.0), ('uniformity_of_cell_shape', 1.0))

{(('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('marginal_adhension', 1)): (289, {2: 289, 4: 0}), (('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('uniformity_of_cell_shape', 1)): (288, {2: 288, 4: 0}), (('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('normal_nucleoli', 1)): (311, {2: 311, 4: 0}), (('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('single_epithelial_cell_size', 2)): (280, {2: 280, 4: 0}), (('uniformity_of_cell_size', 1.0), ('mitoses', 1.0), ('marginal_adhension', 1.0), ('bare_nuclei', 1.0)): (289.0, {2: 289.0, 4: 0.0}), (('uniformity_of_cell_size', 1.0), ('mitoses', 1.0), ('marginal_adhension', 1.0), ('uniformity_of_cell_shape', 1.0)): (278.0, {2: 277.0, 4: 1.0}), (('uniformity_of_cell_size', 1.0), ('mitoses', 1.0), ('marginal_adhension', 1.0), ('normal_nucleoli', 1.0)): (294.0, {2: 294.0, 4: 0.0}), (('uniformity_of_cell_size', 1.0), ('mitoses', 1.0), ('marginal_adhension', 1.0), ('sing

{(('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('marginal_adhension', 1), ('uniformity_of_cell_shape', 1)): (253, {2: 253, 4: 0}), (('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('marginal_adhension', 1), ('normal_nucleoli', 1)): (272, {2: 272, 4: 0}), (('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('marginal_adhension', 1), ('single_epithelial_cell_size', 2)): (248, {2: 248, 4: 0}), (('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('uniformity_of_cell_shape', 1), ('marginal_adhension', 1)): (253, {2: 253, 4: 0}), (('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('uniformity_of_cell_shape', 1), ('normal_nucleoli', 1)): (269, {2: 269, 4: 0}), (('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('uniformity_of_cell_shape', 1), ('single_epithelial_cell_size', 2)): (238, {2: 238, 4: 0}), (('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('normal_nuc

{(('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('marginal_adhension', 1), ('uniformity_of_cell_shape', 1), ('single_epithelial_cell_size', 2)): (213, {2: 213, 4: 0}), (('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('marginal_adhension', 1), ('uniformity_of_cell_shape', 1), ('normal_nucleoli', 1)): (237, {2: 237, 4: 0}), (('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('marginal_adhension', 1), ('normal_nucleoli', 1), ('single_epithelial_cell_size', 2)): (233, {2: 233, 4: 0}), (('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('marginal_adhension', 1), ('normal_nucleoli', 1), ('uniformity_of_cell_shape', 1)): (237, {2: 237, 4: 0}), (('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('marginal_adhension', 1), ('single_epithelial_cell_size', 2), ('uniformity_of_cell_shape', 1)): (213, {2: 213, 4: 0}), (('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('marginal_adhensi

{(('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('marginal_adhension', 1), ('uniformity_of_cell_shape', 1), ('single_epithelial_cell_size', 2), ('normal_nucleoli', 1)): (199, {2: 199, 4: 0}), (('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('marginal_adhension', 1), ('uniformity_of_cell_shape', 1), ('normal_nucleoli', 1), ('single_epithelial_cell_size', 2)): (199, {2: 199, 4: 0}), (('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('marginal_adhension', 1), ('normal_nucleoli', 1), ('single_epithelial_cell_size', 2), ('uniformity_of_cell_shape', 1)): (199, {2: 199, 4: 0}), (('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('marginal_adhension', 1), ('normal_nucleoli', 1), ('uniformity_of_cell_shape', 1), ('single_epithelial_cell_size', 2)): (199, {2: 199, 4: 0}), (('uniformity_of_cell_size', 1), ('mitoses', 1), ('bare_nuclei', 1), ('marginal_adhension', 1), ('single_epithelial_cell_size', 2), ('uniformity_of

{}
CPU times: total: 2min 34s
Wall time: 2min 34s


In [19]:
CARS

{1: [((('clump_thickness', 1.0),), 2),
  ((('uniformity_of_cell_size', 1.0),), 2),
  ((('uniformity_of_cell_shape', 1.0),), 2),
  ((('marginal_adhension', 1.0),), 2),
  ((('single_epithelial_cell_size', 2.0),), 2),
  ((('bare_nuclei', 1.0),), 2),
  ((('bland_chromatin', 1.0),), 2),
  ((('bland_chromatin', 2.0),), 2),
  ((('bland_chromatin', 3.0),), 2),
  ((('normal_nucleoli', 1.0),), 2),
  ((('mitoses', 1.0),), 2)],
 2: [((('uniformity_of_cell_size', 1.0), ('mitoses', 1.0)), 2),
  ((('uniformity_of_cell_size', 1.0), ('bare_nuclei', 1.0)), 2),
  ((('uniformity_of_cell_size', 1.0), ('marginal_adhension', 1.0)), 2),
  ((('uniformity_of_cell_size', 1.0), ('uniformity_of_cell_shape', 1.0)), 2),
  ((('uniformity_of_cell_size', 1.0), ('normal_nucleoli', 1.0)), 2),
  ((('uniformity_of_cell_size', 1.0), ('single_epithelial_cell_size', 2.0)),
   2),
  ((('uniformity_of_cell_shape', 1.0), ('mitoses', 1.0)), 2),
  ((('uniformity_of_cell_shape', 1.0), ('bare_nuclei', 1.0)), 2),
  ((('uniformity_of_

# Rule Generator (no duplicate set of conditions & includes pruning)

Weights can be included to balance classes

In [20]:
def convert_to_rule_items(x, variables, class_label, min_sup, len_D):
#     print(x)
    condset = {}
    for variable in variables:
        condset[variable] = x[variable]
        
    condset = frozendict(condset)
    class_count = {}
#     When label don't exist in group
    for label in class_label:
        if label not in x:
            class_count[label] = 0
        else:
            class_count[label] = x[label]
#     print(class_count)
#     class_count = x[class_label].to_dict()
    major_class = max(class_count, key=lambda x: class_count[x])
    
    # Removing non frequent itemset
    if (class_count[major_class]/float(len_D))<min_sup:
        return {}
#     print({condset: [condsup_count, class_count]})
    return {condset: (major_class, class_count)}

In [21]:
# RG Line 2 genRules(F_k)

def gen_rules(F_k, len_D, min_conf):
    
    rules = {}
    
    for condset, (major_class, class_count) in F_k.items():
        
        conf = class_count[major_class] / float(sum(class_count.values()))
        
        if conf>min_conf:
            # Checking if the support are the same for all classes
            if set(class_count.values())==1:
                # Choose a random class
                major_class = random.choice(class_count.keys())
                
            rules[condset] = (major_class, class_count)
            
    return rules

In [22]:
def gen_itemset(F, df, target_col, itemsets):
    F_new = {}
    itemsets = set(itemsets)
    
    condset = list(F.keys())
    
#     for cond in condset
    for i in range(len(condset)):
        cond = condset[i]
#         print(f'New Set: {condset}')
        temp = df.copy()
        cols = []
        for item in cond.items():
            value = item[1]
            col = item[0]
            temp = temp[temp[col]==value]
            cols.append(col)

#         for itemset in condset
        for j in range(i+1, len(condset)):
            itemset = condset[j]
            #Line 20 to 25 is the Apriori principle, where we merge 2 frequent superset into a candidate key
            # Checking if 2 itemsets differ only by 2 conditions
            itemset_keys = itemset.keys()
            cond_keys = cond.keys()
            if len(set(itemset_keys) - set(cond_keys))==1 and len(set(itemset.items())^set(cond.items()))==2:
                variable = (set(itemset_keys) - set(cond_keys)).pop()
#                 print(itemset)
                value = itemset.get(variable)
                    
                # Candidate generation
                temp_2 = temp[temp[variable]==value]
                groupby = cols+[variable, target_col]

                # Calculating frequency
                # TODO: Can be optimize further as we already filtered out the candidate, hence only need groupby class but this will affect convert_to_rule_items
                group = temp_2.groupby(by=groupby).size().unstack(level=-1).reset_index()
                group = group.fillna(0)
                # Converting candidates in frequent itemset
                group.apply(lambda x: F_new.update(convert_to_rule_items(x, groupby[:-1], class_labels, min_sup, len_D)), axis=1)


    return F_new

In [23]:
def prune_rules(r, r_):
    # r and r_ is a python dict
    to_remove = []
    for superset in r.keys():
        # superset is a frozendict
        superset_set = set(superset.items())
        superset_value = r[superset]
        
        for subset in r_.keys():
            # subset is a frozendict
            subset_set = set(subset.items())
            
            if subset_set<superset_set:
                subset_value = r_[subset]
                
                superset_class = superset_value[0]
                superset_error = superset_value[1][superset_class] / sum(superset_value[1].values())
                
                subset_class = subset_value[0]
                subset_error = subset_value[1][subset_class] / sum(subset_value[1].values())
                
                if superset_error>=subset_error:
                    to_remove.append(superset)
                    # break to as the superset has already been removed, no further testing is needed
                    break
#     print(r)
#     print(to_remove)
    for key in to_remove:
        r.pop(key)
    return r

In [24]:
%%time
# RG Line 1
# Format of F_k: 
# key = itemset
# value = [no. of appearance of itemset, dictionary with key as class_label and value as no. of class_label in itemset]

min_sup=0.2
min_conf=0.7
len_D = len(df)

k=1
F_1 = {}
for variable in variables:
    temp = df.groupby(by=[variable, target_col]).size().unstack(level=1).reset_index()
    temp = temp.fillna(0)
#     print(temp)
    temp.apply(lambda x: F_1.update(convert_to_rule_items(x, [variable], class_labels, min_sup, len_D)), axis=1)

CARS_1 = {}
CARS_1[1] = gen_rules(F_1, len_D, min_conf)

F = {}
F[1] = F_1
# print(F_1.keys())
frequent_variables = set([x for x in F_1.keys()])
# print(frequent_variables)
while len(F[k])!=0:
    k+=1
    # Includes generating candidate itemset and finding frequent itemset
    F[k] = gen_itemset(F[k-1], df, target_col, frequent_variables)
#     print(F)
    frequent_variables = set([x for x in F[k].keys()])
    
    # Build rules from frequent itemset
#     CARS_1.update(gen_rules(F, len_D, min_conf))
    CARS_1[k] = gen_rules(F[k], len_D, min_conf)
#     print(len(CARS_1[k]))
    # Pruning rules
    CARS_1[k] = prune_rules(CARS_1[k], CARS_1[k-1])
#     print(len(CARS_1[k]))

CPU times: total: 2.2 s
Wall time: 2.2 s


In [25]:
F

{1: {frozendict.frozendict({'uniformity_of_cell_size': 1.0}): (2,
   {2: 369.0, 4: 4.0}),
  frozendict.frozendict({'uniformity_of_cell_shape': 1.0}): (2,
   {2: 344.0, 4: 2.0}),
  frozendict.frozendict({'marginal_adhension': 1.0}): (2, {2: 363.0, 4: 30.0}),
  frozendict.frozendict({'single_epithelial_cell_size': 2.0}): (2,
   {2: 355.0, 4: 21.0}),
  frozendict.frozendict({'bare_nuclei': 1.0}): (2, {2: 387.0, 4: 15.0}),
  frozendict.frozendict({'bland_chromatin': 1.0}): (2, {2: 148.0, 4: 2.0}),
  frozendict.frozendict({'bland_chromatin': 2.0}): (2, {2: 153.0, 4: 7.0}),
  frozendict.frozendict({'normal_nucleoli': 1.0}): (2, {2: 391.0, 4: 41.0}),
  frozendict.frozendict({'mitoses': 1.0}): (2, {2: 431.0, 4: 132.0})},
 2: {frozendict.frozendict({'uniformity_of_cell_size': 1, 'uniformity_of_cell_shape': 1}): (2,
   {2: 322, 4: 2}),
  frozendict.frozendict({'uniformity_of_cell_size': 1, 'marginal_adhension': 1}): (2,
   {2: 320, 4: 2}),
  frozendict.frozendict({'uniformity_of_cell_size': 1, '

In [26]:
CARS_1

{1: {frozendict.frozendict({'uniformity_of_cell_size': 1.0}): (2,
   {2: 369.0, 4: 4.0}),
  frozendict.frozendict({'uniformity_of_cell_shape': 1.0}): (2,
   {2: 344.0, 4: 2.0}),
  frozendict.frozendict({'marginal_adhension': 1.0}): (2, {2: 363.0, 4: 30.0}),
  frozendict.frozendict({'single_epithelial_cell_size': 2.0}): (2,
   {2: 355.0, 4: 21.0}),
  frozendict.frozendict({'bare_nuclei': 1.0}): (2, {2: 387.0, 4: 15.0}),
  frozendict.frozendict({'bland_chromatin': 1.0}): (2, {2: 148.0, 4: 2.0}),
  frozendict.frozendict({'bland_chromatin': 2.0}): (2, {2: 153.0, 4: 7.0}),
  frozendict.frozendict({'normal_nucleoli': 1.0}): (2, {2: 391.0, 4: 41.0}),
  frozendict.frozendict({'mitoses': 1.0}): (2, {2: 431.0, 4: 132.0})},
 2: {},
 3: {frozendict.frozendict({'uniformity_of_cell_size': 1, 'uniformity_of_cell_shape': 1, 'marginal_adhension': 1}): (2,
   {2: 282, 4: 1}),
  frozendict.frozendict({'uniformity_of_cell_size': 1, 'uniformity_of_cell_shape': 1, 'single_epithelial_cell_size': 2}): (2,
   

# Classifier

In [27]:
def sort_rules(CARS, len_D):
    sorted_CARS = []
    for values in CARS.values():
        sorted_CARS.extend(list(values.items()))
#     print(temp)
    sorted_CARS = sorted(sorted_CARS, key=lambda x: (x[1][1][x[1][0]] / sum(x[1][1].values()), x[1][1][x[1][0]]/len_D, len(x[0])), reverse=True)
#     print(temp)
    return sorted_CARS

In [28]:
def build_classifier(df, CARS, len_D, target_col):
    sorted_CARS = sort_rules(CARS, len_D)
    temp_df = df
    rules = []
    for CARS in sorted_CARS:
        cond, result = CARS
        cond_df = temp_df.loc[(temp_df[list(cond)] == pd.Series(cond)).all(axis=1)]
        correct = cond_df[cond_df[target_col]==result[0]]

        if len(correct)!=0:
            temp_df = temp_df.drop(index=cond_df.index) 
            default_class = temp_df[target_col].value_counts().idxmax()
            total_error = (len(cond_df) - len(correct)) + len(temp_df[temp_df[target_col]!=default_class])
            error = {'default': len(temp_df[temp_df[target_col]!=default_class]), 'class':(len(cond_df) - len(correct))}
            rules.append([CARS, default_class, total_error, error])
            
    lowest_error_id = np.argmin([x[2] for x in rules])
    pruned_rules = rules[:lowest_error_id+1]
    return pruned_rules

In [29]:
pruned_rules = build_classifier(df, CARS_1, len_D, target_col)

In [30]:
def predict(df, rules):
    temp_df = df.copy()
    # Setting all to default class
    df['prediction'] = rules[-1][1]
    for rule in rules:
        cond, default_class, _, _ = rule
        cond, prediction = cond
        # Filtering rows that fulfil rule condition
        cond_df = temp_df.loc[(temp_df[list(cond)] == pd.Series(cond)).all(axis=1)]
        # Setting prediction
        df.loc[cond_df.index, 'prediction'] = prediction[0]
        # Removing rows that has been predicted
        temp_df = temp_df.drop(index=cond_df.index)
    
    return df

# Class based Implementation

In [31]:
# WIP
class Rule:
    
    def __init__(self, itemset, class_count, len_D):
        self.itemset = itemset
        self.class_count = class_count
        
        if len(set(class_count.values()))==1:
            self.result = random.choice(class_count.keys())
        else:
            self.result = max(class_count, key=lambda x: class_count[x])
        
        self.conf = float(class_count[self.result])/sum(class_count.values())
        self.sup = class_count[self.result] / float(len_D)

In [32]:
# WIP
class Rules:
    
    def __init__(self):
        self.rules = {}
        
    
    def get_rule(self, key):
        length = len(key)
        
        if length not in self.rules:
            return None
        else:
            return self.rules[length].get(key,None)
        
    def get_rules_by_length(self, length):
        if length not in self.rules:
            return []
        else:
            return self.rules[length].values()
    
    def get_itemset_by_length(self, length):
        if length not in self.rules:
            return set()
        else:
            return set(self.rules[length].keys())
    
    def get_itemset_rules_by_length(self, length):
        if length not in self.rules:
            return {}
        else:
            return self.rules[length]
    
    def add(self, rule):
        if rule==None:
            return
        
        length = len(rule.itemset)
        
        if length not in self.rules:
            self.rules[length] = rule
        else:
            self.rules[length][rule.itemset] = rule
    
    def remove(self, itemset):
        length = len(itemset)
        
        self.rules[length].pop(itemset)

In [66]:
class RuleGenerator:
    def __init__(self, min_sup=0.2, min_conf=0.6):
        self.min_sup = min_sup
        self.min_conf = min_conf
        self.CARS = None
    
    def generate_rules(self, df, target_col):
        len_D = len(df)
        class_labels = df[target_col].unique()
        variables = list(df.columns)
        variables.remove(target_col)
        k=1
        F_1 = {}
        for variable in variables:
            temp = df.groupby(by=[variable, target_col]).size().unstack(level=1).reset_index()
            temp = temp.fillna(0)
        #     print(temp)
            temp.apply(lambda x: F_1.update(self.convert_to_rule_items(x, [variable], class_labels, len_D)), axis=1)

        self.CARS = {}
        self.CARS[1] = self.gen_rules(F_1, len_D)

        F = {}
        F[1] = F_1
        # print(F_1.keys())
        frequent_variables = set([x for x in F_1.keys()])
        # print(frequent_variables)
        while len(F[k])!=0:
            k+=1
            # Includes generating candidate itemset and finding frequent itemset
            F[k] = self.gen_itemset(F[k-1], df, target_col, frequent_variables)
            frequent_variables = set([x for x in F[k].keys()])

            # Build rules from frequent itemset
            self.CARS[k] = self.gen_rules(F[k], len_D)
            # Pruning rules
            self.CARS[k] = self.prune_rules(self.CARS[k], self.CARS[k-1])
        
    def convert_to_rule_items(self, x, variables, class_label, len_D):
        condset = {}
        for variable in variables:
            condset[variable] = x[variable]

        condset = frozendict(condset)
        class_count = {}
        # When label don't exist in group
        for label in class_label:
            if label not in x:
                class_count[label] = 0
            else:
                class_count[label] = x[label]
        major_class = max(class_count, key=lambda x: class_count[x])

        # Removing non frequent itemset
        if (class_count[major_class]/float(len_D))<self.min_sup:
            return {}

        return {condset: (major_class, class_count)}

    def gen_rules(self, F_k, len_D):

        rules = {}

        for condset, (major_class, class_count) in F_k.items():

            conf = class_count[major_class] / float(sum(class_count.values()))

            if conf>self.min_conf:
                # Checking if the support are the same for all classes
                if set(class_count.values())==1:
                    # Choose a random class
                    major_class = random.choice(class_count.keys())

                rules[condset] = (major_class, class_count)

        return rules
    
    
    def gen_itemset(self, F, df, target_col, itemsets):
        F_new = {}
        itemsets = set(itemsets)

        condset = list(F.keys())

        for i in range(len(condset)):
            cond = condset[i]
            temp = df.copy()
            cols = []
            for item in cond.items():
                value = item[1]
                col = item[0]
                temp = temp[temp[col]==value]
                cols.append(col)

            for j in range(i+1, len(condset)):
                itemset = condset[j]
                #Line 20 to 25 is the Apriori principle, where we merge 2 frequent superset into a candidate key
                # Checking if 2 itemsets differ only by 2 conditions
                itemset_keys = itemset.keys()
                cond_keys = cond.keys()
                if len(set(itemset_keys) - set(cond_keys))==1 and len(set(itemset.items())^set(cond.items()))==2:
                    variable = (set(itemset_keys) - set(cond_keys)).pop()
                    value = itemset.get(variable)

                    # Candidate generation
                    temp_2 = temp[temp[variable]==value]
                    groupby = cols+[variable, target_col]

                    # Calculating frequency
                    # TODO: Can be optimize further as we already filtered out the candidate, hence only need groupby class but this will affect convert_to_rule_items
                    group = temp_2.groupby(by=groupby).size().unstack(level=-1).reset_index()
                    group = group.fillna(0)
                    # Converting candidates in frequent itemset
                    group.apply(lambda x: F_new.update(convert_to_rule_items(x, groupby[:-1], class_labels, min_sup, len_D)), axis=1)


        return F_new
    
    
    
    def prune_rules(self, r, r_):
        # r and r_ is a python dict
        to_remove = []
        for superset in r.keys():
            # superset is a frozendict
            superset_set = set(superset.items())
            superset_value = r[superset]

            for subset in r_.keys():
                # subset is a frozendict
                subset_set = set(subset.items())

                if subset_set<superset_set:
                    subset_value = r_[subset]

                    superset_class = superset_value[0]
                    superset_error = superset_value[1][superset_class] / sum(superset_value[1].values())

                    subset_class = subset_value[0]
                    subset_error = subset_value[1][subset_class] / sum(subset_value[1].values())

                    if superset_error>=subset_error:
                        to_remove.append(superset)
                        # break to as the superset has already been removed, no further testing is needed
                        break

        for key in to_remove:
            r.pop(key)
        return r

In [67]:
rule_gen = RuleGenerator()

In [68]:
%%time
rule_gen.generate_rules(df, 'type')

CPU times: total: 609 ms
Wall time: 608 ms


In [177]:
import pandas as pd
import numpy as np
from frozendict import frozendict
import random


class Classifier:

    def __init__(self, rule_builder):
        self.rule_builder = rule_builder
        self.rules = None
        self.sorted_CARS = None

    def sort_rules(self, len_D):
        sorted_CARS = []
        for values in self.rule_builder.CARS.values():
            sorted_CARS.extend(list(values.items()))
        self.sorted_CARS = sorted(sorted_CARS, key=lambda x: (x[1][1][x[1][0]] / sum(x[1][1].values()), x[1][1][x[1][0]] / len_D, len(x[0])), reverse=True)

    def build_classifier(self, df, target_col):
        len_D = len(df)
        self.sort_rules(len_D)
        temp_df = df
        rules = []

        if len(self.sorted_CARS)==0:
            print('No CARS from rule generator!')
            self.rules = []
            return
        for CARS in self.sorted_CARS:
            cond, result = CARS
            cond_df = temp_df.loc[(temp_df[list(cond)] == pd.Series(cond)).all(axis=1)]
            correct = cond_df[cond_df[target_col] == result[0]]

            if len(correct) != 0:
                temp_df = temp_df.drop(index=cond_df.index)
                if len(temp_df) == 0:
                    default_class = random.choice(df[target_col].unique())
                else:
                    default_class = temp_df[target_col].value_counts().idxmax()
                total_error = (len(cond_df) - len(correct)) + len(temp_df[temp_df[target_col] != default_class])
                error = {'default': len(temp_df[temp_df[target_col] != default_class]),
                         'class': (len(cond_df) - len(correct))}
                rules.append([CARS, default_class, total_error, error])

        lowest_error_id = np.argmin([x[2] for x in rules])
        pruned_rules = rules[:lowest_error_id + 1]
        self.rules = pruned_rules

    def predict(self, df):
        temp_df = df.copy()
        ans = df.copy()
        # Setting all to default class
        if len(self.rules)==0:
            print('No rules!')
            return None
        ans['prediction'] = self.rules[-1][1]
        for rule in self.rules:
            cond, default_class, _, _ = rule
            cond, prediction = cond
            # Filtering rows that fulfil rule condition
            cond_df = temp_df.loc[(temp_df[list(cond)] == pd.Series(cond)).all(axis=1)]
            # Setting prediction
            ans.loc[cond_df.index, 'prediction'] = prediction[0]
            # Removing rows that has been predicted
            temp_df = temp_df.drop(index=cond_df.index)

        return ans


In [70]:
classifier = Classifier(rule_gen)

In [72]:
classifier.build_classifier(df, 'type')

In [73]:
ans = classifier.predict(df)

In [74]:
ans['correct'] = (ans['type']==ans['prediction'])
ans['correct'].value_counts()

True     73
False    27
Name: correct, dtype: int64

# Weighted RuleGenerator

In [363]:
import pandas as pd
import numpy as np
from frozendict import frozendict
import random

class RuleGenerator:
    def __init__(self, min_sup=0.2, min_conf=0.6, weighted=False):
        self.min_sup = min_sup
        self.min_conf = min_conf
        self.CARS = None
        self.weighted = weighted

    def generate_rules(self, df, target_col):
        len_D = len(df)
        class_labels = df[target_col].unique()
        variables = list(df.columns)
        variables.remove(target_col)
        k = 1
        F_1 = {}
        for variable in variables:
            temp = df.groupby(by=[variable, target_col]).size().unstack(level=1).reset_index()
            temp = temp.fillna(0)
            #     print(temp)
            temp.apply(lambda x: F_1.update(self.convert_to_rule_items(x, [variable], class_labels, len_D)), axis=1)

        self.CARS = {}
        self.CARS[1] = self.gen_rules(F_1, len_D)

        F = {}
        F[1] = F_1
        # print(F_1.keys())
        frequent_variables = set([x for x in F_1.keys()])
        # print(frequent_variables)
        while len(F[k]) != 0:
            k += 1
            print(k)
            # Includes generating candidate itemset and finding frequent itemset
            F[k] = self.gen_itemset(F[k - 1], df, target_col, frequent_variables)
            frequent_variables = set([x for x in F[k].keys()])

            # Build rules from frequent itemset
            self.CARS[k] = self.gen_rules(F[k], len_D)
            # Pruning rules
            self.CARS[k] = self.prune_rules(self.CARS[k], self.CARS[k - 1])

    def convert_to_rule_items(self, x, variables, class_label, len_D):
        condset = {}
        for variable in variables:
            condset[variable] = x[variable]

        condset = frozendict(condset)
        class_count = {}
        # When label don't exist in group
        for label in class_label:
            if label not in x:
                class_count[label] = 0
            else:
                class_count[label] = x[label]
        major_class = max(class_count, key=lambda x: class_count[x])

        # Removing non frequent itemset
        #         if (class_count[major_class]/float(len_D))<self.min_sup:
        #             return {}

        if self.weighted:
            for label in class_label:
                if (class_count[label] / float(len_D)) >= self.min_sup[label]:
                    return {condset: (major_class, class_count)}
        else:
            if (class_count[major_class] / float(len_D)) >= self.min_sup:
                return {condset: (major_class, class_count)}

        return {}

    def gen_rules(self, F_k, len_D):

        rules = {}

        for condset, (major_class, class_count) in F_k.items():

            conf = class_count[major_class] / float(sum(class_count.values()))

            if conf > self.min_conf:
                # Checking if the support are the same for all classes
                if set(class_count.values()) == 1:
                    # Choose a random class
                    major_class = random.choice(class_count.keys())

                rules[condset] = (major_class, class_count)

        return rules

    def gen_itemset(self, F, df, target_col, itemsets):
        F_new = {}
        itemsets = set(itemsets)

        condset = list(F.keys())
        len_D = len(df)
        class_labels = df[target_col].unique()
        for i in range(len(condset)):
            cond = condset[i]
            temp = df.copy()
            cols = []
            for item in cond.items():
                value = item[1]
                col = item[0]
                temp = temp[temp[col] == value]
                cols.append(col)
            checked = []
            for j in range(i + 1, len(condset)):
                itemset = condset[j]
                # Apriori principle, where we merge 2 frequent superset into a candidate key
                # Checking if 2 itemsets differ only by 2 conditions
                itemset_keys = itemset.keys()
                cond_keys = cond.keys()
                if len(set(itemset_keys) - set(cond_keys)) == 1 and len(set(itemset.items()) ^ set(cond.items())) == 2:
                    variable = (set(itemset_keys) - set(cond_keys)).pop()
                    if variable not in checked:
                        checked.append(variable)
                        value = itemset.get(variable)

                        # Candidate generation
                        groupby = cols + [variable, target_col]

                        # Calculating frequency
                        group = temp.groupby(by=groupby).size().unstack(level=-1).reset_index()
                        group = group.fillna(0)
                        # Converting candidates in frequent itemset
                        group.apply(
                            lambda x: F_new.update(self.convert_to_rule_items(x, groupby[:-1], class_labels, len_D)),
                            axis=1)

        return F_new

    def prune_rules(self, r, r_):
        # r and r_ is a python dict
        to_remove = []
        for superset in r.keys():
            # superset is a frozendict
            superset_set = set(superset.items())
            superset_value = r[superset]

            for subset in r_.keys():
                # subset is a frozendict
                subset_set = set(subset.items())

                if subset_set < superset_set:
                    subset_value = r_[subset]

                    superset_class = superset_value[0]
                    superset_error = superset_value[1][superset_class] / sum(superset_value[1].values())

                    subset_class = subset_value[0]
                    subset_error = subset_value[1][subset_class] / sum(subset_value[1].values())

                    if superset_error >= subset_error:
                        to_remove.append(superset)
                        # break to as the superset has already been removed, no further testing is needed
                        break
        # print(len(r))
        for key in to_remove:
            r.pop(key)
        # print(len(r))
        return r


In [181]:
rule_gen = RuleGenerator(min_sup=0.01, min_conf=0.5)

In [182]:
%%time
rule_gen.generate_rules(df, 'Attrition_Flag')

1
2
3
4
5
6
7
8
9
CPU times: total: 35min 44s
Wall time: 35min 45s


In [183]:
classifier = Classifier(rule_gen)

In [184]:
classifier.build_classifier(df, 'Attrition_Flag')

In [185]:
ans = classifier.predict(df)

In [186]:
ans['correct'] = (ans['Attrition_Flag']==ans['prediction'])
ans['correct'].value_counts()

True     8555
False    1572
Name: correct, dtype: int64

In [187]:
len(ans[ans['correct']==True])/len(ans)

0.8447714031796189

In [190]:
total = 0
for i in rule_gen.CARS.values():
    total += len(i)
total

18868

# Adaboost

In [350]:
df = pd.read_csv('./data/bank_churners/clean_bank_churners.csv')
train_df, test_df = train_test_split(df, test_size=100, random_state=42)

In [353]:
class Adaboost:
    
    def __init__(self, T=50):
        self.T = T
        self.models = []
        self.stumps = {}
    
    def train(self, train_df, target_col, min_sup=0.01, min_conf=0.5):
        weights = np.ones(len(train_df))/len(train_df)
        train_variables = list(train_df.columns)
        train_variables.remove(target_col)
        for t in range(self.T):
            self.stumps[t] = {}
            if t!=0:
                temp_df = train_df.sample(frac=1, replace=True, weights=weights, random_state=42)
            else:
                temp_df = train_df.copy()
            for variable in train_variables:
                stump_df = temp_df[[variable, target_col]]
                rule_gen = RuleGenerator(min_sup=min_sup, min_conf=min_conf)
                rule_gen.generate_rules(stump_df, target_col)
                classifier = Classifier(rule_gen)
                classifier.build_classifier(stump_df, target_col)
                
                # TODO: Ugly Code! Need to fix the whole no rules thing
                # For case if no rules generated
                if len(classifier.rules)==0:
                    continue
                ans = classifier.predict(train_df[[variable,target_col]])
                ans['correct'] = (ans[target_col]==ans['prediction'])
                acc = len(ans[ans['correct']==True])/len(ans)

                self.stumps[t][variable] = {'acc':acc, 'model': classifier}

            max_acc_variable = max(self.stumps[t].items(), key=lambda x: x[1]['acc'])
        #     print(max_acc_variable)

            ans = max_acc_variable[1]['model'].predict(train_df[[max_acc_variable[0],target_col]])
            ans['wrong'] = (ans[target_col]!=ans['prediction'])

            error = np.dot(weights, (ans['wrong'])) / sum(weights)
            alpha = np.log((1-error)/error)

            weights = weights * np.exp(alpha*ans['wrong'])
            weights = weights/sum(weights)
            self.models.append({'alpha': alpha, 'model': max_acc_variable[1]['model'], 'variable': max_acc_variable[0]})
            
        return self
    
    def predict(self, test_df):
        ans = pd.DataFrame(index=test_df.index)
        for i in range(len(self.models)):
            model = self.models[i]

            temp = model['model'].predict(test_df)
            classes = set(temp['prediction'].unique()) - set(ans.columns)
            if classes:
                ans = ans.join(pd.DataFrame(columns=list(classes)))
                ans = ans.fillna(0)
            ans = ans+pd.get_dummies(temp['prediction'])*model['alpha']
        ans['prediction'] = ans.idxmax(axis='columns')
        ans = test_df.merge(ans, left_index=True, right_index=True)
        return ans

In [355]:
adaboost = Adaboost(50)
adaboost.train(train_df, 'Attrition_Flag')
ans = adaboost.predict(test_df)
ans['correct'] = (ans['Attrition_Flag']==ans['prediction'])
ans['correct'].value_counts()

No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from rule generator!
No CARS from r

True     89
False    11
Name: correct, dtype: int64

In [314]:
%%time
T = 50
target_col = 'class'
train_variables = list(train_df.columns)
train_variables.remove(target_col)

stumps = {}
models = []
weights = np.ones(len(train_df))/len(train_df)
for t in range(T):
    stumps[t] = {}
    if t!=0:
        temp_df = train_df.sample(frac=1, replace=True, weights=weights, random_state=42)
    else:
        temp_df = train_df.copy()
    for variable in train_variables:
        stump_df = temp_df[[variable, target_col]]
        rule_gen = RuleGenerator(min_sup=0.01, min_conf=0.5)
        rule_gen.generate_rules(stump_df, target_col)
        classifier = Classifier(rule_gen)
        classifier.build_classifier(stump_df, target_col)
        
        ans = classifier.predict(train_df[[variable,target_col]])
        ans['correct'] = (ans[target_col]==ans['prediction'])
        ans['correct'] = ans['correct'].replace([True, False], [1, -1])
        acc = len(ans[ans['correct']==1])/len(ans)

        stumps[t][variable] = {'acc':acc, 'model': classifier}
    
    max_acc_variable = max(stumps[t].items(), key=lambda x: x[1]['acc'])
#     print(max_acc_variable)
    
    ans = max_acc_variable[1]['model'].predict(train_df[[max_acc_variable[0],target_col]])
    ans['correct'] = (ans[target_col]==ans['prediction'])
    ans['wrong'] = (ans[target_col]!=ans['prediction'])
    
    error = np.dot(weights, (ans['wrong'])) / sum(weights)
    alpha = np.log((1-error)/error)
    
    weights = weights * np.exp(alpha*ans['wrong'])
    weights = weights/sum(weights)
#     print(weights)
    models.append({'alpha': alpha, 'model': max_acc_variable[1]['model'], 'variable': max_acc_variable[0]})

CPU times: total: 14.2 s
Wall time: 14.2 s


In [287]:
models

[{'alpha': 2.6631996349368636,
  'model': <__main__.Classifier at 0x1aa604c6d90>,
  'variable': 'uniformity_of_cell_size'},
 {'alpha': 1.2112581427676152,
  'model': <__main__.Classifier at 0x1aa604f6f40>,
  'variable': 'bland_chromatin'},
 {'alpha': 1.6630376796645054,
  'model': <__main__.Classifier at 0x1aa607b0d90>,
  'variable': 'bare_nuclei'},
 {'alpha': 1.0241779686657553,
  'model': <__main__.Classifier at 0x1aa5a9eb820>,
  'variable': 'bare_nuclei'},
 {'alpha': 0.2840486096559002,
  'model': <__main__.Classifier at 0x1aa6009cfd0>,
  'variable': 'bare_nuclei'},
 {'alpha': 0.8627428951481032,
  'model': <__main__.Classifier at 0x1aa60062e50>,
  'variable': 'uniformity_of_cell_shape'},
 {'alpha': 1.022290157034047,
  'model': <__main__.Classifier at 0x1aa5e88f550>,
  'variable': 'normal_nucleoli'},
 {'alpha': 1.5881301934000152,
  'model': <__main__.Classifier at 0x1aa60404070>,
  'variable': 'uniformity_of_cell_size'},
 {'alpha': 0.8689149121949261,
  'model': <__main__.Classifi

In [336]:
ans = pd.DataFrame(index=test_df.index)
for i in range(len(models)):
    model = models[i]

    temp = model['model'].predict(test_df)
    classes = set(temp['prediction'].unique()) - set(ans.columns)
    if classes:
        ans = ans.join(pd.DataFrame(columns=classes))
    ans = ans.fillna(0)
    ans = ans+pd.get_dummies(temp['prediction'])*model['alpha']
ans['prediction'] = ans.idxmax(axis='columns')
ans = test_df.merge(ans, left_index=True, right_index=True)

In [337]:
ans['correct'] = (ans['class']==ans['prediction'])

In [338]:
ans['correct'].value_counts()

True     98
False     2
Name: correct, dtype: int64

In [318]:
ans = np.zeros(len(test_df))
alphas = 0
for i in range(len(models)):
    model = models[i]
    
    temp = model['model'].predict(test_df)
    ans += temp['prediction']*model['alpha']
    alphas += model['alpha']

ans = ans/alphas

pred_df = test_df.copy()

pred_df['prediction'] = ans

pred_df['prediction'] = pred_df['prediction']-3
pred_df['class'] = pred_df['class']-3

pred_df['correct'] = (((pred_df['prediction']<0) & (pred_df['class']<0))|((pred_df['prediction']>0) & (pred_df['class']>0)))

In [319]:
pred_df['correct'].value_counts()

True     98
False     2
Name: correct, dtype: int64

In [None]:
df = pd.read_csv('data/breast_w/clean_breast_W.csv')

In [364]:
rule_gen = RuleGenerator(min_sup=0.01, min_conf=0.5)

In [365]:
%%time
rule_gen.generate_rules(train_df, 'Attrition_Flag')

2
3
4
5


KeyboardInterrupt: 

In [366]:
classifier = Classifier(rule_gen)

In [367]:
classifier.build_classifier(train_df, 'Attrition_Flag')

In [368]:
ans = classifier.predict(test_df)

In [369]:
ans['correct'] = (ans['Attrition_Flag']==ans['prediction'])
ans['correct'].value_counts()

True     88
False    12
Name: correct, dtype: int64