In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('breast_w/clean_breast_w.csv')

In [3]:
df.head()

Unnamed: 0,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhension,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [5]:
variables = df.columns[:-1]
target_col = df.columns[-1]
class_labels = df['class'].unique()

In [6]:
class_labels

array([2, 4], dtype=int64)

In [119]:
def convert_to_rule_items(x, variables, class_label, min_sup, len_D):
#     print(x)
    condset = tuple([(variable, x[variable]) for variable in variables])
    
    class_count = {}
    for label in class_label:
        if label not in x:
            class_count[label] = 0
        else:
            class_count[label] = x[label]
#     print(class_count)
#     class_count = x[class_label].to_dict()
    condsup_count = sum(class_count.values())
    # Removing non frequent itemset
    if (condsup_count/len_D)<min_sup:
#         print('Pruning')
        return {}
#     print({condset: [condsup_count, class_count]})
    return {condset: (condsup_count, class_count)}

In [120]:
# RG Line 2 genRules(F_k)

def gen_rules(F_k, len_D, min_conf):
    
    rules = []
    
    for condset, (condsup_count, class_count) in F_k.items():
        major_class = max(class_count, key=lambda x: class_count[x])
        
        conf = class_count[major_class] / float(condsup_count)
        
        if conf>min_conf:
            # Checking if the support are the same for all classes
            if set(class_count.values())==1:
                # Choose a random class
                major_class = random.choice(class_count.keys())
                
            rules.append((condset,major_class))
            
    return rules

In [127]:
def gen_itemset(F, df, target_col, variables):
    F_new = {}
    variables = set(variables)
    for condset in F.keys():
#         print(f'New Set: {condset}')
        temp = df.copy()
        cols = []
        for item in condset:
            value = item[1]
            col = item[0]
            temp = temp[temp[col]==value]
            
            # Ensure that subsequent itemset generation do not generate same itemset
            # This 1 line cuts down a lot of permutations of the same itemset
            df = df[df[col]!=value]
            cols.append(col)
        remaining = variables - set(cols)
        # TODO: Currently there is both candidate generation
        for variable in remaining:
            # Candidate generation
            groupby = cols+[variable, target_col]
            
            # Calculating frequency
            group = temp.groupby(by=groupby).size().unstack(level=-1).reset_index()
            group = group.fillna(0)
            
            # Converting candidates in frequent itemset
            group.apply(lambda x: F_new.update(convert_to_rule_items(x, groupby[:-1], class_labels, min_sup, len_D)), axis=1)


    return F_new

In [128]:
%%time
# RG Line 1
# Format of F_k: 
# key = itemset
# value = [no. of appearance of itemset, dictionary with key as class_label and value as no. of class_label in itemset]

min_sup=0.2
min_conf=0.7
len_D = len(df)

k=1
F = {}
for variable in variables:
    temp = df.groupby(by=[variable, target_col]).size().unstack(level=1).reset_index()
    temp = temp.fillna(0)
#     print(temp)
    temp.apply(lambda x: F.update(convert_to_rule_items(x, [variable], class_labels, min_sup, len_D)), axis=1)

CARS = {}
# CAR_1 = genRules(F_1)
CARS[1] = gen_rules(F, len_D, min_conf)

while len(F)!=0:
    k+=1
    # Includes generating candidate itemset and finding frequent itemset
    F = gen_itemset(F, df, target_col, variables)
    # Build rules from frequent itemset
    CARS[k] = gen_rules(F, len_D, min_conf)


CPU times: total: 484 ms
Wall time: 478 ms


In [129]:
CARS

{1: [((('clump_thickness', 1.0),), 2),
  ((('uniformity_of_cell_size', 1.0),), 2),
  ((('uniformity_of_cell_shape', 1.0),), 2),
  ((('marginal_adhension', 1.0),), 2),
  ((('single_epithelial_cell_size', 2.0),), 2),
  ((('bare_nuclei', 1.0),), 2),
  ((('bland_chromatin', 1.0),), 2),
  ((('bland_chromatin', 2.0),), 2),
  ((('bland_chromatin', 3.0),), 2),
  ((('normal_nucleoli', 1.0),), 2),
  ((('mitoses', 1.0),), 2)],
 2: [((('uniformity_of_cell_size', 1.0), ('marginal_adhension', 1.0)), 2),
  ((('uniformity_of_cell_size', 1.0), ('mitoses', 1.0)), 2),
  ((('uniformity_of_cell_size', 1.0), ('single_epithelial_cell_size', 2.0)),
   2),
  ((('uniformity_of_cell_size', 1.0), ('uniformity_of_cell_shape', 1.0)), 2),
  ((('uniformity_of_cell_size', 1.0), ('bare_nuclei', 1.0)), 2),
  ((('uniformity_of_cell_size', 1.0), ('normal_nucleoli', 1.0)), 2)],
 3: [((('uniformity_of_cell_size', 1.0),
    ('marginal_adhension', 1.0),
    ('uniformity_of_cell_shape', 1.0)),
   2),
  ((('uniformity_of_cell_s