In [1]:
from ap_utils import *

In [2]:
import numpy as np
import pandas as pd
import pprint

In [3]:
from load_and_preprocess_data import load_data, get_label_appended_data

## Loading the data

In [4]:
path_to_data = '../data/test_data.txt'
min_support = 0.02
min_confidence = 0.02

In [5]:
data = load_data('../data/diabetes.csv')
order = [col for col in data.columns]
transformed_data = get_label_appended_data(data)

In [6]:
transactions = transformed_data.to_numpy()[:25]

In [7]:
print(order)

['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']


## Initialization

In [8]:
C = {}
L = {}
itemset_size = 1
discarded = {itemset_size: []}
items = []
for transaction in transactions:
    for item in transaction:
        items.append([item])
items.sort(key = lambda x: order.index(x[0].split(',')[1]))
C.update({itemset_size: items})

In [9]:
def print_table(T, supp_count):
    print('Itemset | Frequency')
    for k in range(len(T)):
        print(f'{T[k]} : {supp_count[k]}')
    print()
    print()

In [10]:
supp_count_L = {}
f, sup, new_discarded = get_frequent(C[itemset_size], transactions, min_support, discarded)
discarded.update({itemset_size: new_discarded})
L.update({itemset_size: f})
supp_count_L.update({itemset_size: sup})

In [11]:
print_table(L[1], supp_count_L[1])

Itemset | Frequency
['6,Pregnancies'] : 1
['1,Pregnancies'] : 5
['8,Pregnancies'] : 3
['1,Pregnancies'] : 5
['0,Pregnancies'] : 2
['5,Pregnancies'] : 2
['3,Pregnancies'] : 2
['10,Pregnancies'] : 3
['2,Pregnancies'] : 1
['8,Pregnancies'] : 3
['4,Pregnancies'] : 1
['10,Pregnancies'] : 3
['10,Pregnancies'] : 3
['1,Pregnancies'] : 5
['5,Pregnancies'] : 2
['7,Pregnancies'] : 3
['0,Pregnancies'] : 2
['7,Pregnancies'] : 3
['1,Pregnancies'] : 5
['1,Pregnancies'] : 5
['3,Pregnancies'] : 2
['8,Pregnancies'] : 3
['7,Pregnancies'] : 3
['9,Pregnancies'] : 1
['11,Pregnancies'] : 1
['148,Glucose'] : 1
['85,Glucose'] : 1
['183,Glucose'] : 1
['89,Glucose'] : 1
['137,Glucose'] : 1
['116,Glucose'] : 1
['78,Glucose'] : 1
['115,Glucose'] : 2
['197,Glucose'] : 1
['125,Glucose'] : 1
['110,Glucose'] : 1
['168,Glucose'] : 1
['139,Glucose'] : 1
['189,Glucose'] : 1
['166,Glucose'] : 1
['100,Glucose'] : 1
['118,Glucose'] : 1
['107,Glucose'] : 1
['103,Glucose'] : 1
['115,Glucose'] : 2
['126,Glucose'] : 1
['99,Gluc

In [12]:
k = itemset_size + 1
convergence = False
while convergence == False:
    C.update({k: join_set_itemsets(L[k - 1], order)})
    print(f'Table C{k}: \n')
    print(len(C[k]))
#     print_table(C[k], [count_occurences(it, transactions) for it in C[k]])
    f, sup, new_discarded = get_frequent(C[k], transactions, min_support, discarded)
    discarded.update({k: new_discarded})
    L.update({k: f})
    supp_count_L.update({k: sup})
    if len(L[k]) == 0:
        convergence = True
    else:
        print(f'Table L{k}: \n')
        print(len(L[k]))
#         print_table(L[k], supp_count_L[k])
    k += 1

Table C2: 

22500
Table L2: 

7069
Table C3: 

0


In [13]:
pprint.pprint(L[k - 2])

[['6,Pregnancies', '148,Glucose'],
 ['6,Pregnancies', '72,BloodPressure'],
 ['6,Pregnancies', '72,BloodPressure'],
 ['6,Pregnancies', '35,SkinThickness'],
 ['6,Pregnancies', '35,SkinThickness'],
 ['6,Pregnancies', '35,SkinThickness'],
 ['6,Pregnancies', '0,Insulin'],
 ['6,Pregnancies', '0,Insulin'],
 ['6,Pregnancies', '0,Insulin'],
 ['6,Pregnancies', '0,Insulin'],
 ['6,Pregnancies', '0,Insulin'],
 ['6,Pregnancies', '0,Insulin'],
 ['6,Pregnancies', '0,Insulin'],
 ['6,Pregnancies', '0,Insulin'],
 ['6,Pregnancies', '0,Insulin'],
 ['6,Pregnancies', '0,Insulin'],
 ['6,Pregnancies', '0,Insulin'],
 ['6,Pregnancies', '0,Insulin'],
 ['6,Pregnancies', '0,Insulin'],
 ['6,Pregnancies', '0,Insulin'],
 ['6,Pregnancies', '33.6,BMI'],
 ['6,Pregnancies', '0.627,DiabetesPedigreeFunction'],
 ['6,Pregnancies', '50,Age'],
 ['6,Pregnancies', '50,Age'],
 ['6,Pregnancies', '1,Outcome'],
 ['6,Pregnancies', '1,Outcome'],
 ['6,Pregnancies', '1,Outcome'],
 ['6,Pregnancies', '1,Outcome'],
 ['6,Pregnancies', '1,Out

## Generate Rules

In [14]:
from itertools import combinations, chain
def powerset(s):
    return list(chain.from_iterable(combinations(s, r) for r in range(1, len(s) + 1)))

In [15]:
def write_rules(X, X_S, S, conf, supp, lift, num_transactions):
    rule = [S, X_S]
    out_rules = ''
    out_rules += f'Freq. Itemset: {X}\n'
    out_rules += f'\tRule: {S} -> {X_S}\n'
    out_rules += f'\tConf: {0:2.3f} '.format(conf)
    out_rules += f'\tSupp: {0:2.3f} '.format(supp/num_trans)
    out_rules += f'\tLift: {0:2.3f}\n'.format(lift)
    return out_rules, rule

In [16]:
assoc_rules_str = ''
rules_list = []
num_trans = len(transactions)
for i in range(1, len(L)):
    for j in range(len(L[i])):
        s = list(powerset(set(L[i][j])))
        s.pop()
        for z in s:
            S = set(z)
            X = set(L[i][j])
            X_S = set(X - S)
            sup_x = count_occurences(X, transactions)
            sup_x_s = count_occurences(X_S, transactions)
            conf = sup_x/count_occurences(S, transactions)
            lift = sup_x/(sup_x_s/num_trans)
            if conf >= min_confidence and sup_x >= min_support:
                rule_output, rule = write_rules(X, X_S, S, conf, sup_x, lift, num_trans)
                assoc_rules_str += rule_output
                rules_list.append(rule)

In [17]:
print(assoc_rules_str)

Freq. Itemset: {'6,Pregnancies', '148,Glucose'}
	Rule: {'6,Pregnancies'} -> {'148,Glucose'}
	Conf: 0.000 	Supp: 0.000 	Lift: 0.000
Freq. Itemset: {'6,Pregnancies', '148,Glucose'}
	Rule: {'148,Glucose'} -> {'6,Pregnancies'}
	Conf: 0.000 	Supp: 0.000 	Lift: 0.000
Freq. Itemset: {'6,Pregnancies', '72,BloodPressure'}
	Rule: {'6,Pregnancies'} -> {'72,BloodPressure'}
	Conf: 0.000 	Supp: 0.000 	Lift: 0.000
Freq. Itemset: {'6,Pregnancies', '72,BloodPressure'}
	Rule: {'72,BloodPressure'} -> {'6,Pregnancies'}
	Conf: 0.000 	Supp: 0.000 	Lift: 0.000
Freq. Itemset: {'6,Pregnancies', '72,BloodPressure'}
	Rule: {'6,Pregnancies'} -> {'72,BloodPressure'}
	Conf: 0.000 	Supp: 0.000 	Lift: 0.000
Freq. Itemset: {'6,Pregnancies', '72,BloodPressure'}
	Rule: {'72,BloodPressure'} -> {'6,Pregnancies'}
	Conf: 0.000 	Supp: 0.000 	Lift: 0.000
Freq. Itemset: {'35,SkinThickness', '6,Pregnancies'}
	Rule: {'35,SkinThickness'} -> {'6,Pregnancies'}
	Conf: 0.000 	Supp: 0.000 	Lift: 0.000
Freq. Itemset: {'35,SkinThickness

In [18]:
for rule in rules_list:
    if len(rule[1]) == 1 and list(rule[1])[0].split(',')[1] == order[-1]:
        print(rule)

[{'6,Pregnancies'}, {'1,Outcome'}]
[{'6,Pregnancies'}, {'1,Outcome'}]
[{'6,Pregnancies'}, {'1,Outcome'}]
[{'6,Pregnancies'}, {'1,Outcome'}]
[{'6,Pregnancies'}, {'1,Outcome'}]
[{'6,Pregnancies'}, {'1,Outcome'}]
[{'6,Pregnancies'}, {'1,Outcome'}]
[{'6,Pregnancies'}, {'1,Outcome'}]
[{'6,Pregnancies'}, {'1,Outcome'}]
[{'6,Pregnancies'}, {'1,Outcome'}]
[{'6,Pregnancies'}, {'1,Outcome'}]
[{'6,Pregnancies'}, {'1,Outcome'}]
[{'6,Pregnancies'}, {'1,Outcome'}]
[{'6,Pregnancies'}, {'1,Outcome'}]
[{'6,Pregnancies'}, {'1,Outcome'}]
[{'6,Pregnancies'}, {'1,Outcome'}]
[{'1,Pregnancies'}, {'1,Outcome'}]
[{'1,Pregnancies'}, {'0,Outcome'}]
[{'1,Pregnancies'}, {'1,Outcome'}]
[{'1,Pregnancies'}, {'0,Outcome'}]
[{'1,Pregnancies'}, {'1,Outcome'}]
[{'1,Pregnancies'}, {'0,Outcome'}]
[{'1,Pregnancies'}, {'1,Outcome'}]
[{'1,Pregnancies'}, {'0,Outcome'}]
[{'1,Pregnancies'}, {'1,Outcome'}]
[{'1,Pregnancies'}, {'1,Outcome'}]
[{'1,Pregnancies'}, {'0,Outcome'}]
[{'1,Pregnancies'}, {'1,Outcome'}]
[{'1,Pregnancies'}, 

In [19]:
features = []
for rule in rules_list:
    if len(rule[1]) == 1 and list(rule[1])[0].split(',')[1] == order[-1]:
        features.append(rule[0])

In [20]:
features

[{'6,Pregnancies'},
 {'6,Pregnancies'},
 {'6,Pregnancies'},
 {'6,Pregnancies'},
 {'6,Pregnancies'},
 {'6,Pregnancies'},
 {'6,Pregnancies'},
 {'6,Pregnancies'},
 {'6,Pregnancies'},
 {'6,Pregnancies'},
 {'6,Pregnancies'},
 {'6,Pregnancies'},
 {'6,Pregnancies'},
 {'6,Pregnancies'},
 {'6,Pregnancies'},
 {'6,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'1,Pregnancies'},
 {'8,Pregnancies'},
 {'8,Pregnancies'},
 {'8,Pregnancies'},
 {'8,Pregnancies'},
 {'8,Pregnancies'},
 {'8,Pregnancies'},
 {'8,Pregnancies'},
 {'8,Pregnancies'},
 {'8,Pregnancies'},
