In [552]:
import pandas as pd
import numpy as np
import itertools

In [553]:
# Read the original data into a dataframe. header=None treats the first line
# as data, so the columns are labelled with their integer positions;
# skipinitialspace=True drops the blank that follows each comma delimiter.
df = pd.read_csv('adult.data', header=None, skipinitialspace=True)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [554]:
# Print the dtype and non-null count of every column.
df.info()
# Names for columns 0-13, in column order. The preprocessing cell uses them as
# prefixes for the binned numeric items; column 14 has no name here.
df_names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship",
            "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country"]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       32561 non-null  int64 
 1   1       32561 non-null  object
 2   2       32561 non-null  int64 
 3   3       32561 non-null  object
 4   4       32561 non-null  int64 
 5   5       32561 non-null  object
 6   6       32561 non-null  object
 7   7       32561 non-null  object
 8   8       32561 non-null  object
 9   9       32561 non-null  object
 10  10      32561 non-null  int64 
 11  11      32561 non-null  int64 
 12  12      32561 non-null  int64 
 13  13      32561 non-null  object
 14  14      32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [555]:
# Column-wise extremes of the whole frame, indexed by the integer column label.
max_sample = df.max()
min_sample = df.min()
steps = []

# One bin width per column for the first 14 columns: a tenth of the value
# range, rounded up. Columns whose extremes cannot be read as integers get
# the sentinel -1.
for i in range(14):
    try:
        step = int(max_sample[i]) - int(min_sample[i])
        steps.append(np.ceil(step/10))
    except (ValueError, TypeError, KeyError):
        # int() rejects the string extremes of the categorical columns
        # (KeyError covers a column missing from the reduction); anything
        # else is a real error and is allowed to surface.
        steps.append(-1)

steps

[8.0, -1, 147242.0, -1, 2.0, -1, -1, -1, -1, -1, 10000.0, 436.0, 10.0, -1]

In [556]:
# Preprocess the data set: turn every row into a set of items (one transaction).
adult_data = []

for adult in df.values:
    adult_set = set()
    # only the first 14 columns are used; column 14 is left out
    for i in range(14):
        # skip missing values, which appear in the data as "?"
        if adult[i] == "?":
            pass
        # Convert continuous data to categorical items: the column name followed
        # by the bin number, i.e. the value divided by the column's step, floored.
        elif i in {0, 2, 10, 11, 12}:
            adult_set.add(df_names[i] + str(int(np.floor(adult[i]/steps[i]))))
        # Skip column 4 ("education-num"); presumably it repeats the information
        # of column 3 ("education") -- TODO confirm.
        elif i == 4:
            pass
        # categorical values are used as items unchanged
        else:
            adult_set.add(adult[i])
    adult_data.append(adult_set)

# keep only the first four transactions; the cell displays the first of them
adult_data = adult_data[:4]
adult_data[0]

{'Adm-clerical',
 'Bachelors',
 'Male',
 'Never-married',
 'Not-in-family',
 'State-gov',
 'United-States',
 'White',
 'age4',
 'capital-gain0',
 'capital-loss0',
 'fnlwgt0',
 'hours-per-week4'}

In [557]:
# first scan
def first_scan(data_set, min_support):
    """Find the frequent 1-item sets and build the 2-item candidates.

    Parameters
    ----------
    data_set : iterable of iterables
        Transactions; every item must be hashable.
    min_support : int
        Minimum number of occurrences for an item to count as frequent.

    Returns
    -------
    (frequent_dict, new_item_set)
        ``frequent_dict`` is a list of ``({item}, support)`` tuples in the
        order the items were first seen; ``new_item_set`` is the list of
        2-item candidate sets formed from every pair of frequent items.
    """
    # Count every item in one pass. A dict keeps first-seen order and avoids
    # searching a list for each occurrence.
    supports = {}
    for transaction in data_set:
        for item in transaction:
            supports[item] = supports.get(item, 0) + 1

    # compare candidates with min_support
    frequent_dict = [({item}, count) for item, count in supports.items()
                     if count >= min_support]
    item_set = [entry[0] for entry in frequent_dict]

    # Generate the new candidates. The duplicate check looks at the candidate
    # itself, so the function no longer depends on a name from another cell.
    new_item_set = []
    for first, second in itertools.combinations(item_set, 2):
        candidate = first.union(second)
        if candidate not in new_item_set:
            new_item_set.append(candidate)

    return frequent_dict, new_item_set

# First pass over the sample transactions with a minimum support of 3; the
# result pairs the frequent 1-item sets with the 2-item candidates.
adult_c1 = first_scan(adult_data, 3)
adult_c1

([({'hours-per-week4'}, 3),
  ({'capital-loss0'}, 4),
  ({'Male'}, 4),
  ({'United-States'}, 4),
  ({'White'}, 3),
  ({'capital-gain0'}, 4)],
 [{'capital-loss0', 'hours-per-week4'},
  {'Male', 'hours-per-week4'},
  {'United-States', 'hours-per-week4'},
  {'White', 'hours-per-week4'},
  {'capital-gain0', 'hours-per-week4'},
  {'Male', 'capital-loss0'},
  {'United-States', 'capital-loss0'},
  {'White', 'capital-loss0'},
  {'capital-gain0', 'capital-loss0'},
  {'Male', 'United-States'},
  {'Male', 'White'},
  {'Male', 'capital-gain0'},
  {'United-States', 'White'},
  {'United-States', 'capital-gain0'},
  {'White', 'capital-gain0'}])

In [558]:
# helper functions used to get the subsets of a specific length from an iterable object
def power_set(iterable):
    """Return a lazy iterator over every subset of ``iterable``, as tuples.

    Subsets come out grouped by size, from the empty tuple up to the tuple
    holding all elements.
    """
    pool = list(iterable)
    by_size = (itertools.combinations(pool, size) for size in range(len(pool) + 1))
    return itertools.chain.from_iterable(by_size)

def k_subsets(s, k):
    """Return every subset of ``s`` that has exactly ``k`` elements.

    Parameters
    ----------
    s : iterable
        Source elements; they must be hashable.
    k : int
        Size of the wanted subsets.

    Returns
    -------
    list of set
        The k-element subsets, in the order ``itertools.combinations``
        produces them; empty when ``k`` is negative or larger than ``s``.
    """
    if k < 0:
        # combinations() rejects a negative size, and no subset has one
        return []
    # Build only the size-k combinations instead of filtering all 2**n subsets.
    return [set(combo) for combo in itertools.combinations(list(s), k)]

# quick check: the 2-element subsets of a 3-element set
k_subsets({"a", "b", "c"}, 2)

[{'a', 'b'}, {'a', 'c'}, {'b', 'c'}]

In [559]:
# general scan function
def scan(data_set, ck, min_support, print_flag=False):
    """Count the candidates ``ck`` in ``data_set`` and build the next level.

    Parameters
    ----------
    data_set : iterable of iterables
        Transactions; each one is handed to ``set.issubset``.
    ck : list of set
        Candidate item sets, expected to all have the same size k.
    min_support : int
        Minimum number of containing transactions for a candidate to be
        frequent.
    print_flag : bool, optional
        When true, print every frequent candidate together with its support.

    Returns
    -------
    (frequent_dict, new_item_set)
        ``frequent_dict`` is a list of ``(item_set, support)`` tuples in the
        order of ``ck``. ``new_item_set`` is the list of (k+1)-item candidates
        obtained by joining frequent sets that share k-1 items, keeping only
        those whose k-item subsets are all frequent. Both lists are empty
        when no candidate reaches ``min_support``, so the result can always
        be unpacked into two names.
    """
    # Count by position. Looking the candidate up with list.index() on every
    # hit is quadratic and credits a repeated candidate's hits to its first
    # occurrence only.
    supports = [0] * len(ck)
    for transaction in data_set:
        for idx, candidate in enumerate(ck):
            if candidate.issubset(transaction):
                supports[idx] += 1

    # compare candidates with min_support
    frequent_item_sets = []
    frequent_dict = []
    for candidate, support in zip(ck, supports):
        if support >= min_support:
            frequent_item_sets.append(candidate)
            frequent_dict.append((candidate, support))
            if print_flag:
                print(candidate, support)

    if not frequent_item_sets:
        # same two-part shape as the normal return value
        return [], []

    # generate new candidates
    k_num = len(frequent_item_sets[0])
    frequent_lookup = {frozenset(item) for item in frequent_item_sets}

    new_item_set = []
    examined = set()
    for left, right in itertools.combinations(frequent_item_sets, 2):
        shared = left & right
        # Join only sets that overlap in exactly k-1 items. An empty overlap
        # never joins, so 1-item sets produce no candidates here.
        if not shared or len(shared) != k_num - 1:
            continue
        candidate = left.union(right)
        key = frozenset(candidate)
        if key in examined:
            # already accepted or rejected through another pair
            continue
        examined.add(key)
        # prune candidates that contain an infrequent k-item subset
        if all(frozenset(subset) in frequent_lookup
               for subset in itertools.combinations(candidate, k_num)):
            new_item_set.append(candidate)

    return frequent_dict, new_item_set

# Second pass: count the 2-item candidates produced by the first scan, printing
# each frequent pair with its support (print_flag=True).
adult_c2 = scan(adult_data, adult_c1[1], 3, True)

adult_c2

{'hours-per-week4', 'capital-loss0'} 3
{'hours-per-week4', 'Male'} 3
{'hours-per-week4', 'United-States'} 3
{'hours-per-week4', 'capital-gain0'} 3
{'Male', 'capital-loss0'} 4
{'United-States', 'capital-loss0'} 4
{'White', 'capital-loss0'} 3
{'capital-gain0', 'capital-loss0'} 4
{'United-States', 'Male'} 4
{'White', 'Male'} 3
{'capital-gain0', 'Male'} 4
{'White', 'United-States'} 3
{'capital-gain0', 'United-States'} 4
{'White', 'capital-gain0'} 3


([({'capital-loss0', 'hours-per-week4'}, 3),
  ({'Male', 'hours-per-week4'}, 3),
  ({'United-States', 'hours-per-week4'}, 3),
  ({'capital-gain0', 'hours-per-week4'}, 3),
  ({'Male', 'capital-loss0'}, 4),
  ({'United-States', 'capital-loss0'}, 4),
  ({'White', 'capital-loss0'}, 3),
  ({'capital-gain0', 'capital-loss0'}, 4),
  ({'Male', 'United-States'}, 4),
  ({'Male', 'White'}, 3),
  ({'Male', 'capital-gain0'}, 4),
  ({'United-States', 'White'}, 3),
  ({'United-States', 'capital-gain0'}, 4),
  ({'White', 'capital-gain0'}, 3)],
 [{'Male', 'capital-loss0', 'hours-per-week4'},
  {'United-States', 'capital-loss0', 'hours-per-week4'},
  {'capital-gain0', 'capital-loss0', 'hours-per-week4'},
  {'Male', 'United-States', 'hours-per-week4'},
  {'Male', 'capital-gain0', 'hours-per-week4'},
  {'United-States', 'capital-gain0', 'hours-per-week4'},
  {'Male', 'United-States', 'capital-loss0'},
  {'Male', 'White', 'capital-loss0'},
  {'Male', 'capital-gain0', 'capital-loss0'},
  {'United-States', '

In [560]:
# integrate previous functions

def apriori(data_set, min_sup):
    """Collect every frequent item set of ``data_set`` level by level.

    Parameters
    ----------
    data_set : iterable of iterables
        Transactions. They are scanned once per level, so this must be
        re-iterable (for example a list).
    min_sup : int
        Minimum support passed on to ``first_scan`` and ``scan``.

    Returns
    -------
    list of (set, int)
        ``(item_set, support)`` tuples, smaller item sets first.
    """
    frequent_item_set = []

    temp_frequent_dict, ck = first_scan(data_set, min_sup)
    frequent_item_set += temp_frequent_dict

    while ck:
        result = scan(data_set, ck, min_sup)
        # scan may return an empty result when no candidate of this size is
        # frequent; that cannot be unpacked and means the search is over.
        if not result:
            break
        temp_frequent_dict, ck = result
        frequent_item_set += temp_frequent_dict

    return frequent_item_set

# Run the full pipeline on the sample transactions with a minimum support of 3.
adult_fre_item_set = apriori(adult_data, 3)
adult_fre_item_set

[({'hours-per-week4'}, 3),
 ({'capital-loss0'}, 4),
 ({'Male'}, 4),
 ({'United-States'}, 4),
 ({'White'}, 3),
 ({'capital-gain0'}, 4),
 ({'capital-loss0', 'hours-per-week4'}, 3),
 ({'Male', 'hours-per-week4'}, 3),
 ({'United-States', 'hours-per-week4'}, 3),
 ({'capital-gain0', 'hours-per-week4'}, 3),
 ({'Male', 'capital-loss0'}, 4),
 ({'United-States', 'capital-loss0'}, 4),
 ({'White', 'capital-loss0'}, 3),
 ({'capital-gain0', 'capital-loss0'}, 4),
 ({'Male', 'United-States'}, 4),
 ({'Male', 'White'}, 3),
 ({'Male', 'capital-gain0'}, 4),
 ({'United-States', 'White'}, 3),
 ({'United-States', 'capital-gain0'}, 4),
 ({'White', 'capital-gain0'}, 3),
 ({'Male', 'capital-loss0', 'hours-per-week4'}, 3),
 ({'United-States', 'capital-loss0', 'hours-per-week4'}, 3),
 ({'capital-gain0', 'capital-loss0', 'hours-per-week4'}, 3),
 ({'Male', 'United-States', 'hours-per-week4'}, 3),
 ({'Male', 'capital-gain0', 'hours-per-week4'}, 3),
 ({'United-States', 'capital-gain0', 'hours-per-week4'}, 3),
 ({'Mal

In [561]:
# the confidence of the rule frequent_item0 => frequent_item1, computed from the stored supports
def confidence(frequent_item0, frequent_item1, frequent_item_set):
    """Return support(item0 union item1) / support(item0).

    Supports are looked up in ``frequent_item_set``, a sequence of
    ``(item_set, support)`` entries. The result is 0 when ``frequent_item0``
    has no (non-zero) support recorded there.
    """
    both = frequent_item0.union(frequent_item1)
    both_count = 0
    antecedent_count = 0

    for entry in frequent_item_set:
        itemset = entry[0]
        if itemset == both:
            both_count = entry[1]
        if itemset == frequent_item0:
            antecedent_count = entry[1]
        # stop scanning once both supports are known
        if both_count and antecedent_count:
            break

    if not antecedent_count:
        return 0
    return both_count/antecedent_count

# Example rule: confidence of {'Male'} => {'United-States', 'hours-per-week4'},
# looked up in the frequent item sets found above.
print('Male => {United_States, hours-per-week4}, Confidence =',
      confidence({'Male'}, {'United-States', 'hours-per-week4'} ,adult_fre_item_set))

Male => {United_States, hours-per-week4}, Confidence = 0.75
