## Apriori Frequent Pattern Mining improved using Transaction Reduction

In [71]:
import pandas as pd
import numpy as np
from itertools import combinations
import time

In [72]:
# Reading data file
adult = pd.read_excel('adult.xlsx')
adult.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Edunum,Marital,Occupation,Relationship,Race,Sex,Gain,Loss,Hoursperweek,Country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [73]:
# Removing irrelevant columns
adult.drop(['fnlwgt', 'Edunum'], axis = 1, inplace = True)
adult.head()

Unnamed: 0,Age,Workclass,Education,Marital,Occupation,Relationship,Race,Sex,Gain,Loss,Hoursperweek,Country,Income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [74]:
# Size of original data
adult.shape

(32561, 13)

In [75]:
# Removing missing values from data
adult.replace(' ?', np.NaN, inplace = True)
adult.dropna(axis = 0, inplace = True)

# Size of data after removing missing values
adult.shape
adult.head()

Unnamed: 0,Age,Workclass,Education,Marital,Occupation,Relationship,Race,Sex,Gain,Loss,Hoursperweek,Country,Income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [76]:
# Size of data after removing missing values and resetting index
adult.shape
adult.reset_index(drop = True, inplace = True)
adult.tail()

Unnamed: 0,Age,Workclass,Education,Marital,Occupation,Relationship,Race,Sex,Gain,Loss,Hoursperweek,Country,Income
30157,27,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
30158,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
30159,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
30160,22,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
30161,52,Self-emp-inc,HS-grad,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [77]:
# Converting numeric data to categorical data
adult['Age'] = pd.cut(adult['Age'], [0, 25, 40, np.inf], labels=["young", "middle_aged", "old"])
adult['Gain'] = pd.cut(adult['Gain'], [-1, 1, np.inf], labels=["No_Gain", "Gain"])
adult['Loss'] = pd.cut(adult['Loss'], [-1, 1, np.inf], labels=["No_Loss", "Loss"])
adult['Hoursperweek'] = pd.cut(adult['Hoursperweek'], [-1, 5, 20, 60, np.inf], labels=["Less", "Medium", "Reasonable", "High"])
adult.head()

Unnamed: 0,Age,Workclass,Education,Marital,Occupation,Relationship,Race,Sex,Gain,Loss,Hoursperweek,Country,Income
0,middle_aged,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,Gain,No_Loss,Reasonable,United-States,<=50K
1,old,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,No_Gain,No_Loss,Medium,United-States,<=50K
2,middle_aged,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,No_Gain,No_Loss,Reasonable,United-States,<=50K
3,old,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,No_Gain,No_Loss,Reasonable,United-States,<=50K
4,middle_aged,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,No_Gain,No_Loss,Reasonable,Cuba,<=50K


In [78]:
# Generating Candidate 1
def C1(data):
    candidates = {} # empty dictionary to store each item as key and its count as value
    for i in data.index:
        for j in data.loc[i]:
            if (j in candidates): 
                candidates[j] += 1
            else: 
                candidates[j] = 1
    print("\nC1: \n", candidates)
    return candidates

In [79]:
# Generating L1
def L1(c, support):
    l1 = {} # dictionary to store all the items having support more than or equal to minimum support
    l1 = dict((k,v) for k, v in c.items() if v >= support)
    print("\n\nL1: \n", l1)
    return l1

In [80]:
# Function to check if candidate itemset contains infrequent subset
def has_infrequent_subset(candidate, freq, prevL):
    for i in list(combinations(candidate,freq-1)):
        if i not in prevL:
            return True
        return False

In [81]:
# Generating Candidate k
def Ck(k, prevL, df):
    
    # Join
    if k == 2:
        c = list()
        for key,v in prevL.items():
            if key not in c:
                c.append(key)
            
    # When k is greater than 2, L (k-1) contains list of tuples
    if k > 2:
        c = list()
        for key,v in prevL.items():
            for item in key:
                if item not in c:
                    c.append(item)
                    
    candidates = {} # To store all the candidate items along with its count
    cand = list(combinations(c, k))
    
    # Prune
    for cd in cand:
        if (has_infrequent_subset(cd, k, prevL) == True):
            cand.remove(cd)
    
    count = {}
    for i in cand:
        candidates[i] = 0
    for i in range(len(df)):
        count[i] = 0
        
    for ind in df.index:
        for i in candidates:
            if set(i).issubset(df.loc[ind]):
                candidates[i] +=1
                count[ind] += 1
                
    print("\n\nC", k, ":\n", candidates)
    
    if candidates != {}:
        for ind in df.index:   
            if (count[ind] == 0):
                df.loc[ind] = np.NaN
            
        df.dropna(axis = 0, inplace = True)        
        adult.reset_index(drop = True, inplace = True)
    
    print("\nShape after reduction:")
    print(df.shape)
    
    return candidates

In [82]:
# Generating Lk
def Lk(k, ck, support, df):
    lk = {}
    count = {}
    lk = dict((key,v) for key, v in ck.items() if v >= support)

    print("\n\nL", k, ":\n", lk)
    return lk

In [83]:
# Function for Apriori Algorithm
def apriori(data, support):
    t1 = time.time()
    s = int((support/100)*len(data))
    print("Min sup = ", s)
    lk = {}
    k = 2
    candidateK = {}
    candidateK = C1(data)
    lk = L1(candidateK, s)
    while lk != {}:
        candidateK = Ck(k, lk, data)
        lk = Lk(k, candidateK, s, data)
        k += 1
    t2 = time.time()
    exec_time = t2 - t1
    print("\nExecution time = ", exec_time)
    return

In [85]:
apriori(adult, 50)

Min sup =  15080

C1: 
 {'middle_aged': 12091, ' State-gov': 1279, ' Bachelors': 5044, ' Never-married': 9726, ' Adm-clerical': 3721, ' Not-in-family': 7725, ' White': 25933, ' Male': 20380, 'Gain': 2538, 'No_Loss': 28734, 'Reasonable': 26721, ' United-States': 27504, ' <=50K': 22654, 'old': 12401, ' Self-emp-not-inc': 2498, ' Married-civ-spouse': 14064, ' Exec-managerial': 3990, ' Husband': 12463, 'No_Gain': 27622, 'Medium': 2277, ' Private': 22286, ' HS-grad': 9839, ' Divorced': 4213, ' Handlers-cleaners': 1350, ' 11th': 1048, ' Black': 2817, ' Prof-specialty': 4038, ' Wife': 1405, ' Female': 9780, ' Cuba': 92, ' Masters': 1627, ' 9th': 455, ' Married-spouse-absent': 370, ' Other-service': 3212, ' Jamaica': 80, ' >50K': 7506, ' Some-college': 6677, 'High': 1051, ' Asian-Pac-Islander': 893, ' India': 100, 'young': 5668, ' Own-child': 4466, ' Assoc-acdm': 1008, ' Sales': 3584, ' 7th-8th': 557, ' Transport-moving': 1572, ' Amer-Indian-Eskimo': 286, ' Mexico': 610, ' Farming-fishing': 98


Shape after reduction:
(29967, 13)


L 4 :
 {(' White', 'No_Loss', ' United-States', 'No_Gain'): 20836, (' White', 'No_Loss', ' United-States', ' Male'): 15898, (' White', 'No_Loss', ' United-States', 'Reasonable'): 20267, (' White', 'No_Loss', ' United-States', ' <=50K'): 17100, (' White', 'No_Loss', ' United-States', ' Private'): 16895, (' White', 'No_Loss', 'No_Gain', ' Male'): 15253, (' White', 'No_Loss', 'No_Gain', 'Reasonable'): 19725, (' White', 'No_Loss', 'No_Gain', ' <=50K'): 17670, (' White', 'No_Loss', 'No_Gain', ' Private'): 16755, (' White', 'No_Loss', ' Male', 'Reasonable'): 15352, (' White', 'No_Loss', 'Reasonable', ' <=50K'): 16056, (' White', 'No_Loss', 'Reasonable', ' Private'): 16314, (' White', ' United-States', 'No_Gain', ' Male'): 15140, (' White', ' United-States', 'No_Gain', 'Reasonable'): 19417, (' White', ' United-States', 'No_Gain', ' <=50K'): 16922, (' White', ' United-States', 'No_Gain', ' Private'): 16292, (' White', ' United-States', ' Male', 'Reasonable