## Apriori Frequent Pattern Mining improved using Sampling

In [177]:
# Importing required libraries
import pandas as pd
import numpy as np
from itertools import combinations
import time

In [178]:
# Reading data file
adult = pd.read_excel('adult.xlsx')
adult.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Edunum,Marital,Occupation,Relationship,Race,Sex,Gain,Loss,Hoursperweek,Country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [179]:
# Removing irrelevant columns
adult.drop(['fnlwgt', 'Edunum'], axis = 1, inplace = True)
adult.head()

Unnamed: 0,Age,Workclass,Education,Marital,Occupation,Relationship,Race,Sex,Gain,Loss,Hoursperweek,Country,Income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [180]:
# Size of original data
adult.shape

(32561, 13)

In [181]:
# Removing missing values from data
adult.replace(' ?', np.NaN, inplace = True)
adult.dropna(axis = 0, inplace = True)

# Size of data after removing missing values
adult.shape

(30162, 13)

In [182]:
# Converting numeric data to categorical data
adult['Age'] = pd.cut(adult['Age'], [0, 25, 40, np.inf], labels=["young", "middle_aged", "old"])
adult['Gain'] = pd.cut(adult['Gain'], [-1, 1, np.inf], labels=["No_Gain", "Gain"])
adult['Loss'] = pd.cut(adult['Loss'], [-1, 1, np.inf], labels=["No_Loss", "Loss"])
adult['Hoursperweek'] = pd.cut(adult['Hoursperweek'], [-1, 5, 20, 60, np.inf], labels=["Less", "Medium", "Reasonable", "High"])
adult.head()

Unnamed: 0,Age,Workclass,Education,Marital,Occupation,Relationship,Race,Sex,Gain,Loss,Hoursperweek,Country,Income
0,middle_aged,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,Gain,No_Loss,Reasonable,United-States,<=50K
1,old,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,No_Gain,No_Loss,Medium,United-States,<=50K
2,middle_aged,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,No_Gain,No_Loss,Reasonable,United-States,<=50K
3,old,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,No_Gain,No_Loss,Reasonable,United-States,<=50K
4,middle_aged,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,No_Gain,No_Loss,Reasonable,Cuba,<=50K


In [183]:
# Generate a sampled dataset from the input data
def sampling(df, samplingFactor):
    sampledData = df.sample(frac = samplingFactor)
    return sampledData

In [184]:
# Generating Candidate 1
def C1(data):
    candidates = {} # empty dictionary to store each item as key and its count as value
    for i in data.index:
        for j in data.loc[i]:
            if (j in candidates): 
                candidates[j] += 1
            else: 
                candidates[j] = 1
    print("\nC1: \n", candidates)
    return candidates

In [185]:
# Generating L1
def L1(c, support):
    l1 = {} # dictionary to store all the items having support more than or equal to minimum support
    l1 = dict((k,v) for k, v in c.items() if v >= support)
    print("\n\nL1: \n", l1)
    return l1

In [186]:
# Function to check if candidate itemset contains infrequent subset
def has_infrequent_subset(candidate, freq, prevL):
    for i in list(combinations(candidate,freq-1)):
        if i not in prevL:
            return True
        return False

In [187]:
# Generating Candidate k
def Ck(k, prevL, df):
    
    # Join
    # For C2:
    if k == 2:
        c = list()
        for key,v in prevL.items():
            if key not in c:
                c.append(key)
            
    # When k is greater than 2, L (k-1) contains list of tuples
    if k > 2:
        c = list()
        for key,v in prevL.items():
            for item in key:
                if item not in c:
                    c.append(item)
                    
    candidates = {} # To store all the candidate items along with its count
    cand = list(combinations(c, k))
    
    # Prune
    for cd in cand:
        if (has_infrequent_subset(cd, k, prevL) == True):
            cand.remove(cd)
            
    for i in cand:
        candidates[i] = 0
    for ind in df.index:
        for i in candidates:
            if set(i).issubset(df.loc[ind]):
                candidates[i] +=1
                
    print("\n\nC", k, ":\n", candidates)
    return candidates

In [188]:
# Generating Lk
def Lk(k, ck, support):
    lk = {}
    # lk contains all the values having support more than minimum support threshold
    lk = dict((key,v) for key, v in ck.items() if v >= support)
    print("\n\nL", k, ":\n", lk)
    return lk

In [189]:
# Function for Apriori Algorithm
def apriori(data, support, samplingFactor):
    
    s = int((support/100)*0.9*len(data))
    df = sampling(data, samplingFactor)
    t1 = time.time()
    
    print("Minimum support = ", s)
    
    lk = {}
    candidateK = {}
    candidateK = C1(df)
    lk = L1(candidateK, s)
    
    k = 2
    while lk != {}:
        candidateK = Ck(k, lk, data)
        lk = Lk(k, candidateK, s)
        k += 1
    t2 = time.time()
    
    exec_time = t2 - t1
    
    print("\nTotal execution time = ", exec_time)

    return

In [190]:
apriori(adult, 50, 0.6)

Minimum support =  13572

C1: 
 {'middle_aged': 7244, ' Private': 13355, ' HS-grad': 5919, ' Married-civ-spouse': 8511, ' Exec-managerial': 2361, ' Wife': 852, ' Black': 1704, ' Female': 5816, 'No_Gain': 16585, 'No_Loss': 17212, 'Reasonable': 16030, ' United-States': 16489, ' <=50K': 13625, ' Bachelors': 3007, ' Husband': 7542, ' White': 15520, ' Male': 12281, ' >50K': 4472, ' 11th': 618, ' Craft-repair': 2450, 'Loss': 885, 'old': 7479, ' 9th': 272, ' Adm-clerical': 2176, 'Gain': 1512, ' Transport-moving': 995, ' State-gov': 798, ' Some-college': 4000, ' Protective-serv': 386, ' Local-gov': 1215, 'young': 3374, ' Never-married': 5781, ' Not-in-family': 4613, ' Self-emp-not-inc': 1506, ' Sales': 2109, ' Own-child': 2661, 'Medium': 1363, ' Masters': 973, ' Prof-specialty': 2427, ' Divorced': 2518, ' Other-service': 1959, ' Asian-Pac-Islander': 539, ' Philippines': 109, ' Machine-op-inspct': 1184, ' Unmarried': 1902, 'High': 633, ' Handlers-cleaners': 816, ' Other-relative': 527, ' Self-e