## Apriori Frequent Pattern Mining

In [27]:
# Importing required libraries
import pandas as pd
import numpy as np
from itertools import combinations
import time

In [28]:
# Reading data file
adult = pd.read_excel('adult.xlsx')
adult.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Edunum,Marital,Occupation,Relationship,Race,Sex,Gain,Loss,Hoursperweek,Country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [29]:
# Removing irrelevant columns
adult.drop(['fnlwgt', 'Edunum'], axis = 1, inplace = True)
adult.head()

Unnamed: 0,Age,Workclass,Education,Marital,Occupation,Relationship,Race,Sex,Gain,Loss,Hoursperweek,Country,Income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [30]:
# Size of original data
adult.shape

(32561, 13)

In [31]:
# Removing missing values from data
adult.replace(' ?', np.NaN, inplace = True)
adult.dropna(axis = 0, inplace = True)

# Size of data after removing missing values
adult.shape

(30162, 13)

In [32]:
# Converting numeric data to categorical data
adult['Age'] = pd.cut(adult['Age'], [0, 25, 40, np.inf], labels=["young", "middle_aged", "old"])
adult['Gain'] = pd.cut(adult['Gain'], [-1, 1, np.inf], labels=["No_Gain", "Gain"])
adult['Loss'] = pd.cut(adult['Loss'], [-1, 1, np.inf], labels=["No_Loss", "Loss"])
adult['Hoursperweek'] = pd.cut(adult['Hoursperweek'], [-1, 5, 20, 60, np.inf], labels=["Less", "Medium", "Reasonable", "High"])
adult.head()

Unnamed: 0,Age,Workclass,Education,Marital,Occupation,Relationship,Race,Sex,Gain,Loss,Hoursperweek,Country,Income
0,middle_aged,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,Gain,No_Loss,Reasonable,United-States,<=50K
1,old,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,No_Gain,No_Loss,Medium,United-States,<=50K
2,middle_aged,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,No_Gain,No_Loss,Reasonable,United-States,<=50K
3,old,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,No_Gain,No_Loss,Reasonable,United-States,<=50K
4,middle_aged,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,No_Gain,No_Loss,Reasonable,Cuba,<=50K


In [33]:
# Generating Candidate 1
def C1(data):
    candidates = {} # empty dictionary to store each item as key and its count as value
    for i in data.index:
        for j in data.loc[i]:
            if (j in candidates): 
                candidates[j] += 1
            else: 
                candidates[j] = 1
    print("\nC1: \n", candidates)
    return candidates

In [34]:
# Generating L1
def L1(c, support):
    l1 = {} # dictionary to store all the items having support more than or equal to minimum support
    l1 = dict((k,v) for k, v in c.items() if v >= support)
    print("\n\nL1: \n", l1)
    return l1

In [35]:
# Function to check if candidate itemset contains infrequent subset
def has_infrequent_subset(candidate, freq, prevL):
    for i in list(combinations(candidate,freq-1)):
        if i not in prevL:
            return True
        return False

In [36]:
# Generating Candidate k
def Ck(k, prevL, df):
    
    # Join
    # For C2:
    if k == 2:
        c = list()
        for key,v in prevL.items():
            if key not in c:
                c.append(key)
            
    # When k is greater than 2, L (k-1) contains list of tuples
    if k > 2:
        c = list()
        for key,v in prevL.items():
            for item in key:
                if item not in c:
                    c.append(item)
                    
    candidates = {} # To store all the candidate items along with its count
    cand = list(combinations(c, k))
    
    # Prune
    for cd in cand:
        if (has_infrequent_subset(cd, k, prevL) == True):
            cand.remove(cd)
            
    for i in cand:
        candidates[i] = 0
    for ind in df.index:
        for i in candidates:
            if set(i).issubset(df.loc[ind]):
                candidates[i] +=1
                
    print("\n\nC", k, ":\n", candidates)
    return candidates

In [37]:
# Generating Lk
def Lk(k, ck, support):
    lk = {}
    # lk contains all the values having support more than minimum support threshold
    lk = dict((key,v) for key, v in ck.items() if v >= support)
    print("\n\nL", k, ":\n", lk)
    return lk

In [38]:
# Function for Apriori Algorithm
def apriori(data, support):
    
    t1 = time.time()
    s = int((support/100)*len(data))
    print("Minimum support = ", s)
    
    lk = {}
    candidateK = {}
    candidateK = C1(data)
    lk = L1(candidateK, s)
    
    k = 2
    while lk != {}:
        candidateK = Ck(k, lk, data)
        lk = Lk(k, candidateK, s)
        k += 1
    t2 = time.time()
    
    exec_time = t2 - t1
    
    print("\nTotal execution time = ", exec_time)

    return

In [41]:
apriori(adult, 30)

Minimum support =  12064

C1: 
 {'middle_aged': 12092, ' State-gov': 1279, ' Bachelors': 5044, ' Never-married': 9726, ' Adm-clerical': 3721, ' Not-in-family': 7726, ' White': 25933, ' Male': 20380, 'Gain': 2538, 'No_Loss': 28735, 'Reasonable': 26722, ' United-States': 27504, ' <=50K': 22654, 'old': 12402, ' Self-emp-not-inc': 2499, ' Married-civ-spouse': 14065, ' Exec-managerial': 3992, ' Husband': 12463, 'No_Gain': 27624, 'Medium': 2277, ' Private': 22286, ' HS-grad': 9840, ' Divorced': 4214, ' Handlers-cleaners': 1350, ' 11th': 1048, ' Black': 2817, ' Prof-specialty': 4038, ' Wife': 1406, ' Female': 9782, ' Cuba': 92, ' Masters': 1627, ' 9th': 455, ' Married-spouse-absent': 370, ' Other-service': 3212, ' Jamaica': 80, ' >50K': 7508, ' Some-college': 6678, 'High': 1052, ' Asian-Pac-Islander': 895, ' India': 100, 'young': 5668, ' Own-child': 4466, ' Assoc-acdm': 1008, ' Sales': 3584, ' 7th-8th': 557, ' Transport-moving': 1572, ' Amer-Indian-Eskimo': 286, ' Mexico': 610, ' Farming-fish



C 4 :
 {(' White', ' Male', 'Reasonable', ' <=50K'): 10786, (' White', ' Male', 'Reasonable', 'No_Gain'): 14612, (' White', ' Male', 'Reasonable', 'No_Loss'): 15352, (' White', ' Male', 'Reasonable', ' United-States'): 15155, (' White', ' Male', 'Reasonable', ' Private'): 11827, (' White', ' Male', ' <=50K', 'No_Gain'): 11598, (' White', ' Male', ' <=50K', 'No_Loss'): 11769, (' White', ' Male', ' <=50K', ' United-States'): 11229, (' White', ' Male', ' <=50K', ' Private'): 9092, (' White', ' Male', 'No_Gain', 'No_Loss'): 15253, (' White', ' Male', 'No_Gain', ' United-States'): 15140, (' White', ' Male', 'No_Gain', ' Private'): 11730, (' White', ' Male', 'No_Loss', ' United-States'): 15898, (' White', ' Male', 'No_Loss', ' Private'): 12270, (' White', ' Male', ' United-States', ' Private'): 11956, (' White', 'Reasonable', ' <=50K', 'No_Gain'): 15873, (' White', 'Reasonable', ' <=50K', 'No_Loss'): 16056, (' White', 'Reasonable', ' <=50K', ' United-States'): 15308, (' White', 'Reasonable



C 6 :
 {(' White', ' Male', 'Reasonable', 'No_Gain', 'No_Loss', ' United-States'): 12733, (' White', ' Male', 'Reasonable', 'No_Gain', 'No_Loss', ' Private'): 10115, (' White', ' Male', 'Reasonable', 'No_Gain', 'No_Loss', ' <=50K'): 9923, (' White', ' Male', 'Reasonable', 'No_Gain', ' United-States', ' Private'): 9877, (' White', ' Male', 'Reasonable', 'No_Gain', ' United-States', ' <=50K'): 9454, (' White', ' Male', 'Reasonable', 'No_Loss', ' United-States', ' Private'): 10347, (' White', ' Male', 'Reasonable', 'No_Loss', ' United-States', ' <=50K'): 9578, (' White', ' Male', 'Reasonable', ' United-States', ' Private', ' <=50K'): 7452, (' White', ' Male', 'No_Gain', 'No_Loss', ' United-States', ' Private'): 10193, (' White', ' Male', 'No_Gain', 'No_Loss', ' United-States', ' <=50K'): 10314, (' White', ' Male', 'No_Gain', ' United-States', ' Private', ' <=50K'): 7898, (' White', 'Reasonable', 'No_Gain', 'No_Loss', ' United-States', ' Private'): 13729, (' White', 'Reasonable', 'No_Gai