In [2]:
import numpy as np
import pandas as pd
import math
from apriori import Apriori
from time import time
from helper import GetItemsetFromFile, GetSubsectionOfData, SplitDataIntoChunks, PrintTimeInfo, CreatePairDataFrame

In [3]:
# Load the retail data set from "retail.data" and report how long the read took.
# retail_data is later passed to len() and sliced (see SON), so it is expected
# to be a sliceable sequence — exact element type is defined by helper.GetItemsetFromFile.
s_t = time()  # wall-clock timestamp before the read
retail_data = GetItemsetFromFile("retail.data")
f_t = time()  # wall-clock timestamp after the read
print("Time taken to read retail data: %.2f seconds" % (f_t - s_t))

Time taken to read retail data: 0.17 seconds


In [4]:
support = 0.05  # base support threshold (5%), passed to both SON and the full-data Apriori run

In [10]:
 def SON(data_list, data_percentage, support):
     """Run Apriori on consecutive chunks of the data (the chunk-mining step of SON).

     The data is cut into consecutive chunks that each hold
     ``floor(len(data_list) * data_percentage)`` rows; the final chunk also
     absorbs any rows left over so that the whole data set is covered.

     Args:
         data_list: Sliceable sequence of rows; only ``len()`` and slicing are
             applied to it here.
         data_percentage: Fraction of the data per chunk, in ``(0, 1]``
             (for example ``0.2`` produces five chunks).
         support: Base support threshold. Each chunk is mined with
             ``support / number_of_chunks``.

     Returns:
         list: The second value returned by ``Apriori`` for every chunk, in
         chunk order.

     Raises:
         ValueError: If ``data_percentage`` is not in ``(0, 1]``.
     """
     if not 0 < data_percentage <= 1:
         raise ValueError("data_percentage must be in (0, 1], got %r" % (data_percentage,))

     total_rows = len(data_list)
     chunk_size = math.floor(total_rows * data_percentage)
     # Round before truncating so float noise in 1 / data_percentage
     # (e.g. 4.999999999) cannot cost a whole chunk.
     num_chunks = int(round(1 / data_percentage, 6))
     list_of_frequent_item_lists = []

     for chunk_index in range(num_chunks):
         start = chunk_index * chunk_size
         if chunk_index == num_chunks - 1:
             # The last chunk runs to the end of the data. With 88162 rows and
             # 20% chunks, 5 * 17632 = 88160, so this picks up the 2 rows that
             # equal-sized chunks alone would have missed.
             stop = total_rows
         else:
             stop = start + chunk_size

         # presumably the 1 asks the helper for 100% of the slice —
         # confirm against helper.GetSubsectionOfData
         partial_data = GetSubsectionOfData(data_list[start:stop], 1)

         # Support is divided by the number of chunks.
         # NOTE(review): if Apriori treats its threshold as a fraction of the
         # rows it is given, this division lowers the per-chunk threshold
         # below the base support — confirm against apriori.Apriori.
         _, freq = Apriori(partial_data, support / num_chunks, 2)
         list_of_frequent_item_lists.append(freq)
     return list_of_frequent_item_lists

In [11]:
# Run SON over the retail data in 20% chunks (five Apriori runs) with the 5% base support.
# The result holds one frequent-item list per chunk, in chunk order.
freq_lists_20_percent = SON(retail_data, 0.2, support)

In [15]:
# Run SON over the retail data in 50% chunks (two Apriori runs) with the 5% base support.
# The result holds one frequent-item list per chunk, in chunk order.
freq_lists_50_percent = SON(retail_data, 0.5, support)

In [7]:
# Run Apriori at 5% support on 100% of the data.
# `freq` is the full-data reference result that the per-chunk SON results are compared against below.
occ, freq = Apriori(retail_data, support, 2)

In [12]:
# Collect the false positives for the 20% chunk size: every item reported as
# frequent in some chunk that is absent from the full-data result `freq`.
# NOTE(review): an item reported by several chunks is added once per chunk,
# so the length of this list counts such repeats.
false_positives_20 = []
for frequency_list in freq_lists_20_percent:
    false_positives_20 += [item for item in frequency_list if item not in freq]

In [14]:
# Report the number of false positives found for the 20% chunk run.
# Uses false_positives_20, the list built by the false-positive cell in this notebook
# (the bare name `false_positives` is not defined anywhere in it).
print("False positives for SON using %d%% support threshold spliting the data into chunks of %d%% is %d" % ( 5, 20, len(false_positives_20) ))

False positives for SON using 5% support threshold spliting the data into chunks of 20% is 358
