In [1]:
import numpy as np
import pandas as pd
from apriori import Apriori
from time import time
from helper import GetItemsetFromFile, GetSubsectionOfData, PrintTimeInfo, CreatePairDataFrame

In [2]:
# Get the retail data set
s_t = time()
retail_data = GetItemsetFromFile("retail.data")
f_t = time()
print("Time taken to read retail data: %.2f seconds" % (f_t - s_t))

Time taken to read retail data: 0.19 seconds


In [3]:
support_thresholds = [0.01, 0.02, 0.05]
data_sizes = [0.25, 0.33, 0.65]

In [4]:
# Run Apriori pairs with Random Sampling on 3 support threshholds and 3 pre-determined data sizes
random_item_pairings = []
for data_size in data_sizes:
    # Loop through each support to test on data
    for support in support_thresholds:
        data = GetSubsectionOfData(retail_data, data_size) # Get % of data
        new_support = support * data_size # reduce the support threshold relative to reduction in data

        s_t = time()    # Starting time
        occ, freq = Apriori(data, new_support, 2)
        f_t = time()    # Finishing time

        PrintTimeInfo("Random Sampled Apriori", (data_size * 100), (f_t - s_t), new_support, "pairs")
        random_item_pairings.append(freq)
        

Time taken to complete Random Sampled Apriori on 25% of data using pairs of retail data: 21.62 seconds with support: 0%
Time taken to complete Random Sampled Apriori on 25% of data using pairs of retail data: 7.65 seconds with support: 0%
Time taken to complete Random Sampled Apriori on 25% of data using pairs of retail data: 2.72 seconds with support: 1%
Time taken to complete Random Sampled Apriori on 33% of data using pairs of retail data: 17.31 seconds with support: 0%
Time taken to complete Random Sampled Apriori on 33% of data using pairs of retail data: 7.70 seconds with support: 0%
Time taken to complete Random Sampled Apriori on 33% of data using pairs of retail data: 2.56 seconds with support: 1%
Time taken to complete Random Sampled Apriori on 65% of data using pairs of retail data: 13.53 seconds with support: 0%
Time taken to complete Random Sampled Apriori on 65% of data using pairs of retail data: 5.49 seconds with support: 1%
Time taken to complete Random Sampled Apriori

In [5]:
# Run Apriori normally on 100% of data for all 3 support thresholds
apriori_item_pairings = []
for support in support_thresholds:
    s_t = time()
    occ, freq = Apriori(data, new_support, 2)
    f_t = time()
    apriori_item_pairings.append(freq)
    PrintTimeInfo("Apriori", 100, (f_t - s_t), support, "pairs")

Time taken to complete Apriori on 100% of data using pairs of retail data: 3.26 seconds with support: 1%
Time taken to complete Apriori on 100% of data using pairs of retail data: 3.55 seconds with support: 2%
Time taken to complete Apriori on 100% of data using pairs of retail data: 3.46 seconds with support: 5%


In [29]:
# Count the number of pairs that are in the random sampling but not in the actual data
false_positives_list = np.zeros((9,1))
for i in range(len(random_item_pairings)):
    for j in range(len(apriori_item_pairings)):
        for pairing in random_item_pairings[i]:
            if pairing not in apriori_item_pairings[j]:
                false_positives_list[i] += 1

In [59]:
i = 0
for data_size in data_sizes:
    for support in support_thresholds:
        print("False positives for original support threshold %d%% is %d with %d%% of data" % ((support * 100), false_positives_list[i], (data_size * 100)))
        i += 1

False positives for original support threshold 1% is 2556 with 25% of data
False positives for original support threshold 2% is 792 with 25% of data
False positives for original support threshold 5% is 129 with 25% of data
False positives for original support threshold 1% is 1614 with 33% of data
False positives for original support threshold 2% is 462 with 33% of data
False positives for original support threshold 5% is 72 with 33% of data
False positives for original support threshold 1% is 393 with 65% of data
False positives for original support threshold 2% is 90 with 65% of data
False positives for original support threshold 5% is 0 with 65% of data
