In [1]:
import numpy as np
import pandas as pd
from time import time
from pcy import PCY
from helper import GetItemsetFromFile, PrintTimeInfo, SaveDataFrameToHTMLFile, SaveDataFrameToHTMLFile, CreateTripleDataFrame, GetSubsectionOfData, CreatePairDataFrame

In [2]:
# Get Retail Data

In [3]:
# Read the retail itemsets from disk and report the wall-clock time of the load.
load_started = time()
retail_data = GetItemsetFromFile("retail.data")
load_seconds = time() - load_started
print("Time taken to read retail data: %.2f seconds" % load_seconds)

Time taken to read retail data: 0.20 seconds


In [4]:
# Experiment grid for the PCY runs in the cells below.

# Support thresholds as fractions; they are multiplied by 100 when building
# the output file names.
support_values = [
    0.01,
    0.02,
    0.05,
]

# Fractions of the retail data handed to GetSubsectionOfData for each run
# (1 means the whole data set).
split_data_values = [
    0.2,
    0.4,
    1,
]

In [5]:
# Perform PCY on pairs of data for all support values and split data values.
# For every (split, support) combination the PCY call is timed, the timing is
# printed, and the resulting pair table is saved to an HTML file.
for data_split in split_data_values:
    # The subsection depends only on data_split, so build it once per split
    # instead of once per support value.
    # NOTE(review): assumes GetSubsectionOfData is deterministic and that PCY
    # does not modify `data` in place - confirm against helper.py / pcy.py.
    data = GetSubsectionOfData(retail_data, data_split)  # Get % of data

    # Whole-number percentage shared by the file name and the summary line so
    # the two always agree (a bare %d would truncate, e.g. 56.99... -> 56).
    split_pct = round(data_split * 100)

    for support in support_values:
        file_name = "pcy-ret-p-" + str(split_pct) + "-" + str(round(support * 100))  # Set file name

        # Only the PCY call itself is inside the timed region.
        s_t = time()    # Start Time
        occ, freq = PCY(data, support, 2)
        f_t = time()    # End Time
        PrintTimeInfo("PCY", (data_split * 100), (f_t - s_t), support, "pairs")

        df = CreatePairDataFrame(occ, freq, len(data))
        SaveDataFrameToHTMLFile(df, file_name)
    print("Done running PCY pairs on %d%% of data" % split_pct + "\n")

Time taken to complete PCY on 20% of data using pairs of retail data: 4.75 seconds with support: 1%
Time taken to complete PCY on 20% of data using pairs of retail data: 3.43 seconds with support: 2%
Time taken to complete PCY on 20% of data using pairs of retail data: 2.99 seconds with support: 5%
Done running PCY pairs on 20% of data

Time taken to complete PCY on 40% of data using pairs of retail data: 9.25 seconds with support: 1%
Time taken to complete PCY on 40% of data using pairs of retail data: 6.14 seconds with support: 2%
Time taken to complete PCY on 40% of data using pairs of retail data: 5.41 seconds with support: 5%
Done running PCY pairs on 40% of data

Time taken to complete PCY on 100% of data using pairs of retail data: 24.99 seconds with support: 1%
Time taken to complete PCY on 100% of data using pairs of retail data: 19.27 seconds with support: 2%
Time taken to complete PCY on 100% of data using pairs of retail data: 14.11 seconds with support: 5%
Done running PCY

In [6]:
# Perform PCY on triples of data for all support values and split data values.
# For every (split, support) combination the PCY call is timed, the timing is
# printed, and the resulting triple table is saved to an HTML file.
for data_split in split_data_values:
    # The subsection depends only on data_split, so build it once per split
    # instead of once per support value.
    # NOTE(review): assumes GetSubsectionOfData is deterministic and that PCY
    # does not modify `data` in place - confirm against helper.py / pcy.py.
    data = GetSubsectionOfData(retail_data, data_split)  # Get % of data

    # Whole-number percentage shared by the file name and the summary line so
    # the two always agree (a bare %d would truncate, e.g. 56.99... -> 56).
    split_pct = round(data_split * 100)

    for support in support_values:
        file_name = "pcy-ret-t-" + str(split_pct) + "-" + str(round(support * 100))  # Set file name

        # Only the PCY call itself is inside the timed region.
        s_t = time()    # Start Time
        freq = PCY(data, support, 3)
        f_t = time()    # End Time
        PrintTimeInfo("PCY", (data_split * 100), (f_t - s_t), support, "triples")

        df = CreateTripleDataFrame(freq)
        SaveDataFrameToHTMLFile(df, file_name)
    print("Done running PCY triples on %d%% of data" % split_pct + "\n")

Time taken to complete PCY on 20% of data using triples of retail data: 4.52 seconds with support: 1%
Time taken to complete PCY on 20% of data using triples of retail data: 3.15 seconds with support: 2%
Time taken to complete PCY on 20% of data using triples of retail data: 2.64 seconds with support: 5%
Done running PCY triples on 20% of data

Time taken to complete PCY on 40% of data using triples of retail data: 8.98 seconds with support: 1%
Time taken to complete PCY on 40% of data using triples of retail data: 6.40 seconds with support: 2%
Time taken to complete PCY on 40% of data using triples of retail data: 5.40 seconds with support: 5%
Done running PCY triples on 40% of data

Time taken to complete PCY on 100% of data using triples of retail data: 22.80 seconds with support: 1%
Time taken to complete PCY on 100% of data using triples of retail data: 16.92 seconds with support: 2%
Time taken to complete PCY on 100% of data using triples of retail data: 14.46 seconds with suppor