In [1]:
from apriori import Apriori
from helper import GetItemsetFromFile, SaveDataToFile, CreateSupportList, SaveDataFrameToHTMLFile, GetSubsectionOfData
from time import time
import pandas as pd
import numpy as np

In [2]:
# Load the retail dataset and report how long the read took (wall clock).
load_started = time()
retail_data = GetItemsetFromFile("retail.data")
load_elapsed = time() - load_started
print("Time taken to read retail data: %.2f seconds" % load_elapsed)

Time taken to read retail data: 0.21 seconds


In [23]:
# Support value handed to every Apriori(...) call in the cells below, expressed
# as a fraction (0.01). Presumably the minimum-support threshold — confirm
# against apriori.py.
support = 0.01

In [24]:
### 20% OF RETAIL DATA
# Subset of retail_data used for the 20% benchmark runs. How the helper picks
# the subset (leading slice vs. random sample) is not visible here — TODO
# confirm in helper.py, since a random sample would make timings vary per run.
retail_20 = GetSubsectionOfData(retail_data, .2)

20% of 71064 is 14213


In [25]:
# Time one Apriori run on the 20% subset; the third argument (2) requests
# pairs, and the call returns the occurrence counts alongside the pairs.
run_started = time()
pair_occ, pair_freq = Apriori(retail_20, support, 2)
run_elapsed = time() - run_started
print("Time taken to perform Apriori pairs on 20%% of retail data: %.2f seconds, with support %.2f" % (run_elapsed, support))

Reading line 0 of 14213
Reading line 0 of 14213 in pass two
Time taken to perform Apriori pairs on 20% of retail data: 2.03 seconds, with support 0.01


In [26]:
# Assemble the frequent-pair table for the 20% subset and export it as HTML.
# zip keeps each pair aligned with its occurrence count (and stops at the
# shorter of the two sequences).
pair_rows = list(zip(pair_freq, pair_occ))
# The columns are built from plain lists instead of np.asarray(pair_rows):
# the rows are ragged ((item, item), count), which recent NumPy releases
# refuse to convert without dtype=object, and an empty result cannot be
# sliced with [:, 0].
pair_df = pd.DataFrame({
    'Item 1': [pair[0] for pair, _count in pair_rows],
    'Item 2': [pair[1] for pair, _count in pair_rows],
    # Support is relative to the data Apriori actually scanned (retail_20),
    # not to the full retail_data set.
    'Support': np.array([count for _pair, count in pair_rows], dtype=float) / len(retail_20),
})
# NOTE(review): the "-5" suffix looks like a leftover from a 5% support run;
# support is 0.01 here and the triple exports use "-1" — confirm the name.
SaveDataFrameToHTMLFile(pair_df, "apri-ret-p-20-5")

In [27]:
### 40% OF RETAIL DATA
# Subset of retail_data used for the 40% benchmark runs. How the helper picks
# the subset is not visible here — TODO confirm in helper.py.
retail_40 = GetSubsectionOfData(retail_data, .4)

40% of 71064 is 28426


In [28]:
# Time one Apriori run on the 40% subset; the third argument (2) requests
# pairs, and the call returns the occurrence counts alongside the pairs.
run_started = time()
pair_occ, pair_freq = Apriori(retail_40, support, 2)
run_elapsed = time() - run_started
print("Time taken to perform Apriori pairs on 40%% of retail data: %.2f seconds, with support %.2f" % (run_elapsed, support))

Reading line 0 of 28426
Reading line 0 of 28426 in pass two
Time taken to perform Apriori pairs on 40% of retail data: 4.72 seconds, with support 0.01


In [29]:
# Assemble the frequent-pair table for the 40% subset and export it as HTML.
# zip keeps each pair aligned with its occurrence count (and stops at the
# shorter of the two sequences).
pair_rows = list(zip(pair_freq, pair_occ))
# The columns are built from plain lists instead of np.asarray(pair_rows):
# the rows are ragged ((item, item), count), which recent NumPy releases
# refuse to convert without dtype=object, and an empty result cannot be
# sliced with [:, 0].
pair_df = pd.DataFrame({
    'Item 1': [pair[0] for pair, _count in pair_rows],
    'Item 2': [pair[1] for pair, _count in pair_rows],
    # Support is relative to the data Apriori actually scanned (retail_40),
    # not to the full retail_data set.
    'Support': np.array([count for _pair, count in pair_rows], dtype=float) / len(retail_40),
})
# NOTE(review): the "-5" suffix looks like a leftover from a 5% support run;
# support is 0.01 here and the triple exports use "-1" — confirm the name.
SaveDataFrameToHTMLFile(pair_df, "apri-ret-p-40-5")

In [30]:
### 100% OF DATA

In [31]:
# Time one Apriori run on the complete retail dataset; the third argument (2)
# requests pairs, and the call returns the occurrence counts with the pairs.
run_started = time()
pair_occ, pair_freq = Apriori(retail_data, support, 2)
run_elapsed = time() - run_started
print("Time taken to perform Apriori pairs on 100%% of retail data: %.2f seconds, with support %.2f" % (run_elapsed, support))

Reading line 0 of 71064
Reading line 50000 of 71064
Reading line 0 of 71064 in pass two
Reading line 50000 of 71064 in pass two
Time taken to perform Apriori pairs on 100% of retail data: 9.86 seconds, with support 0.01


In [32]:
# Assemble the frequent-pair table for the full dataset and export it as HTML.
# zip keeps each pair aligned with its occurrence count (and stops at the
# shorter of the two sequences).
pair_rows = list(zip(pair_freq, pair_occ))
# The columns are built from plain lists instead of np.asarray(pair_rows):
# the rows are ragged ((item, item), count), which recent NumPy releases
# refuse to convert without dtype=object, and an empty result cannot be
# sliced with [:, 0]. Support also becomes a float column, not object dtype.
pair_df = pd.DataFrame({
    'Item 1': [pair[0] for pair, _count in pair_rows],
    'Item 2': [pair[1] for pair, _count in pair_rows],
    # This run scanned all of retail_data, so that is the denominator.
    'Support': np.array([count for _pair, count in pair_rows], dtype=float) / len(retail_data),
})
# NOTE(review): the "-5" suffix looks like a leftover from a 5% support run;
# support is 0.01 here and the triple exports use "-1" — confirm the name.
SaveDataFrameToHTMLFile(pair_df, "apri-ret-p-100-5")

In [79]:
### APRIORI TRIPLES
# Support value used by the triple runs below; same 0.01 as the pair runs, set
# again here so this section can be run without the earlier assignment.
support = 0.01

In [80]:
# Time one Apriori run on the 20% subset with the third argument set to 3
# (triples). Unlike the pair runs, the result is kept as a single value.
run_started = time()
triple_freq_20 = Apriori(retail_20, support, 3)
run_elapsed = time() - run_started
print("Time taken to perform Apriori triples on 20%% of retail data: %.2f seconds, with support %.2f" % (run_elapsed, support))

Reading line 0 of 14213
Reading line 0 of 14213 in pass two
Time taken to perform Apriori triples on 20% of retail data: 2.01 seconds, with support 0.01


In [81]:
# Turn the frequent triples from the 20% run into a three-column table and
# export it as HTML.
triple_array = np.asarray(list(map(np.array, triple_freq_20)))
triple_df = pd.DataFrame(
    {
        'Item 1': triple_array[:, 0],
        'Item 2': triple_array[:, 1],
        'Item 3': triple_array[:, -1],  # last element of each triple
    }
)
SaveDataFrameToHTMLFile(triple_df, "apri-ret-t-20-1")

In [82]:
## 40% OF RETAIL DATA WITH TRIPLES

In [83]:
# Time one Apriori run on the 40% subset with the third argument set to 3
# (triples). Unlike the pair runs, the result is kept as a single value.
run_started = time()
triple_freq_40 = Apriori(retail_40, support, 3)
run_elapsed = time() - run_started
print("Time taken to perform Apriori triples on 40%% of retail data: %.2f seconds, with support %.2f" % (run_elapsed, support))

Reading line 0 of 28426
Reading line 0 of 28426 in pass two
Time taken to perform Apriori triples on 40% of retail data: 4.54 seconds, with support 0.01


In [84]:
# Turn the frequent triples from the 40% run into a three-column table and
# export it as HTML.
triple_array = np.asarray(list(map(np.array, triple_freq_40)))
triple_df = pd.DataFrame(
    {
        'Item 1': triple_array[:, 0],
        'Item 2': triple_array[:, 1],
        'Item 3': triple_array[:, -1],  # last element of each triple
    }
)
SaveDataFrameToHTMLFile(triple_df, "apri-ret-t-40-1")

In [85]:
### 100% OF RETAIL DATA WITH TRIPLES

In [86]:
# Time one Apriori run on the complete retail dataset with the third argument
# set to 3 (triples). The result is kept as a single value.
run_started = time()
triple_freq_100 = Apriori(retail_data, support, 3)
run_elapsed = time() - run_started
print("Time taken to perform Apriori triples on 100%% of retail data: %.2f seconds, with support %.2f" % (run_elapsed, support))

Reading line 0 of 71064
Reading line 50000 of 71064
Reading line 0 of 71064 in pass two
Reading line 50000 of 71064 in pass two
Time taken to perform Apriori triples on 100% of retail data: 9.60 seconds, with support 0.01


In [87]:
# Turn the frequent triples from the full-data run into a three-column table
# and export it as HTML.
triple_array = np.asarray(list(map(np.array, triple_freq_100)))
triple_df = pd.DataFrame(
    {
        'Item 1': triple_array[:, 0],
        'Item 2': triple_array[:, 1],
        'Item 3': triple_array[:, -1],  # last element of each triple
    }
)
SaveDataFrameToHTMLFile(triple_df, "apri-ret-t-100-1")

In [None]:
### NETFLIX STUFF ###

In [2]:
# Load the netflix dataset and report how long the read took (wall clock).
load_started = time()
netflix_data = GetItemsetFromFile("netflix.data")
load_elapsed = time() - load_started
print("Time taken to read netflix data: %.2f seconds" % load_elapsed)

Time taken to read netflix data: 14.03 seconds


In [3]:
### 40% OF NETFLIX DATA
# Subset of netflix_data used for the 40% benchmark run below. How the helper
# picks the subset is not visible here — TODO confirm in helper.py.
netflix_40 = GetSubsectionOfData(netflix_data, .4)

40% of 480188 is 192075


In [4]:
# Time one Apriori pair run (third argument 2) on the 40% netflix subset.
# Here the support is reported as a percentage, hence the * 100.
support = 0.01
run_started = time()
occ_1, freq_1 = Apriori(netflix_40, support, 2)
run_elapsed = time() - run_started
print("Time taken to perform Apriori pairs on 40%% of netflix data: %.2f seconds, with support %.2f%%" % (run_elapsed, support * 100))

Reading line 0 of 192075
Reading line 50000 of 192075
Reading line 100000 of 192075
Reading line 150000 of 192075
Reading line 0 of 192075 in pass two
Reading line 1 of 192075 in pass two
Reading line 2 of 192075 in pass two
Reading line 3 of 192075 in pass two
Reading line 4 of 192075 in pass two
Reading line 5 of 192075 in pass two
Reading line 6 of 192075 in pass two
Reading line 7 of 192075 in pass two
Reading line 8 of 192075 in pass two
Reading line 9 of 192075 in pass two
Reading line 10 of 192075 in pass two
Reading line 11 of 192075 in pass two
Reading line 12 of 192075 in pass two


KeyboardInterrupt: 

In [None]:
# Assemble the frequent-pair table for the 40% netflix run.
# Each row is (pair, occurrence count); zip keeps the two Apriori outputs
# aligned. Kept as a plain list because the rows are ragged, which recent
# NumPy releases refuse to convert to an array without dtype=object.
netflix_returned = list(zip(freq_1, occ_1))
# The columns are read from the netflix rows themselves. The earlier version
# sliced the leftover retail `data` array and divided by len(retail_data),
# which reports retail results (or fails on a fresh kernel) instead of the
# netflix ones.
df_netflix = pd.DataFrame({
    'Item 1': [pair[0] for pair, _count in netflix_returned],
    'Item 2': [pair[1] for pair, _count in netflix_returned],
    # Support is relative to the data Apriori actually scanned (netflix_40).
    'Support': np.array([count for _pair, count in netflix_returned], dtype=float) / len(netflix_40),
})