
# READING DATASET

In [4]:
import pandas as pd
from google.colab import drive 
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
#reading dataset from url
dataset = pd.read_csv('/content/drive/MyDrive/dataset_apriori.csv')

In [6]:
dataset

Unnamed: 0,tid,items
0,1,"Handphone,Laptop"
1,2,"Handphone,Charger,Laptop"
2,3,"Powerbank,Laptop,Charger,Handphone"
3,4,"Tablet,Laptop,Handphone"
4,5,"Handphone,Charger,Tablet"
5,6,"Tablet,Powerbank"
6,7,"Handphone,Laptop,Tablet,Charger"
7,8,"Charger,Handphone"
8,9,"Handphone,Powerbank"
9,10,"Laptop,Charger,Powerbank"


# SPLITTING DATASET

In [7]:
#splitting tid and items columns of df into two different dataframe
df_items = dataset['items']
df_tid = dataset['tid']

In [8]:
df_items

0                      Handphone,Laptop
1              Handphone,Charger,Laptop
2    Powerbank,Laptop,Charger,Handphone
3               Tablet,Laptop,Handphone
4              Handphone,Charger,Tablet
5                      Tablet,Powerbank
6       Handphone,Laptop,Tablet,Charger
7                     Charger,Handphone
8                   Handphone,Powerbank
9              Laptop,Charger,Powerbank
Name: items, dtype: object

In [9]:
df_tid

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
Name: tid, dtype: int64

# Convert Items to Number

In [10]:
#assigning unique index for different unique items
dictionaries = {'Handphone': 1, 'Laptop': 2, 'Charger': 3, 'Powerbank': 4, 'Tablet': 5 }

In [11]:
comma_splitted_df = df_items.apply(lambda x: x.split(','))

In [12]:
comma_splitted_df

0                        [Handphone, Laptop]
1               [Handphone, Charger, Laptop]
2    [Powerbank, Laptop, Charger, Handphone]
3                [Tablet, Laptop, Handphone]
4               [Handphone, Charger, Tablet]
5                        [Tablet, Powerbank]
6       [Handphone, Laptop, Tablet, Charger]
7                       [Charger, Handphone]
8                     [Handphone, Powerbank]
9               [Laptop, Charger, Powerbank]
Name: items, dtype: object

In [13]:
numbered_col = []
for i in range(len(comma_splitted_df)):
    list_numbered = list(map(lambda x: dictionaries[x], comma_splitted_df[i]))
    sort_numbered = sorted(list_numbered)
    numbered_col.append(sort_numbered)

numbered_col

[[1, 2],
 [1, 2, 3],
 [1, 2, 3, 4],
 [1, 2, 5],
 [1, 3, 5],
 [4, 5],
 [1, 2, 3, 5],
 [1, 3],
 [1, 4],
 [2, 3, 4]]

# Create Dataframe from Numbered Items

In [14]:
#creating dataframe of numbered_col
dict_data = {'items': numbered_col}
df = pd.DataFrame.from_dict(dict_data)

In [15]:
df

Unnamed: 0,items
0,"[1, 2]"
1,"[1, 2, 3]"
2,"[1, 2, 3, 4]"
3,"[1, 2, 5]"
4,"[1, 3, 5]"
5,"[4, 5]"
6,"[1, 2, 3, 5]"
7,"[1, 3]"
8,"[1, 4]"
9,"[2, 3, 4]"


In [16]:
pd.concat([df_tid, df], axis=1)

Unnamed: 0,tid,items
0,1,"[1, 2]"
1,2,"[1, 2, 3]"
2,3,"[1, 2, 3, 4]"
3,4,"[1, 2, 5]"
4,5,"[1, 3, 5]"
5,6,"[4, 5]"
6,7,"[1, 2, 3, 5]"
7,8,"[1, 3]"
8,9,"[1, 4]"
9,10,"[2, 3, 4]"


In [17]:
items = []
for i in range(len(df)):
    for j in range(len(df['items'][i])):
        items.append(df['items'][i][j])
items

[1,
 2,
 1,
 2,
 3,
 1,
 2,
 3,
 4,
 1,
 2,
 5,
 1,
 3,
 5,
 4,
 5,
 1,
 2,
 3,
 5,
 1,
 3,
 1,
 4,
 2,
 3,
 4]

# Creating First Candidate (C1)

In [18]:
#Get unique element from list/array
unique_item = set(items)
unique_item

{1, 2, 3, 4, 5}

In [19]:
#Convert it to list
list_unique_item = list(unique_item)
list_unique_item

[1, 2, 3, 4, 5]

In [20]:
#counting frequency of every unique items
count_unique = []
for value in (list_unique_item):
    count_unique.append((value, items.count(value)))
count_unique

[(1, 8), (2, 6), (3, 6), (4, 4), (5, 4)]

In [22]:
candidate1_df = pd.DataFrame(count_unique, columns=["itemset", "sup"])

In [23]:
candidate1_df

Unnamed: 0,itemset,sup
0,1,8
1,2,6
2,3,6
3,4,4
4,5,4


# Creating first Frequent Itemset (L1)

In [24]:
#filtering items having minimum support count 6
def filter_sup(candidate):
    minimum_sup = 6
    filtering = candidate['sup'] > minimum_sup
    freq = candidate[filtering]
    return freq

In [25]:
freq_itemset1 = filter_sup(candidate1_df)

In [26]:
freq_itemset1

Unnamed: 0,itemset,sup
0,1,8


# Create the Second Candidate (C2)

**SELF JOIN**

In [27]:
import numpy
def self_join(prev_freq_itemset):
    self_join_candidate = []
    for i in range(len(prev_freq_itemset['itemset'])):
        for j in range((i+1), len(prev_freq_itemset['itemset'])):
            itemset_i = prev_freq_itemset['itemset'][i]
            itemset_j = prev_freq_itemset['itemset'][j]
            if(type(itemset_i) == numpy.int64 and type(itemset_j) == numpy.int64):
                itemset_i = {itemset_i}
                itemset_j = {itemset_j}
            union_candidate = itemset_i.union(itemset_j)

            if union_candidate not in self_join_candidate:
                self_join_candidate.append(union_candidate)
    return self_join_candidate

In [28]:
candidate2_list = self_join(freq_itemset1)

In [29]:
candidate2_list

[]

In [30]:
count_candidate2 = []

#Set the Initial value of Second Count Candidate (C2)
for i in range(len(candidate2_list)):
    count_candidate2.append((candidate2_list[i], 0))

count_candidate2

[]

In [31]:
initial_df_candidate = pd.DataFrame(count_candidate2, columns=['itemset', 'sup'])

In [32]:
initial_df_candidate

Unnamed: 0,itemset,sup


In [33]:
df

Unnamed: 0,items
0,"[1, 2]"
1,"[1, 2, 3]"
2,"[1, 2, 3, 4]"
3,"[1, 2, 5]"
4,"[1, 3, 5]"
5,"[4, 5]"
6,"[1, 2, 3, 5]"
7,"[1, 3]"
8,"[1, 4]"
9,"[2, 3, 4]"


In [34]:
#Let's add it with 1 whenever we found every candidate is a subset from Database D


def count_support(database_dataframe, prev_candidate_list):
    #initial_df_candidate['sup'] = 0 #set All value into 0 only for initial value for consistency value when running this cell everytime.
    count_prev_candidate = []

    #Set the Initial value of Previous Candidate
    for i in range(len(prev_candidate_list)):
        count_prev_candidate.append((prev_candidate_list[i], 0))
    
    df_candidate = pd.DataFrame(count_prev_candidate, columns=['itemset', 'sup'])
    print('Database D dataframe\n', database_dataframe)
    print('(Initial) Dataframe from Candidate with All zeros sup\n', df_candidate)
    
    for i in range(len(database_dataframe)):
        for j in range(len(count_prev_candidate)):
            #using issubset() function to check whether every itemset is a subset of Database or not
            if (df_candidate['itemset'][j]).issubset(set(database_dataframe['items'][i])): 
                df_candidate.loc[j, 'sup'] += 1
            
    return df_candidate

In [35]:
count_candidate2_df = count_support(df, candidate2_list)

Database D dataframe
           items
0        [1, 2]
1     [1, 2, 3]
2  [1, 2, 3, 4]
3     [1, 2, 5]
4     [1, 3, 5]
5        [4, 5]
6  [1, 2, 3, 5]
7        [1, 3]
8        [1, 4]
9     [2, 3, 4]
(Initial) Dataframe from Candidate with All zeros sup
 Empty DataFrame
Columns: [itemset, sup]
Index: []


In [36]:
count_candidate2_df

Unnamed: 0,itemset,sup


# Creating Second Frequent Itemset (L2)

In [38]:
#Filter the itemset based on minimum support (occurences of items)
freq_itemset2 = filter_sup(count_candidate2_df)

In [39]:
freq_itemset2

Unnamed: 0,itemset,sup


In [40]:
freq_itemset2_reset = freq_itemset2.reset_index(drop=True)

In [41]:
#We need to reset the index, because need to access the index later.
freq_itemset2_reset

Unnamed: 0,itemset,sup


# Creating the Third Candidate (C3) - Using the Candidate Forming Technique

**SELF JOIN**

In [42]:
print(freq_itemset2_reset)
self_join_result = self_join(freq_itemset2_reset)
print('self join result')
print(self_join_result)

Empty DataFrame
Columns: [itemset, sup]
Index: []
self join result
[]


**PRUNING**

In [43]:
def get_subset(candidate):
    temp = []
    final = []
    for i in range(len(candidate)):
        for j in range(len(candidate)):
            if i != j:
                temp.append(candidate[j])
        temp_set = set(temp)
        final.append(temp_set)
        temp.clear()
    print('Subset from {} : {}'.format(candidate, final))
    return final

def pruning(candidate_set, prev_freq_itemset):
    print('Candidate set', candidate_set)
    temp = []
    
    for idx, value in enumerate(candidate_set):
        list_candidate = list(value)
        temp_candidate = (get_subset(list_candidate))
        
        for temp_item in temp_candidate:
            print('Temp item', temp_item)
            check = temp_item == prev_freq_itemset['itemset']
            print('\nCheck candidate from Previous Frequent Itemset\n', check)
            
            if any(check) == False:
                print(any(check))
                print('Val', value)
            else:
                print('\nAll of {} subset contained in \n{}'.format(candidate_set, prev_freq_itemset))
                if value not in temp:
                    temp.append(value)
                
    return temp

In [44]:
freq_itemset2_reset

Unnamed: 0,itemset,sup


In [45]:
subset = [{2, 3}, {1, 3}, {1, 2}]

In [46]:
self_join_result

[]

In [47]:
for i in range(len(self_join_result)):
    get_subset(list(self_join_result[i]))

In [48]:
freq_itemset2_reset

Unnamed: 0,itemset,sup


In [49]:
for item in subset:
    print(item)
    check = item == freq_itemset2_reset['itemset']
    print('Check', any(check))

{2, 3}
Check False
{1, 3}
Check False
{1, 2}
Check False


In [50]:
self_join_result

[]

In [51]:
candidate3_list = pruning(self_join_result, freq_itemset2_reset)

Candidate set []


In [52]:
candidate3_list

[]

# Creating the Third Frequent Itemset (L3)

In [53]:
#Let's see the database again
df

Unnamed: 0,items
0,"[1, 2]"
1,"[1, 2, 3]"
2,"[1, 2, 3, 4]"
3,"[1, 2, 5]"
4,"[1, 3, 5]"
5,"[4, 5]"
6,"[1, 2, 3, 5]"
7,"[1, 3]"
8,"[1, 4]"
9,"[2, 3, 4]"


In [54]:
#Then check the newest candidate value
candidate3_list

[]

In [55]:
count_candidate3_df = count_support(df, candidate3_list)

Database D dataframe
           items
0        [1, 2]
1     [1, 2, 3]
2  [1, 2, 3, 4]
3     [1, 2, 5]
4     [1, 3, 5]
5        [4, 5]
6  [1, 2, 3, 5]
7        [1, 3]
8        [1, 4]
9     [2, 3, 4]
(Initial) Dataframe from Candidate with All zeros sup
 Empty DataFrame
Columns: [itemset, sup]
Index: []


In [56]:
count_candidate3_df

Unnamed: 0,itemset,sup


In [57]:
freq_itemset3 = filter_sup(count_candidate3_df)

In [58]:
freq_itemset3

Unnamed: 0,itemset,sup


#All Frequent Itemset

In [59]:
#Let'see each frequent itemset (L)
freq_itemset1

Unnamed: 0,itemset,sup
0,1,8


In [60]:
freq_itemset2

Unnamed: 0,itemset,sup


In [61]:
freq_itemset3

Unnamed: 0,itemset,sup


In [62]:
frequent_itemset = pd.concat([freq_itemset1, freq_itemset2, freq_itemset3], axis=0)

In [63]:
frequent_itemset

Unnamed: 0,itemset,sup
0,1,8


In [64]:
#Reset the index
frequent_itemset_final = frequent_itemset.reset_index(drop=True)

#Final Output of Freq. Itemset (L1-L3)

In [65]:
frequent_itemset_final

Unnamed: 0,itemset,sup
0,1,8
