## Frequent Itemset and Association Rule Mining using Apriori and FPGrowth Algorithms

<div style="text-align: right"> By Smit Doshi (001475186) </div>

#### Importing Required Libraries

In [1]:
import string
from spmf import Spmf
import nltk
import pandas as pd
import numpy as np
import re

#### Getting the data & reading the tweets line by line

In [2]:
raw_data = open('Dataset.txt',mode='r').readlines()
raw_data[:10]

['"There\'s no science behind the term behavioural fatigue." - Professor Robert West, member of the government\'s scientific pandemic influenza group on behaviours Watch @lewis_goodall\'s report into why the UK didn\'t lockdown sooner https://bbc.in/2X9WJ5O #Newsnight\n',
 'Bare tse di woke di tsena mo lockdown....\n',
 'My boss o ntse rude hela mo nna ke ipotsa a nne o akanya gore ke nna Dr Masupu nne? (Lockdown stay home)\n',
 'Thank you for this prize @realmemobiles @MadhavSheth1 I am glad the team had organised a @PUBG tournament in this lockdown And this journey from qualifiers to winning the finals was great fun , and i was overwhelmed on winning these Wish to see these type games again \n',
 'Are we out of lockdown\n',
 '#NowPlaying @originalkoffee - LOCKDOWN @RobboRanx @RobboRanxRadio #Dancehall360\n',
 'Christians Get Around Newsom Church Lockdown: Over 5,000 Show Up on CA Beach To Worship \n',
 "it took a lot of time for me to adjust to this lifestyle and I don't think I can 

#### Cleaning the dataset 
##### Removing all punctuations from the end of the text

In [3]:
results = []

#punc = '!"$%&\'()*+,-.;<=>?[\\]^_`{|}~'

for line in raw_data:
    line = line.lower()
    line = line.replace('\n','')
    line = line.replace('.','')
    line = line.encode('ascii',errors='ignore')
    line = line.decode()
    line = line.replace('  ','')
    
    try:
         while line[-1] not in string.ascii_letters:
                try:
                    line = line.rstrip(" ")
                except:
                    pass
                try:
                    line = line.rstrip(string.punctuation)
                except:
                    pass
                try:
                    line = line.rstrip(string.digits)
                except:
                    pass
    except:
        pass
    results.append(line)

In [4]:
results[:10]

['"there\'s no science behind the term behavioural fatigue" - professor robert west, member of the government\'s scientific pandemic influenza group on behaviours watch @lewis_goodall\'s report into why the uk didn\'t lockdown sooner https://bbcin/2x9wj5o #newsnight',
 'bare tse di woke di tsena mo lockdown',
 'my boss o ntse rude hela mo nna ke ipotsa a nne o akanya gore ke nna dr masupu nne? (lockdown stay home',
 'thank you for this prize @realmemobiles @madhavsheth1 i am glad the team had organised a @pubg tournament in this lockdown and this journey from qualifiers to winning the finals was great fun , and i was overwhelmed on winning these wish to see these type games again',
 'are we out of lockdown',
 '#nowplaying @originalkoffee - lockdown @robboranx @robboranxradio #dancehall',
 'christians get around newsom church lockdown: over 5,000 show up on ca beach to worship',
 "it took a lot of time for me to adjust to this lifestyle and i don't think i can suddenly switch back to wh

#### Building the dictionary of all the unique words used in our dataset

In [5]:
final_dict = {}
m = 0
for i in range(0,len(results)):
    for word in results[i].split(" "):
        if word not in final_dict.keys():
            final_dict[word] = m + 1
            m = m + 1

#### Number of unique words in our dictionary

In [6]:
len(final_dict)

175662

In [7]:
list(final_dict.items())[:20]

[('"there\'s', 1),
 ('no', 2),
 ('science', 3),
 ('behind', 4),
 ('the', 5),
 ('term', 6),
 ('behavioural', 7),
 ('fatigue"', 8),
 ('-', 9),
 ('professor', 10),
 ('robert', 11),
 ('west,', 12),
 ('member', 13),
 ('of', 14),
 ("government's", 15),
 ('scientific', 16),
 ('pandemic', 17),
 ('influenza', 18),
 ('group', 19),
 ('on', 20)]

#### Replacing words with their corresponding number from the dictionary

In [8]:
output_list = []

for i in range(0,len(results)):
    line_list=[]
    for word in results[i].split(" "):
        line_list.append(final_dict[word])
        line_list = list(set(line_list))
        line_list.sort()
    output_list.append(' '.join([str(elem) for elem in line_list]))

In [9]:
output_list[:10]

['1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32',
 '29 33 34 35 36 37 38',
 '38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57',
 '5 20 29 48 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91',
 '14 29 92 93 94',
 '9 29 95 96 97 98 99',
 '20 78 100 101 102 103 104 105 106 107 108 109 110 111 112',
 '14 29 48 60 61 65 74 78 81 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128',
 '29 129 130',
 '5 14 29 48 73 74 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160']

#### Writing it to the text file 

In [10]:
new_data = open(r"new_dataset.txt","w")
for line in output_list:
    new_data.write(line + str('\n'))
new_data.close()

In [11]:
def GetKey(val,dictionary = final_dict):
    for key, value in dictionary.items():
        if val == value:
             return key
    return "key doesn't exist"

#### Running the Apriori Algorithm

In [12]:
apriori = Spmf(algorithm_name='Apriori',arguments=[0.3],input_filename='new_dataset.txt',
               output_filename='apriori_output.txt',spmf_bin_location_dir='/Users/smitdoshi/Downloads/')
apriori.run()

>/Users/smitdoshi/Downloads/spmf.jar
 Candidates count : 28
 The algorithm stopped at size 3
 Frequent itemsets count : 9
 Maximum memory usage : 80.13859558105469 mb
 Total time ~ 503 ms



#### Here is the dataframe that shows the words having Support more than 30 percent

In [13]:
df = pd.read_csv('apriori_output.txt', sep='#',header=None)
sup = df[1].str.split(":", n = 1, expand = True)
df['Support'] =sup[1]
df.rename(columns = {0:'Words'}, inplace = True)
df.drop(1,axis=1,inplace=True)
df.Support = df.Support.astype(int)
df.Words = df.Words.str.rstrip()
df.Words = df.Words.str.replace(" ",",")
for index,word in df.Words.items():
    if ',' in word:
        df.Words[index] = GetKey(int(word.split(',')[0])) + "," + GetKey(int(word.split(',')[1]))
    else:
        df.Words[index] = GetKey(int(word))
df = df.sort_values('Support',ascending=False,ignore_index=True)
df[:5]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Words[index] = GetKey(int(word))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Words[index] = GetKey(int(word.split(',')[0])) + "," + GetKey(int(word.split(',')[1]))


Unnamed: 0,Words,Support
0,lockdown,78386
1,the,48079
2,"the,lockdown",40027
3,to,39495
4,in,33683


#### Running the FPGrowth Algorithm

In [14]:
fpgrowth = Spmf(algorithm_name='FPGrowth_association_rules_with_lift',arguments=[0.4,0,1],
                input_filename='new_dataset.txt',output_filename='fpgrowth_output.txt',spmf_bin_location_dir='/Users/smitdoshi/Downloads/')
fpgrowth.run()

>/Users/smitdoshi/Downloads/spmf.jar
 Transactions count from database : 95488
 Max memory usage: 85.37606811523438 mb 
 Frequent itemsets count : 4
 Total time ~ 576 ms
 Number of association rules generated : 2
 Total time ~ 0 ms



In [15]:
fpgrowth = Spmf(algorithm_name='FPGrowth_association_rules_with_lift',arguments=[0.25,0,1],
                input_filename='new_dataset.txt',output_filename='fpgrowth_output.txt',spmf_bin_location_dir='/Users/smitdoshi/Downloads/')
fpgrowth.run()

>/Users/smitdoshi/Downloads/spmf.jar
 Transactions count from database : 95488
 Max memory usage: 84.15547180175781 mb 
 Frequent itemsets count : 14
 Total time ~ 644 ms
 Number of association rules generated : 12
 Total time ~ 1 ms



#### As we can see here, for 40% support we can only get 2 rules - so, I tried it with 25% support and was able to get more rules, as shown below:

In [16]:
fp = pd.read_csv('fpgrowth_output.txt', sep='#',header=None)
sup = fp[1].str.split(":", n = 1, expand = True)
lift = fp[3].str.split(":", n = 1, expand = True)
fp['Support'] = sup[1]
fp['Lift'] = lift[1]
fp.rename(columns = {0:'Rules'}, inplace = True)
fp.drop(columns=[1,2,3],axis=1,inplace=True)
fp.Rules = fp.Rules.str.strip()
fp.Support = fp.Support.str.strip()
fp.Lift = fp.Lift.str.strip()
fp.Rules = fp.Rules.str.replace(" ==> ","~")
fp.Rules = fp.Rules.str.replace(" ",",")
fp.Lift = fp.Lift.astype(float)
fp.Support = fp.Support.astype(int)
fp.Lift = np.round(fp.Lift,1)
fp = fp.sort_values('Support',ascending=False,ignore_index=True)
fp['Words'] = ''
for index,rule in fp.Rules.iteritems():
    empty = ''
    for number in re.split('~|,',rule):
        empty = empty + "," + GetKey(int(number))
    fp['Words'][index] = empty
fp.Words = fp.Words.str.lstrip(',')
fp = fp[['Rules', 'Words', 'Support', 'Lift']]
fp.Rules = fp.Rules.str.replace("~"," ==> ")
fp[fp['Lift']== 1.0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fp['Words'][index] = empty


Unnamed: 0,Rules,Words,Support,Lift
0,29 ==> 5,"lockdown,the",40027,1.0
1,5 ==> 29,"the,lockdown",40027,1.0
2,78 ==> 29,"to,lockdown",32555,1.0
3,29 ==> 78,"lockdown,to",32555,1.0
4,73 ==> 29,"in,lockdown",27980,1.0
5,29 ==> 73,"lockdown,in",27980,1.0
8,74 ==> 29,"and,lockdown",25837,1.0
9,29 ==> 74,"lockdown,and",25837,1.0
10,48 ==> 29,"a,lockdown",25690,1.0
11,29 ==> 48,"lockdown,a",25690,1.0
