In [0]:
pip install mlxtend



In [0]:
import numpy as np
import math
import pandas as pd
import os
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

In [0]:
os.listdir()

['.config',
 'enron_3_0.02.csv',
 'nips_3_0.5.csv',
 'document_nips.csv',
 'document_enron.csv',
 'kos_2_0.2.csv',
 'kos_2_0.2',
 'document_kos.csv',
 'sample_data']

In [0]:
PATH='' # Directory for all txt files

# $Question$ $1$

## Creating Dataset

In [0]:
def DatasetGeneration(word_directory,vocab_directory):
  df=pd.read_csv(word_directory,header=None)

  # Dropping first three values
  d,w,nnz,df=(df.iloc[0],df.iloc[0],df.iloc[0],df.drop([0,1,2],axis=0))

  # Reindexing
  df.index=range(len(df))

  # Loading Vocabulary dataset
  df_vocab=pd.read_csv(vocab_directory,header=None)

  df_vocab['wordid']=list(range(1,len(df_vocab)+1))


  df['docid']=df[0].apply(lambda x: int(str(x).split()[0]))
  df['wordid']=df[0].apply(lambda x: int(str(x).split()[1]))
  df['count']=df[0].apply(lambda x: int(str(x).split()[2]))
  df=df.drop(0,axis=1)


  data=df.merge(df_vocab,on='wordid',how='left')
  data.columns=list(data.columns[:-1])+['words']

  #data.to_csv(f"document_{word_directory.split('.')[1]}.csv",index=False)  

  return data

In [0]:
enron_words=DatasetGeneration(PATH+'docword.enron.txt',PATH+'vocab.enron.txt')
nips_words=DatasetGeneration(PATH+'docword.nips.txt',PATH+'vocab.nips.txt')
kos_words=DatasetGeneration(PATH+'docword.kos.txt',PATH+'vocab.kos.txt')

In [0]:
kos_words=pd.read_csv('document_kos.csv')
nips_words=pd.read_csv('document_nips.csv')
enron_words=pd.read_csv('document_enron.csv')

# Imputing Null Values

In [0]:
enron_words[enron_words['words'].isna()]='null'
nips_words[nips_words['words'].isna()]='null'

# Deleting all those words which is present in only one transaction

In [0]:
x=nips_words.groupby(['words']).count()
words=list(x[x['count']>1].index)
nips_words=nips_words[nips_words['words'].isin(words)]

In [0]:
x=enron_words.groupby(['words']).count()
words=list(x[x['count']>1].index)
enron_words=enron_words[enron_words['words'].isin(words)]

In [0]:
x=kos_words.groupby(['words']).count()
words=list(x[x['count']>1].index)
kos_words=kos_words[kos_words['words'].isin(words)]

# Creating Transaction Data

In [0]:
kos_doc=list(kos_words.groupby(['docid'])['words'].apply(lambda x: list(x.unique())))
nips_doc=list(nips_words.groupby(['docid'])['words'].apply(lambda x: list(x.unique())))
enron_doc=list(enron_words.groupby(['docid'])['words'].apply(lambda x: list(x.unique())))

In [0]:
def freqItemset(transaction,min_sup,itemset):
  te=TransactionEncoder()
  te_ary=te.fit(transaction).transform(transaction,sparse=True)
  document=pd.DataFrame.sparse.from_spmatrix(te_ary,columns=te.columns_)

  frequent=mlxtend.frequent_patterns.apriori(document,min_support=min_sup,use_colnames=True)
  frequent_itemsets = apriori(document, min_support=min_sup, use_colnames=True)
  frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
  return frequent_itemsets[frequent_itemsets['length']==itemset]


In [0]:
# 15 min 37s for min_sup=0.5 and itemset=3 # Nips dataset

# $Question$ $2$

## Multiple Minimum Support 

In [0]:
class multiple_minimum_support:

  """ Custom Class for MS-Apriori Algorithm
  """

  def __init__(self,dataset,alpha,lmbda,k,phi):
    self.dataset=dataset
    self.transactions=self.dataset['docid'].nunique() # No of transactions
    self.dataset_doc=self.dataset.groupby(['docid'])['words'].apply(lambda x: list(x.unique()))
    self.lmbda=lmbda # Parameter
    self.alpha=alpha
    self.k=k # integer k for k-itemset generation
    self.phi=phi # upper bound for maximum support difference

    self.vocab=self.dataset['words'].unique()
    
    self.MS=dict() # Stores all MIS values

    self.support=dict(self.dataset.groupby(['words'])['docid'].count()/self.transactions)# DIctionary of all support counts

    """for i in self.vocab:
      sup=self.dataset[self.dataset['words']==o]['docid'].nunique()/self.transactions
      self.support[i]=sup"""
    # Sorting ALl the support
    self.MS_sorted=sorted(self.support,key=self.support.get)
    

    for i in self.MS_sorted:
      self.MS[i]=self.lmbda*self.support[i]
    
    del self.MS_sorted

    self.freqDataset=pd.DataFrame({'Item':[],'Support':[],'MIS':[],'Length':[]})

    
  
  def count(self,x):
    """ 
    Support Count for itemset x

    """
    df=self.dataset[self.dataset['words'].isin(x)]
    docids=df.groupby(['docid']).count()
    docids=docids[docids['count']==len(x)]

    return len(docids)/self.transactions

  def sort(self,x):

    """ sort values according to the mis values
    """
    if self.MS[x[-2]]>self.MS[x[-1]]:
      x[-2],x[-1]=x[-1],x[-2]
    return x


  def ms_apriori(self):


    F1=[[i] for i in self.vocab if self.support[i]>=self.MS[i] and self.support[i]>self.alpha]
    S1=[self.support[i[0]] for i in F1]
    MS1=[self.MS[i[0]] for i in F1]
    L1=[1]*len(F1)

    temp=pd.DataFrame({'Item':F1,'Support':S1,'MIS':MS1,'Length':L1})
    self.freqDataset=pd.concat([self.freqDataset,temp],axis=0)

    del temp,S1,MS1,L1

    for k in range(2,self.k+1):

      if k==2:
        C2=self.level2_candidate_generation()

        F2=[i for i in C2 if self.count(i)>=self.MS[i[0]]]
        S2=[self.count(i) for i in F2]
        MS2=[self.MS[i[0]] for i in F2]
        L2=[2]*len(F2)

        temp=pd.DataFrame({'Item':F2,'Support':S2,'MIS':MS2,'Length':L2})
        self.freqDataset=pd.concat([self.freqDataset,temp],axis=0)
        del temp,S2,MS2,L2
      else:
        C=self.MScancidate_gen(self.freqDataset[self.freqDataset['Length']==(k-1)]['Item'])
        F=[self.sort(i) for i in C if self.count(i)>=self.MS[i[0]]]
        S=[self.count(i) for i in F]
        MS=[self.MS[i[0]] for i in F]
        L=[k]*(len(F))

        temp=pd.DataFrame({'Item':F,'Support':S,'MIS':MS,'Length':L})
        self.freqDataset=pd.concat([self.freqDataset,temp],axis=0)

        del temp,S,MS,L
  
  
  
  def level2_candidate_generation(self):
    C2=[]
    vocab=self.freqDataset[['Item','MIS']]
    vocab=vocab.sort_values(by='MIS')
    vocab=vocab['Item'].apply(lambda x: x[0])

    for i,l in enumerate(vocab):
      if self.support[l]>=self.MS[l]:
        for h in vocab[i+1:]:
          if self.support[h]>=self.MS[l] and abs(self.support[h]-self.support[l])<=self.phi:
            C2.append([l,h])
    return C2

  def MScancidate_gen(self,Fk):
    CK=[]
    for i,item_1 in enumerate(Fk):
      for j,item_2 in enumerate(Fk[i+1:]):
        if item_1[:-1]==item_2[:-2]:
          x=self.freqDataset[self.freqDataset['Item'].isin(item_1)]['Support']
          y=self.freqDataset[self.freqDataset['Item'].isin(item_2)]['Support']
          if abs(x-y)<=self.phi:
            c=item_1.copy()
            c.append(item_2[-1])

            if self.subset_chcek(c,Fk):
              CK.append(c)
    return CK


  def subset_chcek(self,x,y):
    """ x <- Candidate set
        y <- Dataset
    """
    not_ok=0
    y=list(y)

    for j in range(len(x)):
      for k in range(j,len(x)):
        if j==0 or self.MS[x[1]]==self.MS[x[0]]:
          if [x[j],x[k]] not in y:
            not_ok+=1
            return False
    return True

  def __repr__(self):
    return "Frequent Itemset Generation Using Multiple Minimum Support"

In [0]:
mis=multiple_minimum_support(kos_words,0.2,0.2,k=2,phi=0.8)

In [0]:
%%time
mis.ms_apriori()

CPU times: user 15.5 s, sys: 44.6 ms, total: 15.5 s
Wall time: 15.6 s
