# Set Membership Coursework

In [30]:
import random
import string
import timeit
from bitarray import bitarray

## 1. Implementeation

### Sequential Search 

In [58]:
class SequentialSearchSet():
    
    def __init__(self):
        self.items = []           
        
    def insertElement(self, item):
        self.items.append(item)
        return True   

    def searchElement(self, item):
        found = False     
        for item1 in self.items:
            if item1 == item:
                found = True
            else:
                found = False
        return found   

Time Complexity: 
- Insert an element is O(n^2)
- Search an element is O(n)

Space Complexity: 
- O(n)

### Binary Search Tree

In [32]:
class Node:
    
    def __init__(self, value):
        self.value = value
        self.left = None
        self.right = None

    def get(self, element):
        if self.value == element:
            return True
        elif element < self.value and self.left:
            return self.left.get(element)
        elif element > self.value and self.right:
            return self.right.get(element)
        return False

    def put(self, element):
        if self.value == None:
            self.value = element
            return True
        elif element == self.value:
            self.value = element
            return True
        elif element < self.value:
            if self.left is None:
                self.left = Node(element)
                return True
            else: 
                self.left.put(element)
        elif element > self.value:
            if self.right is None:
                self.right = Node(element)
                return True
            else: 
                self.right.put(element)
        return False

class BinarySearchTreeSet:
    
    def __init__(self):
        self.bst = Node(None)

    def insertElement(self, element):
        inserted = False
        inserted = self.bst.put(element) 
        return inserted

    def searchElement(self, element):     
        found = False 
        found = self.bst.get(element)
        return found

### Balanced Search Tree

In [33]:
# Red-Black Search Tree

class BalancedSearchTreeSet:
    R = True
    B = False
    # R refers to Red; B refers to Black
    
    class Nodes:
        def __init__(self,val,colour):
            self.val = val
            self.colour = colour
            self.left = None
            self.right = None
            
    def __init__(self):
        self.root = None
        
    def searchElement(self,val):
        return self.search(self.root,val)
    
    def search(self,node,val):
        if node is None:
            return False
        else:
            if node.val == val:
                return True
            elif node.val > val:
                return self.search(node.left,val)
            else:
                return self.search(node.right,val)
    
    def insertElement(self, val):
        self.root = self.insert(self.root, val)
        self.root.colour = self.B


    def insert(self, node, val):
        if node is None:
            return self.Nodes(val, self.R)

        if val < node.val:
            node.left = self.insert(node.left, val)
        elif val > node.val:
            node.right = self.insert(node.right, val)
        else:
            # value already exists, do nothing
            return node

        # perform rotations and colour flips
        if self.isR(node.right) and not self.isR(node.left):
            node = self.leftRotate(node)
        if self.isR(node.left) and self.isR(node.left.left):
            node = self.rightRotate(node)
        if self.isR(node.left) and self.isR(node.right):
            self.flipcolour(node)

        return node

    def isR(self,node):
        if node is None:
            return False
        return node.colour == self.R
    
    def leftRotate(self, node):
        x = node.right
        node.right = x.left
        x.left = node
        x.colour = node.colour
        node.colour = self.R
        return x
    
    def rightRotate(self, node):
        x = node.left
        node.left = x.right
        x.right = node
        x.colour = node.colour
        node.colour = self.R
        return x
    
    def flipcolour(self, node):
        node.colour = not node.colour
        node.left.colour = self.B
        node.right.colour = self.B

Time Complexity: 
- Insert an element is O(log n)
- Search an element is O(log n)

Space Complexity: 
- O(n)

### Bloom Filter

In [34]:
class BloomFilterSet:
    
    def __init__(self):
        self.HashFunctions = None
        self.hashNum = 0
        self.size = 0
        self.bitArray = None
    
    def initVariables(self, ItemNum, FPProb, HashFunctions):
        # ItemNum : int, Number of items expected to be stored in bloom filter
        # FPProb : float, False Positive probability in decimal
        # HashFunctions: list, number of hash functions to use
        self.HashFunctions = HashFunctions
        self.hashNum = len(HashFunctions)
        # Total size of bit array
        self.size = self.getSize(ItemNum, FPProb)
        # Bit array of given size, and initialize all bits as 0 (False)
        self.bitArray = bitarray(self.size)
        self.bitArray.setall(0)

    def insertElement(self, item):
        inserted = False
        tempIndex = []
        for function in self.HashFunctions:
            index = function(item) % self.size
            self.bitArray[index] = True
            tempIndex.append(self.bitArray[index])
        if False not in tempIndex:
            inserted = True
        return inserted

    def searchElement(self, item):
        found = False
        count = 0
        for function in self.HashFunctions:
            index = function(item) % self.size
            if not self.bitArray[index] == False:
                count += 1
        if count == len(self.HashFunctions):
            found = True
        return found

    def getSize(self, n, p):
        # m = -(n * lg(p)) / (lg(2)^2)
        # n : number of items expected to be stored in filter
        # p : False Positive probability in decimal
        m = -(n * self.myLog(p)) / (self.myLog(2) ** 2)
        return int(m)
    
    def myLog(self, n):
        if n <= 0:
            raise ValueError("logarithm is undefined for non-positive values")
        if n == 1:
            return 0
        if n < 1:
            # log(xn) = -log(1/n) for n < 1
            return -self.myLog(1 / n)
        # Use the taylor series approximation for log(n) where |n-1| < 1
        taylor = (n - 1) / float(n + 1)
        term = taylor
        total = taylor
        num = 1
        while abs(term) > 1e-15:
            num += 2
            term *= taylor * taylor / float(num * (num - 1))
            total += term
        return 2 * total

In [36]:
def builtinHash(string):
    return hash(string)

def bernsteinHash(string):
    hashValue = 5381
    for char in string:
        hashValue = ((hashValue << 5) + hashValue) + ord(char)
    return hashValue

def fnvHash(string):
    hashValue = 2166136261
    for char in string:
        hashValue = (hashValue * 16777619) ^ ord(char)
    return hashValue

def jenkinsHash(string):
    hashValue = 0
    for char in string:
        hashValue += ord(char)
        hashValue += (hashValue << 10)
        hashValue ^= (hashValue >> 6)
    hashValue += (hashValue << 3)
    hashValue ^= (hashValue >> 11)
    hashValue += (hashValue << 15)
    return hashValue

def djb2Hash(string):
    hashValue = 5381
    for char in string:
        hashValue = ((hashValue << 5) + hashValue) + ord(char)
    return hashValue

def sdbmHash(string):
    hashValue = 0
    for char in string:
        hashValue = ord(char) + (hashValue << 6) + (hashValue << 16) - hashValue
    return hashValue

def fnv1aHash(string):
    hashValue = 2166136261
    for char in string:
        hashValue ^= ord(char)
        hashValue *= 16777619
    return hashValue

In [37]:
Alphabets = list(string.ascii_letters)
randomword = []
for a in range(10000):
    temp = ''.join(random.choice(Alphabets) for t in range(random.randint(100, 200)))
    randomword.append(temp)

functions = [builtinHash, bernsteinHash, fnvHash, jenkinsHash, 
             djb2Hash, sdbmHash, fnv1aHash]
bf = BloomFilterSet()
bf.initVariables(10000, 0.01, functions)
testnum = 0
for ws in randomword:
    if bf.searchElement(ws) == True:
        testnum += 1

print("The False-Positive-Rate: ", testnum / len(randomword))

The False-Positive-Rate:  0.0


Time Complexity: 
- Creating the Bloom filter is O(kn)
- Insert and Search an element: O(k)
- k is the number of hash functions and n is the number of elements

Space Complexity: 
- O(kb), where k is the number of hash functions and b is the bits number

## 2. Experimentally Evaluation

### Using Real Data

In [61]:
def ReadFile(filepath):
    with open(filepath) as f:
        for line in f:
            words = line.strip().split()
    return words

In [62]:
words1 = ReadFile("./data/testfiles/test1-mobydick.txt")
words2 = ReadFile("./data/testfiles/test2-warpeace.txt")
words3 = ReadFile("./data/testfiles/test3-dickens.txt")

files = {"'test1-mobydick.txt'": words1, "'test2-warpeace.txt'": words2,
         "'test3-dickens.txt'": words3}
print(len(words1), len(words2), len(words3))

209329 564236 5149661


In [63]:
wordsforsearch = []
with open("./data/testfiles/test-search.txt") as fsearch:
    for line in fsearch:
        wordsforsearch.append(line.strip())
        
len(wordsforsearch)

544

In [64]:
hashFunctions = [builtinHash, bernsteinHash, fnvHash, jenkinsHash, 
                 djb2Hash, sdbmHash, fnv1aHash]

#### The total amount of time of all insert operations

In [65]:
def InsertTime(classname, datadict):
    classlist = []
    for key in datadict.keys():
        algorithm = classname()
        InsertStart = timeit.default_timer()
        for word in datadict[key]:
            algorithm.insertElement(word)
        InsertEnd = timeit.default_timer()
        InsertTime = InsertEnd - InsertStart
        print("Insert time taken for", key, "is :", InsertTime)
        classlist.append(algorithm)
    return classlist

In [66]:
# Sequential Search
SEQs = InsertTime(SequentialSearchSet, files)

Insert time taken for 'test1-mobydick.txt' is : 0.025592100000039864
Insert time taken for 'test2-warpeace.txt' is : 0.06768009999996139
Insert time taken for 'test3-dickens.txt' is : 0.6820534000000862


In [44]:
# Binary Search Tree
BSTs = InsertTime(BinarySearchTreeSet, files)

Insert time taken for 'test1-mobydick.txt' is : 0.8620805999998993
Insert time taken for 'test2-warpeace.txt' is : 1.7369330999999875
Insert time taken for 'test3-dickens.txt' is : 15.304025599999932


In [45]:
# Balanced Search Tree
RBTs = InsertTime(BalancedSearchTreeSet, files)

Insert time taken for 'test1-mobydick.txt' is : 1.502560800000083
Insert time taken for 'test2-warpeace.txt' is : 4.058674099999962
Insert time taken for 'test3-dickens.txt' is : 39.84764289999998


In [46]:
# Bloom Filter
BFs = []
for key in files.keys():
    algorithm = BloomFilterSet()
    algorithm.initVariables(len(files[key]), 0.01, hashFunctions)
    InsertStart = timeit.default_timer()
    for word in files[key]:
        algorithm.insertElement(word)
    InsertEnd = timeit.default_timer()
    InsertTime = InsertEnd - InsertStart
    print("Insert time taken for", key, "is :", InsertTime)
    BFs.append(algorithm)

Insert time taken for 'test1-mobydick.txt' is : 1.3108581999999842
Insert time taken for 'test2-warpeace.txt' is : 3.4592388000000938
Insert time taken for 'test3-dickens.txt' is : 31.690071200000148


#### The total amount of time of search operations

In [47]:
def SearchTime(objects, searchdata):
    for obj in objects:
        SearchStart = timeit.default_timer()
        for ws in searchdata:
            obj.searchElement(ws)
        SearchEnd = timeit.default_timer()
        SearchTime = SearchEnd - SearchStart
        print("Search time taken for 'test-search.txt' in file", 
              objects.index(obj)+1, "is :", SearchTime)

In [67]:
# Sequential Search Tree
SearchTime(SEQs, wordsforsearch)

Search time taken for 'test-search.txt' in file 1 is : 3.614028499999904
Search time taken for 'test-search.txt' in file 2 is : 9.116420000000062
Search time taken for 'test-search.txt' in file 3 is : 79.34528720000003


In [49]:
# Binary Search Tree
SearchTime(BSTs, wordsforsearch)

Search time taken for 'test-search.txt' in file 1 is : 0.0016153000001395412
Search time taken for 'test-search.txt' in file 2 is : 0.002173800000036863
Search time taken for 'test-search.txt' in file 3 is : 0.0021633000001202163


In [61]:
# Balanced Search Tree
SearchTime(RBTs, wordsforsearch)

Search time taken for 'test-search.txt' in file 1 is : 0.0029900000008638017
Search time taken for 'test-search.txt' in file 2 is : 0.0023522000010416377
Search time taken for 'test-search.txt' in file 3 is : 0.00291060000017751


In [67]:
# Bloom Filter
SearchTime(BFs, wordsforsearch)

Search time taken for 'test-search.txt' in file 1 is : 0.006858199998532655
Search time taken for 'test-search.txt' in file 2 is : 0.005692299999282113
Search time taken for 'test-search.txt' in file 3 is : 0.006495800000266172


### Using Synthetic Data

### different conditions:
- Duplicate cases
- Longer String length
- Adding numbers and symbols in each string
- String list in ascending/descending order
- Missing values
- ......

In [2]:
# 1st version

class TestDataGenerator():

    def __init__(self):
            self.syntheticData = []
            
            pass           
            
    # sythetic data -- integers:
    def generateNum(self, size):
        for _ in range(size):
            item = random.randrange(0,1000000)
            self.syntheticData.append(item)
        print(self.syntheticData) # delete when it's finished
        return self.syntheticData   
    
    # sythetic data -- 4-letter string (randomly without real meanings):
    def generateString(self,size):
        letters = string.ascii_lowercase
        for letter in letters:
            for _ in range(size):
                letter1 = ''.join(random.choice(letters))
                letter2 = ''.join(random.choice(letters))
                letter3 = ''.join(random.choice(letters))
                stringGenerated = letter + letter1 + letter2 + letter3
                self.syntheticData.append(stringGenerated)
        print(self.syntheticData) # delete when it's finished
        return self.syntheticData

toGenerate = TestDataGenerator()
toSearch = TestDataGenerator()

toGenerate.generateString(1)
toSearch.generateNum(10)

['aiet', 'bbrr', 'csgv', 'dmux', 'ezat', 'fxew', 'gfhm', 'hyjd', 'iaby', 'jwzk', 'kshf', 'lwrx', 'mdrk', 'nbfd', 'ogxs', 'pgyh', 'qyvb', 'rxeo', 'sjff', 'tosj', 'ukjl', 'vhgj', 'wlgn', 'xtyg', 'yvhy', 'zchc']
[634667, 496119, 767303, 410927, 353639, 23654, 399978, 520985, 373862, 726654]


[634667, 496119, 767303, 410927, 353639, 23654, 399978, 520985, 373862, 726654]

# --------

In [48]:
rbt = BalancedSearchTreeSet()
rbt.insertElement('b')
rbt.insertElement('a')
rbt.insertElement('c')

print(rbt.searchElement('b'))
print(rbt.searchElement('a'))
print(rbt.searchElement('c'))

True
True
True


Sequential Search

In [68]:
for key in files.keys():
    seqInsertStart = timeit.default_timer()
    for word in files[key]:
        SEQs.insertElement(word)
    seqInsertEnd = timeit.default_timer()
    seqInsertTime = seqInsertEnd - seqInsertStart
    print("Insert time taken for ", key, "is : ", seqInsertTime)

AttributeError: 'list' object has no attribute 'insertElement'

Sequential Search

In [None]:
SeqSearchStart = timeit.default_timer()
for ws in wordsforsearch:
    SEQs.searchElement(ws)
SeqSearchEnd = timeit.default_timer()
SeqSearchTime = SeqSearchEnd - SeqSearchStart
print("Search time taken for 'test-search.txt' is : ", SeqSearchTime)

Binary Search Tree

In [70]:
for key in files.keys():
    binaryInsertStart = timeit.default_timer()
    for word in files[key]:
        BSTs.insertElement(word)
    binaryInsertEnd = timeit.default_timer()
    binaryInsertTime = binaryInsertEnd - binaryInsertStart
    print("Insert time taken for ", key, "is : ", binaryInsertTime)

AttributeError: 'list' object has no attribute 'insertElement'

Binary Search Tree

In [73]:
BinarySearchStart = timeit.default_timer()
for ws in wordsforsearch:
    BSTs.searchElement(ws)
BinarySearchEnd = timeit.default_timer()
BinarySearchTime = BinarySearchEnd - BinarySearchStart
print("Search time taken for 'test-search.txt' is : ", BinarySearchTime)

AttributeError: 'list' object has no attribute 'searchElement'

Balanced Search Tree

In [38]:
for key in files.keys():
    RedBlackInsertStart = timeit.default_timer()
    for word in files[key]:
        RBT.insertElement(word)
    RedBlackInsertEnd = timeit.default_timer()
    RedBlackInsertTime = RedBlackInsertEnd - RedBlackInsertStart
    print("Insert time taken for ", key, "is : ", RedBlackInsertTime)

Insert time taken for  'test1-mobydick.txt' is :  2.1321972999994614
Insert time taken for  'test2-warpeace.txt' is :  6.030556100000467
Insert time taken for  'test-search.txt' is :  60.04795980000017


Balanced Search Tree

In [42]:
RedBlackSearchStart = timeit.default_timer()
for ws in wordsforsearch:
    RBT.searchElement(ws)
RedBlackSearchEnd = timeit.default_timer()
RedBlackSearchTime = RedBlackSearchEnd - RedBlackSearchStart
print("Search time taken for 'test-search.txt' is : ", RedBlackSearchTime)

Search time taken for 'test-search.txt' is :  0.0031930999994074227


Bloom Filter

In [40]:
for key in files.keys():
    BFInsertStart = timeit.default_timer()
    for word in files[key]:
        BF.insertElement(word)
    BFInsertEnd = timeit.default_timer()
    BFInsertTime = BFInsertEnd - BFInsertStart
    print("Insert time taken for ", key, "is : ", BFInsertTime)

Insert time taken for  'test1-mobydick.txt' is :  2.006846799999039
Insert time taken for  'test2-warpeace.txt' is :  5.462229099999604
Insert time taken for  'test3-dickens.txt' is :  49.96937220000109


Bloom Filter

In [43]:
BFSearchStart = timeit.default_timer()
for word in wordsforsearch:
    BF.searchElement(word)
BFSearchEnd = timeit.default_timer()
BFSearchTime = BFSearchEnd - BFSearchStart
print("Search time taken for 'test-search.txt' is : ", BFSearchTime)

Search time taken for 'test-search.txt' is :  0.005908900000576978


In [None]:
a = './data/testfiles/test1-mobydick.txt'
b = './data/testfiles/test2-warpeace.txt'
c = './data/testfiles/test3-dickens.txt'
insertFiles = [a,b,c]

for file in insertFiles:
    seqInsertStart = timeit.default_timer()
    seqS.insertElement(file)
    seqInsertEnd = timeit.default_timer()
    seqInsertTime = seqInsertEnd - seqInsertStart
    print(seqInsertTime)

In [45]:
a = './data/testfiles/test1-mobydick.txt'
b = './data/testfiles/test2-warpeace.txt'
c = './data/testfiles/test3-dickens.txt'
d = "./data/testfiles/test-search.txt"
insertFiles = [a,b,c]
for file in insertFiles:
    seqS.insertElement(file)
    SeqSearchStart = timeit.default_timer()
    seqS.searchElement(d)
    SeqSearchEnd = timeit.default_timer()
    SeqSearchTime = SeqSearchEnd - SeqSearchStart
    print(SeqSearchTime)

False ,['zero'] is not found
94.96285220000027
False ,['zero'] is not found
106.49899989999994
False ,['zero'] is not found
183.5646872999996


In [44]:
a = './data/testfiles/test1-mobydick.txt'
b = './data/testfiles/test2-warpeace.txt'
c = './data/testfiles/test3-dickens.txt'
insertFiles = [a,b,c]

for file in insertFiles:
    BSTInsertStart = timeit.default_timer()
    rbt.insertElement(file)
    BSTInsertEnd = timeit.default_timer()
    BSTInsertTime = BSTInsertEnd - BSTInsertStart
    print(BSTInsertTime)


1.3200000012147939e-05
1.6899999991437653e-05
4.79999994240643e-06


In [None]:
def insert_file():
    a = './data/testfiles/test1-mobydick.txt'
    b = './data/testfiles/test2-warpeace.txt'
    c = './data/testfiles/test3-dickens.txt'
    insertFiles = [a,b,c]
    for file in insertFiles:
        BSTInsertStart = timeit.default_timer()
        bst.insertElement(file)
        BSTInsertEnd = timeit.default_timer()
        BSTInsertTime = BSTInsertEnd - BSTInsertStart
        print(BSTInsertTime)

def insert_real_data_test(words, bst):
    BSTInsertStart = timeit.default_timer() 
    for word in words:
        bst.insertElement(word)
    BSTInsertEnd = timeit.default_timer()
    BSTInsertTime = BSTInsertEnd - BSTInsertStart
    print("Insert time: ", BSTInsertTime)

def search_words_in_set(words, bst):
    BSTSearchStart = timeit.default_timer() 
    for word in words:
        bst.searchElement(word)
    BSTSearchEnd = timeit.default_timer()
    BSTSearchTime = BSTSearchEnd - BSTSearchStart
    print("Search time (word already in set): ", BSTSearchTime)


def search_words(words, bst):
    BSTSearchStart = timeit.default_timer() 
    for word in words:
        bst.searchElement(word)
    BSTSearchEnd = timeit.default_timer()
    BSTSearchTime = BSTSearchEnd - BSTSearchStart
    print("Search time (test-search words): ", BSTSearchTime)


bst1 = BinarySearchTreeSet()
bst2 = BinarySearchTreeSet()
bst3 = BinarySearchTreeSet()

insert_real_data_test(words, bst1)
insert_real_data_test(words2, bst2)
insert_real_data_test(words3, bst3)

search_words_in_set(words, bst1)
search_words_in_set(words2, bst2)
search_words_in_set(words3, bst3)

search_words(words, bst1)
search_words(words2, bst2)
search_words(words3, bst3)

In [46]:
a = './data/testfiles/test1-mobydick.txt'
b = './data/testfiles/test2-warpeace.txt'
c = './data/testfiles/test3-dickens.txt'
d = "./data/testfiles/test-search.txt"
insertFiles = [a,b,c]

for file in insertFiles:
    rbt.insertElement(file)
    BSTSearchStart = timeit.default_timer()
    rbt.searchElement(d)
    BSTSearchEnd = timeit.default_timer()
    BSTSearchTime = BSTSearchEnd - BSTSearchStart
    print(BSTSearchTime)

2.4000000848900527e-06
8.000001798791345e-07
7.000003279244993e-07
