# Set Membership Coursework

In [None]:
import math
import random
import string
import timeit

from bitarray import bitarray

## 1. Implementation

### Sequential Search 

In [None]:
class SequentialSearchSet():
    """Set-membership structure backed by an unsorted list and a linear scan."""

    def __init__(self):
        # Elements in insertion order; duplicates are stored as given.
        self.items = []

    def insertElement(self, element):
        """Append ``element`` to the collection.

        Returns:
            True, always (no duplicate check is performed).
        """
        self.items.append(element)
        return True

    def searchElement(self, element):
        """Return True if any stored item equals ``element``, else False.

        The scan stops at the first match. The previous version kept
        overwriting its result on every iteration, so it only reported a
        match when the *last* stored item happened to equal ``element``.
        """
        for item in self.items:
            if item == element:
                return True
        return False

Time Complexity: 
- Insert an element is O(1) amortised (a list append; no duplicate check is performed)
- Search an element is O(n)

Space Complexity: 
- O(n)

### Binary Search Tree

In [None]:
class Node:
    """A binary-search-tree node holding one value and up to two children.

    A node whose ``value`` is None acts as the placeholder root of an empty
    tree; the first ``put`` fills it in.
    """

    def __init__(self, value):
        self.value = value
        self.left = None
        self.right = None

    def get(self, element):
        """Return True if ``element`` is stored in the subtree rooted here.

        Walks the tree iteratively so that a degenerate (list-shaped) tree,
        e.g. one built from sorted input, cannot exhaust the recursion limit.
        An empty placeholder node simply yields False instead of comparing
        ``element`` against None.
        """
        node = self
        while node is not None and node.value is not None:
            if node.value == element:
                return True
            if element < node.value:
                node = node.left
            elif element > node.value:
                node = node.right
            else:
                # Neither equal, smaller nor larger: treat as not found.
                return False
        return False

    def put(self, element):
        """Store ``element`` in the subtree rooted here.

        Returns:
            True when the element was stored or was already present
            (re-inserting an existing value is reported as success);
            False only if ``element`` cannot be ordered against a stored value.
        """
        node = self
        while True:
            if node.value is None or element == node.value:
                node.value = element
                return True
            if element < node.value:
                if node.left is None:
                    # type(self) rather than the global name ``Node``: a later
                    # notebook cell rebinds ``Node`` to the 2-3 tree node class.
                    node.left = type(self)(element)
                    return True
                node = node.left
            elif element > node.value:
                if node.right is None:
                    node.right = type(self)(element)
                    return True
                node = node.right
            else:
                return False


class BinarySearchTreeSet:
    """Set-membership structure backed by an unbalanced binary search tree."""

    # Captured when this class is defined, so instances keep using the BST
    # node above even after the 2-3 tree cell redefines the global ``Node``.
    _nodeClass = Node

    def __init__(self):
        # Placeholder root; it receives its value on the first insert.
        self.bst = self._nodeClass(None)

    def insertElement(self, element):
        """Insert ``element``; returns the result of ``Node.put``."""
        return self.bst.put(element)

    def searchElement(self, element):
        """Return True if ``element`` is in the set, else False."""
        return self.bst.get(element)

**Time Complexity:**

Insert:
- Average: O(log n)
- Worst: O(n)

Search:
- Average: O(log n)
- Worst: O(n)

**Space Complexity:**

O(n)

### Red-Black Search Tree

In [None]:
class BalancedSearchTreeSet:
    """Set-membership structure backed by a left-leaning red-black tree."""

    # Link colours: R is red, B is black.
    R = True
    B = False

    class Nodes:
        """Tree node: a value, the colour of the link from its parent, two children."""

        def __init__(self, val, colour):
            self.val = val
            self.colour = colour
            self.left = None
            self.right = None

    def __init__(self):
        self.root = None

    def searchElement(self, element):
        """Return True if ``element`` is stored in the tree, else False."""
        return self.search(self.root, element)

    def search(self, node, val):
        """Look for ``val`` in the subtree rooted at ``node`` (may be None)."""
        current = node
        while current is not None:
            if current.val == val:
                return True
            current = current.left if current.val > val else current.right
        return False

    def insertElement(self, element):
        """Insert ``element``.

        Returns:
            True if a new node was created, False if the value was already present.
        """
        if self.root is not None:
            self.root, added = self.insert(self.root, element)
            # The root link is always black.
            self.root.colour = self.B
            return added
        self.root = self.Nodes(element, self.B)
        return True

    def insert(self, node, val):
        """Insert ``val`` below ``node``.

        Returns:
            A ``(subtree_root, added)`` pair: the possibly rotated root of this
            subtree and whether a new node was created.
        """
        if node is None:
            # New values always enter on a red link.
            return self.Nodes(val, self.R), True

        if val < node.val:
            node.left, added = self.insert(node.left, val)
        elif val > node.val:
            node.right, added = self.insert(node.right, val)
        else:
            # Already stored: leave the subtree untouched.
            return node, False

        # Restore the left-leaning invariants on the way back up.
        leansRight = self.isR(node.right) and not self.isR(node.left)
        if leansRight:
            node = self.leftRotate(node)
        twoRedsOnLeft = self.isR(node.left) and self.isR(node.left.left)
        if twoRedsOnLeft:
            node = self.rightRotate(node)
        bothChildrenRed = self.isR(node.left) and self.isR(node.right)
        if bothChildrenRed:
            self.flipColour(node)

        return node, added

    def isR(self, node):
        """Return True if ``node`` exists and is red; a missing node is not red."""
        return node is not None and node.colour == self.R

    def leftRotate(self, node):
        """Rotate a right-leaning red link to the left; return the new subtree root."""
        pivot = node.right
        node.right = pivot.left
        pivot.left = node
        pivot.colour = node.colour
        node.colour = self.R
        return pivot

    def rightRotate(self, node):
        """Rotate a left-leaning red link to the right; return the new subtree root."""
        pivot = node.left
        node.left = pivot.right
        pivot.right = node
        pivot.colour = node.colour
        node.colour = self.R
        return pivot

    def flipColour(self, node):
        """Invert ``node``'s colour and make both of its children black."""
        node.colour = not node.colour
        for child in (node.left, node.right):
            child.colour = self.B

Time Complexity: 

- Insert an element is O(log n)
- Search an element is O(log n)

Space Complexity: 

- O(n)

### 2-3 Search Tree

In [None]:
class Node:
    """A 2-3 tree node: one or two sorted values and, unless it is a leaf,
    one more child than it has values.

    While an insertion is in progress a node may briefly hold three values
    (and four children); ``split`` resolves that immediately.
    """

    def __init__(self, values, parent = None):
        # ``values`` is a single key; it is stored as a one-element list.
        self.values = [values]
        self.parent = parent
        self.child = []

    def is_leaf(self):
        """Return True if this node has no children."""
        return len(self.child) == 0

    def __lt__(self, other):
        # Children are kept ordered by their smallest key (used by list.sort).
        return self.values[0] < other.values[0]

    def add(self, new_node):
        """Merge ``new_node``'s values and children into this node, then
        split if the node now holds more than two values."""
        for child in new_node.child:
            child.parent = self
        self.values.extend(new_node.values)
        self.values.sort()
        self.child.extend(new_node.child)
        if len(self.child) > 1:
            self.child.sort()
        if len(self.values) > 2:
            self.split()

    def _child_for(self, key):
        """Return the child whose key range contains ``key``.

        Only meaningful for a non-leaf node that does not itself hold ``key``.
        """
        for i, value in enumerate(self.values):
            if key < value:
                return self.child[i]
        return self.child[-1]

    def _insert(self, new_node):
        """Insert the single key carried by ``new_node`` below this node.

        Returns:
            True if the key was added, False if it was already present.

        The duplicate check is made at every level. Previously it was made
        only in leaves, so a key equal to one held by an internal node was
        routed into a child and stored a second time (or the method fell
        through and returned None).
        """
        key = new_node.values[0]
        if key in self.values:
            return False
        if self.is_leaf():
            self.add(new_node)
            return True
        return self._child_for(key)._insert(new_node)

    def split(self):
        """Split a node holding three values: the outer values become two new
        children and the middle value is promoted into the parent (or stays
        here when this node is the root)."""
        node_cls = type(self)
        left_child = node_cls(self.values[0], self)
        right_child = node_cls(self.values[2], self)
        if len(self.child) != 0:
            # Four children: the first two go left, the last two go right.
            self.child[0].parent = left_child
            self.child[1].parent = left_child
            self.child[2].parent = right_child
            self.child[3].parent = right_child
            left_child.child = [self.child[0], self.child[1]]
            right_child.child = [self.child[2], self.child[3]]

        # Promote the middle value; this node now carries only that value.
        self.child = [left_child, right_child]
        self.values = [self.values[1]]

        if self.parent is not None:
            # Hand the promoted value and both halves over to the parent.
            if self in self.parent.child:
                self.parent.child.remove(self)
            self.parent.add(self)
        else:
            left_child.parent = self
            right_child.parent = self

    def _find(self, element):
        """Return True if ``element`` is stored in the subtree rooted here."""
        node = self
        while True:
            if element in node.values:
                return True
            if node.is_leaf():
                return False
            node = node._child_for(element)
        
class TwoThreeBST:
    """Set-membership structure backed by a 2-3 search tree."""

    def __init__(self):
        # None until the first element is inserted.
        self.root = None

    def insertElement(self, element):
        """Insert ``element``.

        Returns:
            True if the element was added, False if it was already present.
        """
        new_node = Node(element)
        if self.root is None:
            self.root = new_node
            return True
        if not self.root._insert(new_node):
            return False
        # Re-establish the root at the top of the tree after any splits.
        while self.root.parent is not None:
            self.root = self.root.parent
        return True

    def searchElement(self, element):
        """Return True if ``element`` is stored, else False.

        An empty tree contains nothing; previously this case raised
        AttributeError because ``self.root`` was None.
        """
        if self.root is None:
            return False
        return self.root._find(element)

Time Complexity: 

- Insert an element is O(log n)
- Search an element is O(log n)

Space Complexity: 

- O(n)

### Bloom Filter

In [None]:
class BloomFilterSet:
    """Probabilistic set: no false negatives, tunable false-positive rate.

    ``initVariables`` must be called before inserting or searching.
    """

    def __init__(self):
        self.HashFunctions = None
        self.hashNum = 0
        self.size = 0
        self.bitArray = None

    def initVariables(self, ItemNum, FPProb, HashFunctions):
        """Size and allocate the bit array.

        Args:
            ItemNum: int, number of items expected to be stored.
            FPProb: float, target false-positive probability as a decimal.
            HashFunctions: list of callables mapping an element to an int.
        """
        self.HashFunctions = HashFunctions
        self.hashNum = len(HashFunctions)
        # Total size of the bit array.
        self.size = self.getSize(ItemNum, FPProb)
        # Bit array of the given size with every bit cleared.
        self.bitArray = bitarray(self.size)
        self.bitArray.setall(0)

    def insertElement(self, element):
        """Set the bit selected by each hash function.

        Returns:
            True, always: setting bits cannot fail.
        """
        for function in self.HashFunctions:
            self.bitArray[function(element) % self.size] = True
        return True

    def searchElement(self, element):
        """Return True if every bit selected for ``element`` is set.

        True means "possibly present"; False means "definitely absent".
        """
        return all(self.bitArray[function(element) % self.size]
                   for function in self.HashFunctions)

    def getSize(self, n, p):
        """Return the bit-array size m = -(n * ln p) / (ln 2)^2.

        Args:
            n: number of items expected to be stored in the filter.
            p: false-positive probability as a decimal.

        The result is never smaller than 1 so that ``hash % size`` stays
        defined even for n == 0 or p >= 1.
        """
        m = -(n * self.myLog(p)) / (self.myLog(2) ** 2)
        return max(1, int(m))

    def myLog(self, n):
        """Return the natural logarithm of ``n``.

        Raises:
            ValueError: if ``n`` is not positive.

        The previous hand-written series divided each term by k*(k-1), which
        sums to 2*sinh(y) rather than ln(n); for p = 0.01 that roughly halved
        the computed bit-array size.
        """
        if n <= 0:
            raise ValueError("logarithm is undefined for non-positive values")
        return math.log(n)

In [None]:
def builtinHash(string):
    """Return Python's built-in ``hash`` of ``string``."""
    digest = hash(string)
    return digest

def bernsteinHash(string):
    """Return the multiply-by-33-and-add hash of ``string``, seeded with 5381.

    The result is an unbounded Python int (no 32-bit wrap-around).
    NOTE(review): computes the same value as djb2Hash in this notebook.
    """
    acc = 5381
    for code in map(ord, string):
        acc = acc * 33 + code
    return acc

def fnvHash(string):
    """Return a multiply-then-XOR hash of ``string``.

    Starts from 2166136261 and, per character, multiplies by 16777619 and
    then XORs in the character code. The result is an unbounded Python int.
    """
    acc = 2166136261
    multiplier = 16777619
    for code in map(ord, string):
        acc *= multiplier
        acc ^= code
    return acc

def jenkinsHash(string):
    """Return an add / shift / XOR mixing hash of ``string``.

    Each character is added, then the state is mixed with ``h += h << 10``
    and ``h ^= h >> 6``; three finishing steps follow. The shifts-and-adds
    are written as the equivalent multiplications. The result is an
    unbounded, non-negative Python int.
    """
    state = 0
    for ch in string:
        state += ord(ch)
        state *= 1025            # state + (state << 10)
        state ^= state >> 6
    state *= 9                   # state + (state << 3)
    state ^= state >> 11
    state *= 32769               # state + (state << 15)
    return state

def djb2Hash(string):
    """Return the hash ``h = h * 33 + ord(c)`` over ``string``, seeded with 5381.

    The result is an unbounded Python int.
    """
    acc = 5381
    for ch in string:
        acc += (acc << 5) + ord(ch)
    return acc

def sdbmHash(string):
    """Return the hash ``h = ord(c) + h * 65599`` over ``string``, seeded with 0.

    65599 * h equals (h << 6) + (h << 16) - h. The result is an unbounded
    Python int.
    """
    acc = 0
    for code in map(ord, string):
        acc = code + acc * 65599
    return acc

def fnv1aHash(string):
    """Return an XOR-then-multiply hash of ``string``.

    Starts from 2166136261 and, per character, XORs in the character code
    and then multiplies by 16777619. The result is an unbounded Python int.
    """
    acc = 2166136261
    for ch in string:
        acc = (acc ^ ord(ch)) * 16777619
    return acc

In [None]:
# Empirical false-positive rate of the Bloom filter: insert 10000 random
# words, then query the same number of words that were never inserted and
# count how many are reported as present.
Alphabets = list(string.ascii_letters)
randomword = []
for a in range(10000):
    temp = ''.join(random.choice(Alphabets) for t in range(random.randint(100, 200)))
    randomword.append(temp)

functions = [builtinHash, bernsteinHash, fnvHash, jenkinsHash, 
             djb2Hash, sdbmHash, fnv1aHash]
bf = BloomFilterSet()
bf.initVariables(10000, 0.01, functions)
# The filter must actually contain the words; querying an empty filter
# always answers False and reports a rate of 0.
for ws in randomword:
    bf.insertElement(ws)

# Probe words are drawn the same way but must not be among the inserted ones,
# so every positive answer is a false positive.
insertedwords = set(randomword)
probewords = []
while len(probewords) < len(randomword):
    temp = ''.join(random.choice(Alphabets) for t in range(random.randint(100, 200)))
    if temp not in insertedwords:
        probewords.append(temp)

testnum = 0
for ws in probewords:
    if bf.searchElement(ws) == True:
        testnum += 1

print("The False-Positive-Rate: ", testnum / len(probewords))

Time Complexity: 
- Creating the Bloom filter is O(kn)
- Insert and Search an element: O(k)
- k is the number of hash functions and n is the number of elements

Space Complexity: 
- O(m), where m is the number of bits in the bit array (m ≈ -n·ln p / (ln 2)²); it does not grow with the number of hash functions k

## 2. Experimental Evaluation

### Using Real Data

In [None]:
def ReadFile(filepath):
    """Return every whitespace-separated word in the file, in order.

    Args:
        filepath: path of the text file to read.

    Returns:
        A list of words; empty for an empty file.

    Words are accumulated across all lines. Previously the list was
    overwritten for each line, so only the last line's words were returned
    and an empty file raised UnboundLocalError.
    """
    words = []
    with open(filepath) as f:
        for line in f:
            words.extend(line.split())
    return words

In [None]:
# Load the word lists of the three test files with ReadFile.
words1 = ReadFile("./data/testfiles/test1-mobydick.txt")
words2 = ReadFile("./data/testfiles/test2-warpeace.txt")
words3 = ReadFile("./data/testfiles/test3-dickens.txt")

# Map a display label (the file name in single quotes) to its word list.
files = {"'test1-mobydick.txt'": words1, "'test2-warpeace.txt'": words2,
         "'test3-dickens.txt'": words3}
# Number of words loaded from each file.
print(len(words1), len(words2), len(words3))

In [None]:
# Query list for the search timings: one entry per line of test-search.txt,
# with surrounding whitespace stripped.
wordsforsearch = []
with open("./data/testfiles/test-search.txt") as fsearch:
    for line in fsearch:
        wordsforsearch.append(line.strip())
        
# Notebook display: number of query entries.
len(wordsforsearch)

In [None]:
# Hash functions handed to BloomFilterSet.initVariables by the timing cells.
hashFunctions = [builtinHash, bernsteinHash, fnvHash, jenkinsHash, 
                 djb2Hash, sdbmHash, fnv1aHash]

#### The total amount of time of all insert operations

In [None]:
# def InsertTime(classname, datadict):
#     res = []
#     classlist = []
#     for key in datadict.keys():
#         tempdata = datadict[key]
#         algorithm = classname()
#         if str(classname) == str(BloomFilterSet):
#             algorithm.initVariables(len(tempdata), 0.01, hashFunctions)
        
#         seen = {}
#         InsertStart = timeit.default_timer()
#         for word in tempdata:
#             if word is not None and word not in seen:
#                 seen[word] = True
#                 inserted = algorithm.insertElement(word)
#                 res.append(inserted)
#         InsertEnd = timeit.default_timer()
#         InsertTime = InsertEnd - InsertStart
#         print("Insert time taken for", key, "is :", InsertTime)
#         classlist.append(algorithm)
#         return classlist, res
    
def InsertTime(classname, datadict):
    """Build one set per dataset and print how long the inserts took.

    Args:
        classname: set class to instantiate; called with no arguments.
        datadict: mapping of label -> iterable of elements to insert.

    Returns:
        ``(classlist, ts)``: the populated sets in ``datadict`` order, and the
        flat list of every ``insertElement`` return value across all datasets.

    Classes that expose ``initVariables`` (the Bloom filter) are initialised
    with the dataset length, a 0.01 false-positive rate and the module-level
    ``hashFunctions`` before timing starts; without that call the filter has
    no hash functions and inserting raises TypeError.
    """
    classlist = []
    ts = []
    for key, data in datadict.items():
        algorithm = classname()
        if hasattr(algorithm, "initVariables"):
            algorithm.initVariables(len(data), 0.01, hashFunctions)

        InsertStart = timeit.default_timer()
        for word in data:
            ts.append(algorithm.insertElement(word))
        elapsed = timeit.default_timer() - InsertStart

        print("Insert time taken for", key, "is :", elapsed)
        classlist.append(algorithm)
    return classlist, ts

In [None]:
# Sequential Search
# InsertTime returns (list with one populated set per file,
# flat list of insertElement results).
SEQs, res = InsertTime(SequentialSearchSet, files)
res

In [None]:
# Binary Search Tree
# BSTs is a list with one populated tree per file; res holds the insert results.
BSTs, res = InsertTime(BinarySearchTreeSet, files)
res

In [None]:
# Balanced Search Tree
# RBTs is a list with one populated red-black tree per file; res holds the
# insert results (False marks a value that was already present).
RBTs, res = InsertTime(BalancedSearchTreeSet, files)
res

In [None]:
# 2-3 Search Tree
# TTBSTs is a list with one populated 2-3 tree per file; res holds the insert results.
TTBSTs, res = InsertTime(TwoThreeBST, files)
res

In [None]:
# Bloom Filter
# NOTE(review): a BloomFilterSet needs initVariables() called before
# insertElement(); the following cell builds the filters by hand with an
# explicit initVariables() call and rebinds BFs.
BFs, res = InsertTime(BloomFilterSet, files)
res

In [None]:

# Build and time one Bloom filter per file, sizing each filter for its
# own word count with a 0.01 false-positive target.
BFs = []
for key in files.keys():
    algorithm = BloomFilterSet()
    algorithm.initVariables(len(files[key]), 0.01, hashFunctions)
    InsertStart = timeit.default_timer()
    for word in files[key]:
        algorithm.insertElement(word)
    InsertEnd = timeit.default_timer()
    # Deliberately not named InsertTime: assigning that name at module level
    # would replace the InsertTime() function with a float.
    bfInsertSeconds = InsertEnd - InsertStart
    print("Insert time taken for", key, "is :", bfInsertSeconds)
    BFs.append(algorithm)

#### The total amount of time of search operations

In [None]:
def SearchTime(objects, searchdata):
    """Time ``searchElement`` for every query against every set and print it.

    Args:
        objects: populated sets, in file order.
        searchdata: elements to search for in each set.

    Prints one line per set, numbered from 1 by position. The position comes
    from ``enumerate`` rather than ``objects.index(obj)``, which rescans the
    list and reports the first equal object's position instead of this one's.
    """
    for position, obj in enumerate(objects, start=1):
        SearchStart = timeit.default_timer()
        for ws in searchdata:
            obj.searchElement(ws)
        elapsed = timeit.default_timer() - SearchStart
        print("Search time taken for 'test-search.txt' in file", 
              position, "is :", elapsed)

In [None]:
# Sequential Search: time the test-search queries against each per-file set.
SearchTime(SEQs, wordsforsearch)

In [None]:
# Binary Search Tree: time the test-search queries against each per-file tree.
SearchTime(BSTs, wordsforsearch)

In [None]:
# Red-Black Balanced Search Tree: time the test-search queries against each
# per-file tree.
SearchTime(RBTs, wordsforsearch)

In [None]:
# 2-3 Balanced Search Tree: time the test-search queries against each
# per-file tree.
SearchTime(TTBSTs, wordsforsearch)

In [None]:
# Bloom Filter: time the test-search queries against each per-file filter.
SearchTime(BFs, wordsforsearch)

### Using Synthetic Data

### different conditions:
- Duplicate cases
- Longer String length
- Adding numbers and symbols in each string
- String list in ascending/descending order
- Missing values
- ......

In [None]:
class TestDataGenerator():
    """Generates seven synthetic string datasets for the set experiments.

    Both generators append, in this order: normal strings, long strings,
    strings over all printable characters, the normal strings ascending,
    the normal strings descending, the normal strings with some entries
    replaced by None, and the normal strings with some entries replaced by
    duplicates of early entries.
    """

    def __init__(self):
        self.syntheticData = []
        self.strLen = 0
        self.percentage = 0.0
        # Defaults read by generateData2 (which previously referenced these
        # attributes without their ever being defined).
        self.size = 0
        self.alphabets = list(string.ascii_letters)

    def setParams(self, strLen, percentage):
        """Set the long-string length and the replacement probability used by
        generateData for the missing-value and duplicate datasets."""
        self.strLen = strLen
        self.percentage = percentage

    def _uniqueStrings(self, size, makeString):
        """Return ``size`` distinct strings produced by calling ``makeString``.

        A set tracks what has been produced so each uniqueness check is O(1)
        instead of a scan of the growing list.
        """
        seen = set()
        result = []
        while len(result) < size:
            candidate = makeString()
            if candidate not in seen:
                seen.add(candidate)
                result.append(candidate)
        return result

    def _checkLongStringCapacity(self, size, strLen, alphabetSize):
        """Raise ValueError when fewer than ``size`` distinct strings of length
        ``strLen`` exist; generation would otherwise never terminate."""
        if size > alphabetSize ** max(strLen, 0):
            raise ValueError(
                "cannot generate %d distinct strings of length %d" % (size, strLen))

    def _publish(self, datasets):
        """Append the datasets to ``self.syntheticData`` and return that list."""
        self.syntheticData.extend(datasets)
        return self.syntheticData

    def generateData(self, size):
        """Generate the seven datasets with ``size`` entries each.

        Uses ``self.strLen`` for the long strings and ``self.percentage`` as
        the per-entry replacement probability (call setParams first).

        Returns:
            ``self.syntheticData``, which accumulates across calls.

        Raises:
            ValueError: if ``size`` distinct strings of length ``self.strLen``
                cannot exist.
        """
        alphabets = list(string.ascii_letters)
        charlist = string.printable
        self._checkLongStringCapacity(size, self.strLen, len(alphabets))

        # Normal Case: 4-10 random letters.
        data = self._uniqueStrings(size, lambda: ''.join(
            random.choice(alphabets) for _ in range(random.randint(4, 10))))

        # Long String: exactly strLen random letters.
        data1 = self._uniqueStrings(size, lambda: ''.join(
            random.choice(alphabets) for _ in range(self.strLen)))

        # Special String: 4-10 distinct printable characters.
        data2 = self._uniqueStrings(size, lambda: ''.join(
            random.sample(charlist, random.randint(4, 10))))

        # Ascending Order
        data3 = self.mergeSort(data.copy())

        # Missing values and duplicate values. The first 20% of entries are
        # never replaced and serve as the pool duplicates are drawn from.
        num6 = round(size * 0.2)
        temp6 = data[ : num6]
        data5 = [None if i >= num6 and random.random() < self.percentage else data[i]
                 for i in range(size)]
        # ``temp6 and`` guards tiny sizes where the pool is empty.
        data6 = [random.choice(temp6)
                 if temp6 and i >= num6 and random.random() < self.percentage
                 else data[i]
                 for i in range(size)]

        # Descending Order
        data4 = data3[::-1]

        return self._publish([data, data1, data2, data3, data4, data5, data6])

    def generateData2(self, strLen, percentage, size=None):
        """Variant of generateData that replaces a fixed number of entries.

        Args:
            strLen: length of each long string.
            percentage: fraction of ``size``; that many replacements are
                attempted in the missing-value and duplicate datasets.
            size: entries per dataset; defaults to ``self.size``.

        Returns:
            ``self.syntheticData``, which accumulates across calls.

        Raises:
            ValueError: if ``size`` distinct strings of length ``strLen``
                cannot exist.
        """
        if size is None:
            size = self.size
        alphabets = self.alphabets
        charlist = string.printable
        self._checkLongStringCapacity(size, strLen, len(alphabets))

        # Each list is filled independently so that all three reach ``size``.
        # Normal Case
        data = self._uniqueStrings(size, lambda: ''.join(
            random.choice(alphabets) for _ in range(random.randint(5, 10))))
        # Long String
        data1 = self._uniqueStrings(size, lambda: ''.join(
            random.choice(alphabets) for _ in range(strLen)))
        # Special String
        data2 = self._uniqueStrings(size, lambda: ''.join(
            random.choice(charlist) for _ in range(random.randint(5, 10))))

        # Ascending Order
        data3 = self.mergeSort(data.copy())
        # Descending Order
        data4 = data3[::-1]

        data5, data6 = data.copy(), data.copy()
        # The first 10% of entries are never replaced; duplicates come from them.
        num6 = round(size * 0.1)
        temp6 = data6[ : num6]
        if num6 < size:
            for _ in range(round(percentage * size)):
                index = random.randrange(num6, size)
                # String with missing values
                data5[index] = None
                # String with duplicate values
                if temp6:
                    data6[index] = random.choice(temp6)

        return self._publish([data, data1, data2, data3, data4, data5, data6])

    def mergeSort(self, arr):
        """Sort ``arr`` in place with a bottom-up merge sort and return it."""
        n = len(arr)
        size = 1
        while size < n:
            # Merge adjacent runs of length ``size`` into runs of 2 * size.
            for i in range(0, n, 2*size):
                left = arr[i : i+size]
                right = arr[i+size : i+2*size]
                merged = []
                l = r = 0
                while l < len(left) and r < len(right):
                    if left[l] < right[r]:
                        merged.append(left[l])
                        l += 1
                    else:
                        merged.append(right[r])
                        r += 1
                merged += left[l:]
                merged += right[r:]
                arr[i : i+len(merged)] = merged
            size *= 2
        return arr

In [None]:
# missing value checker

def isNone(list):
    """Print how many elements of ``list`` are None.

    Iterates directly over the input. The previous version used the string
    "end" as an end-of-iteration sentinel, so counting stopped early at the
    first element that was literally the word "end".
    """
    i = 0
    for element in list:
        if element is None:
            i += 1
    print("The file contains %d None values" % i)

# Small list with two None entries, used to check the counter itself.
testList = [1,2,3,None,4,5,6,7,8,9,None]

# Report the number of None entries in each loaded word list and in testList.
isNone(words1)
isNone(words2)
isNone(words3)
isNone(testList)

In [None]:
# Generate the synthetic datasets: long strings of length 100, replacement
# probability 0.6, 100000 entries per dataset.
dataGenerate = TestDataGenerator()
dataGenerate.setParams(100, 0.6)
SyntheticDataList = dataGenerate.generateData(100000)
# Notebook display of the generated datasets.
SyntheticDataList

In [None]:
# Display names, in the order generateData appends the datasets.
SyntheticDataNames = ["Normal", "Long String", "Special String", "Ascending Order", "Descending Order", 
                      "String with missing values", "String with duplicate values"]

In [None]:
# Build the query list for each synthetic dataset: a random 30% of the
# distinct entries of each of the first three datasets.
synSearchList = []
for datalist in SyntheticDataList[0 : 3]:
    num = round(len(datalist) * 0.3)
    # random.sample picks num distinct positions in one pass; the previous
    # "choose, then scan the result list for duplicates" loop was quadratic.
    # dict.fromkeys removes repeated values first (keeping order), so the
    # sampled values are distinct as before.
    templist = random.sample(list(dict.fromkeys(datalist)), num)
    synSearchList.append(templist)

# The remaining four datasets are derived from the first ("Normal") one, so
# they reuse copies of its query list.
for _ in range(4):
    synSearchList.append(synSearchList[0].copy())
synSearchList

#### Insert Time Taken

In [None]:
def InsertTime2(classname, datalist):
    """Insert the distinct, non-None entries of ``datalist`` into a new set
    and print the elapsed insert time.

    Args:
        classname: set class to instantiate; called with no arguments.
        datalist: elements to insert (must be hashable; may contain None).

    Returns:
        ``(algorithm, res)``: the populated set and the list of
        ``insertElement`` return values, one per inserted element.
    """
    # Remove duplicates while keeping first-occurrence order.
    uniqueDatas = list(dict.fromkeys(datalist))

    algorithm = classname()
    # The Bloom filter has to be sized before use (for the full input length).
    if str(classname) == str(BloomFilterSet):
        algorithm.initVariables(len(datalist), 0.01, hashFunctions)

    res = []
    InsertStart = timeit.default_timer()
    for word in uniqueDatas:
        # Skip missing values.
        if word is None:
            continue
        res.append(algorithm.insertElement(word))
    elapsed = timeit.default_timer() - InsertStart
    print("Insert time taken is :", elapsed)
    return algorithm, res

In [None]:
# Time the inserts for every synthetic dataset. Only Sequential Search and
# the Bloom filter are active; the two tree runs are commented out, so
# BSTSyn, RBTSyn, InsertRes2 and InsertRes3 stay empty.
SEQSyn, BSTSyn, RBTSyn, BTSyn = [], [], [], []
InsertRes1, InsertRes2, InsertRes3, InsertRes4 = [], [], [], []
for i in range(len(SyntheticDataList)):
    print("For", SyntheticDataNames[i], "Case: ")
    print("Sequential Search: ")
    SEQ, res1 = InsertTime2(SequentialSearchSet, SyntheticDataList[i])
    SEQSyn.append(SEQ)
    InsertRes1.append(res1)
#     print("Binary Search Tree: ")
#     BST, res2 = InsertTime2(BinarySearchTreeSet, SyntheticDataList[i])
#     BSTSyn.append(BST)
#     InsertRes2.append(res2)
#     print("Balanced Search Tree: ")
#     RBT, res3 = InsertTime2(BalancedSearchTreeSet, SyntheticDataList[i])
#     RBTSyn.append(RBT)
#     InsertRes3.append(res3)
    print("Bloom Filter: ")
    BT, res4 = InsertTime2(BloomFilterSet, SyntheticDataList[i])
    BTSyn.append(BT)
    InsertRes4.append(res4)
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

### Search Time Taken

In [None]:
def SearchTime2(obj, searchdata):
    """Search ``obj`` for every non-None entry of ``searchdata`` and print
    the elapsed time.

    Returns:
        The list of ``searchElement`` results, one per non-None query.
    """
    SearchStart = timeit.default_timer()
    res = [obj.searchElement(ws) for ws in searchdata if ws is not None]
    elapsed = timeit.default_timer() - SearchStart
    print("Search time taken is :", elapsed)
    return res

In [None]:
# Time the searches for every synthetic dataset against the sets built in
# the insert cell. SearchTime2 returns a single list of results, so it is
# bound to one name (unpacking it into two names raised ValueError).
SearchRes1, SearchRes2, SearchRes3, SearchRes4 = [], [], [], []
for i in range (len(synSearchList)):
    print("For", SyntheticDataNames[i], "Case: ")
    print("Sequential Search: ")
    sres1 = SearchTime2(SEQSyn[i], synSearchList[i])
    SearchRes1.append(sres1)
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
#     print("Binary Search Tree: ")
#     sres2 = SearchTime2(BSTSyn[i], synSearchList[i])
#     SearchRes2.append(sres2)
#     print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
#     print("Balanced Search Tree: ")
#     sres3 = SearchTime2(RBTSyn[i], synSearchList[i])
#     SearchRes3.append(sres3)
#     print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("Bloom Filter: ")
    sres4 = SearchTime2(BTSyn[i], synSearchList[i])
    SearchRes4.append(sres4)
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("                                                 ")

# --------

In [None]:
# Smoke test of the red-black tree: insert three keys, then look each one up.
rbt = BalancedSearchTreeSet()
rbt.insertElement('b')
rbt.insertElement('a')
rbt.insertElement('c')

# Each lookup is for an inserted key, so each line prints True.
print(rbt.searchElement('b'))
print(rbt.searchElement('a'))
print(rbt.searchElement('c'))

Sequential Search

In [None]:
# for key in files.keys():
#     seqInsertStart = timeit.default_timer()
#     for word in files[key]:
#         SEQs.insertElement(word)
#     seqInsertEnd = timeit.default_timer()
#     seqInsertTime = seqInsertEnd - seqInsertStart
#     print("Insert time taken for ", key, "is : ", seqInsertTime)

Sequential Search

In [None]:
# Legacy scratch cell: time the test-search queries against SEQs.
# NOTE(review): the visible assignment of SEQs is the list returned by
# InsertTime, and a list has no searchElement method; this cell only runs if
# SEQs is rebound to a single set. SearchTime(SEQs, wordsforsearch) covers
# the list case.
SeqSearchStart = timeit.default_timer()
for ws in wordsforsearch:
    SEQs.searchElement(ws)
SeqSearchEnd = timeit.default_timer()
SeqSearchTime = SeqSearchEnd - SeqSearchStart
print("Search time taken for 'test-search.txt' is : ", SeqSearchTime)

Binary Search Tree

In [None]:
# Legacy scratch cell: insert every file's words into BSTs, timing each file.
# NOTE(review): the visible assignment of BSTs is the list returned by
# InsertTime, and a list has no insertElement method; this cell only runs if
# BSTs is rebound to a single tree.
for key in files.keys():
    binaryInsertStart = timeit.default_timer()
    for word in files[key]:
        BSTs.insertElement(word)
    binaryInsertEnd = timeit.default_timer()
    
    binaryInsertTime = binaryInsertEnd - binaryInsertStart
    print("Insert time taken for ", key, "is : ", binaryInsertTime)

Binary Search Tree

In [None]:
# Legacy scratch cell: time the test-search queries against BSTs.
# NOTE(review): the visible assignment of BSTs is a list (from InsertTime),
# which has no searchElement method; see SearchTime(BSTs, wordsforsearch).
BinarySearchStart = timeit.default_timer()
for ws in wordsforsearch:
    BSTs.searchElement(ws)
BinarySearchEnd = timeit.default_timer()
BinarySearchTime = BinarySearchEnd - BinarySearchStart
print("Search time taken for 'test-search.txt' is : ", BinarySearchTime)

Balanced Search Tree

In [None]:
# Legacy scratch cell: insert every file's words into RBT, timing each file.
# NOTE(review): the only visible assignment of RBT is inside commented-out
# code in the synthetic insert cell; confirm where RBT is meant to come from
# (the per-file trees are in the list RBTs).
for key in files.keys():
    RedBlackInsertStart = timeit.default_timer()
    for word in files[key]:
        RBT.insertElement(word)
    RedBlackInsertEnd = timeit.default_timer()
    RedBlackInsertTime = RedBlackInsertEnd - RedBlackInsertStart
    print("Insert time taken for ", key, "is : ", RedBlackInsertTime)

Balanced Search Tree

In [None]:
# Legacy scratch cell: time the test-search queries against RBT.
# NOTE(review): RBT has no active assignment in the visible notebook; see
# SearchTime(RBTs, wordsforsearch) for the per-file trees.
RedBlackSearchStart = timeit.default_timer()
for ws in wordsforsearch:
    RBT.searchElement(ws)
RedBlackSearchEnd = timeit.default_timer()
RedBlackSearchTime = RedBlackSearchEnd - RedBlackSearchStart
print("Search time taken for 'test-search.txt' is : ", RedBlackSearchTime)

2-3 Balanced Search Tree

In [None]:
# Legacy scratch cell: insert every file's words into TTBSTs, timing each file.
# NOTE(review): the visible assignment of TTBSTs is the list returned by
# InsertTime, and a list has no insertElement method; this cell only runs if
# TTBSTs is rebound to a single tree.
for key in files.keys():
    TwoThreeInsertStart = timeit.default_timer()
    for word in files[key]:
        TTBSTs.insertElement(word)
    TwoThreeInsertEnd = timeit.default_timer()
    
    TwoThreeInsertTime = TwoThreeInsertEnd - TwoThreeInsertStart
    print("Insert time taken for ", key, "is : ", TwoThreeInsertTime)

2-3 Balanced Search Tree

In [None]:
# Legacy scratch cell: time the test-search queries against TTBSTs.
# NOTE(review): the visible assignment of TTBSTs is a list (from InsertTime),
# which has no searchElement method; see SearchTime(TTBSTs, wordsforsearch).
TwoThreeSearchStart = timeit.default_timer()
for ws in wordsforsearch:
    TTBSTs.searchElement(ws)
TwoThreeSearchEnd = timeit.default_timer()
TwoThreeSearchTime = TwoThreeSearchEnd - TwoThreeSearchStart
print("Search time taken for 'test-search.txt' is : ", TwoThreeSearchTime)

Bloom Filter

In [None]:
# Legacy scratch cell: insert every file's words into BF, timing each file.
# NOTE(review): BF is not assigned anywhere in the visible notebook (the
# per-file filters are in the list BFs); confirm where BF is meant to come from.
for key in files.keys():
    BFInsertStart = timeit.default_timer()
    for word in files[key]:
        BF.insertElement(word)
    BFInsertEnd = timeit.default_timer()
    BFInsertTime = BFInsertEnd - BFInsertStart
    print("Insert time taken for ", key, "is : ", BFInsertTime)

Bloom Filter

In [None]:
# Legacy scratch cell: time the test-search queries against BF.
# NOTE(review): BF is not assigned anywhere in the visible notebook; see
# SearchTime(BFs, wordsforsearch) for the per-file filters.
BFSearchStart = timeit.default_timer()
for word in wordsforsearch:
    BF.searchElement(word)
BFSearchEnd = timeit.default_timer()
BFSearchTime = BFSearchEnd - BFSearchStart
print("Search time taken for 'test-search.txt' is : ", BFSearchTime)

In [None]:
# Legacy scratch cell: time one insertElement call per file path.
# Note that the path strings themselves are inserted, not the file contents.
# NOTE(review): seqS is not assigned anywhere in the visible notebook.
a = './data/testfiles/test1-mobydick.txt'
b = './data/testfiles/test2-warpeace.txt'
c = './data/testfiles/test3-dickens.txt'
insertFiles = [a,b,c]

for file in insertFiles:
    seqInsertStart = timeit.default_timer()
    seqS.insertElement(file)
    seqInsertEnd = timeit.default_timer()
    seqInsertTime = seqInsertEnd - seqInsertStart
    print(seqInsertTime)

In [None]:
# Legacy scratch cell: after inserting each file path, time one search for
# the test-search path string d (path strings, not file contents).
# NOTE(review): seqS is not assigned anywhere in the visible notebook.
a = './data/testfiles/test1-mobydick.txt'
b = './data/testfiles/test2-warpeace.txt'
c = './data/testfiles/test3-dickens.txt'
d = "./data/testfiles/test-search.txt"
insertFiles = [a,b,c]
for file in insertFiles:
    seqS.insertElement(file)
    SeqSearchStart = timeit.default_timer()
    seqS.searchElement(d)
    SeqSearchEnd = timeit.default_timer()
    SeqSearchTime = SeqSearchEnd - SeqSearchStart
    print(SeqSearchTime)

In [None]:
# Legacy scratch cell: time one insertElement call per file path on rbt, the
# red-black tree created in the smoke-test cell. The path strings themselves
# are inserted, not the file contents, and the timing variables are named
# BST* although the target is the red-black tree.
a = './data/testfiles/test1-mobydick.txt'
b = './data/testfiles/test2-warpeace.txt'
c = './data/testfiles/test3-dickens.txt'
insertFiles = [a,b,c]

for file in insertFiles:
    BSTInsertStart = timeit.default_timer()
    rbt.insertElement(file)
    BSTInsertEnd = timeit.default_timer()
    BSTInsertTime = BSTInsertEnd - BSTInsertStart
    print(BSTInsertTime)


In [None]:
def insert_file():
    """Insert the three test-file path strings into the module-level ``bst``
    and print the time each single insert took.

    The path strings themselves are inserted, not the files' contents.
    """
    insertFiles = [
        './data/testfiles/test1-mobydick.txt',
        './data/testfiles/test2-warpeace.txt',
        './data/testfiles/test3-dickens.txt',
    ]
    for file in insertFiles:
        started = timeit.default_timer()
        bst.insertElement(file)
        finished = timeit.default_timer()
        print(finished - started)

def insert_real_data_test(words, bst):
    """Insert every entry of ``words`` into ``bst`` and print the total time."""
    started = timeit.default_timer()
    for entry in words:
        bst.insertElement(entry)
    elapsed = timeit.default_timer() - started
    print("Insert time: ", elapsed)

def search_words_in_set(words, bst):
    """Search ``bst`` for every entry of ``words`` and print the total time,
    labelled as the "word already in set" measurement."""
    t0 = timeit.default_timer()
    for query in words:
        bst.searchElement(query)
    t1 = timeit.default_timer()
    print("Search time (word already in set): ", t1 - t0)


def search_words(words, bst):
    """Search ``bst`` for every entry of ``words`` and print the total time,
    labelled as the "test-search words" measurement."""
    begin = timeit.default_timer()
    for candidate in words:
        bst.searchElement(candidate)
    duration = timeit.default_timer() - begin
    print("Search time (test-search words): ", duration)


# One binary search tree per test file.
bst1 = BinarySearchTreeSet()
bst2 = BinarySearchTreeSet()
bst3 = BinarySearchTreeSet()

# The first file's word list is words1 (the name ``words`` is not defined
# at module level).
insert_real_data_test(words1, bst1)
insert_real_data_test(words2, bst2)
insert_real_data_test(words3, bst3)

# Search each tree for the words that were inserted into it.
search_words_in_set(words1, bst1)
search_words_in_set(words2, bst2)
search_words_in_set(words3, bst3)

# search_words reports "test-search words", so it is given the query list
# loaded from test-search.txt rather than the file's own words again.
search_words(wordsforsearch, bst1)
search_words(wordsforsearch, bst2)
search_words(wordsforsearch, bst3)

In [None]:
# Legacy scratch cell: after inserting each file path into rbt (the red-black
# tree from the smoke-test cell), time one search for the test-search path
# string d. Path strings are used, not file contents, and the timing
# variables are named BST* although the target is the red-black tree.
a = './data/testfiles/test1-mobydick.txt'
b = './data/testfiles/test2-warpeace.txt'
c = './data/testfiles/test3-dickens.txt'
d = "./data/testfiles/test-search.txt"
insertFiles = [a,b,c]

for file in insertFiles:
    rbt.insertElement(file)
    BSTSearchStart = timeit.default_timer()
    rbt.searchElement(d)
    BSTSearchEnd = timeit.default_timer()
    BSTSearchTime = BSTSearchEnd - BSTSearchStart
    print(BSTSearchTime)

In [None]:
# 1st version

class TestDataGenerator():
    """First-version generator of synthetic integers and 4-letter strings.

    NOTE(review): this definition reuses the name of the fuller
    TestDataGenerator defined earlier in the notebook and replaces it once
    this cell runs.
    """

    def __init__(self):
        # Every generate* call appends to this one shared list.
        self.syntheticData = []

    # synthetic data -- integers:
    def generateNum(self, size):
        """Append ``size`` random integers drawn with randrange(0, 1000000),
        print the whole list and return it."""
        self.syntheticData.extend(
            random.randrange(0,1000000) for _ in range(size))
        print(self.syntheticData) # delete when it's finished
        return self.syntheticData

    # synthetic data -- 4-letter string (randomly without real meanings):
    def generateString(self,size):
        """For each lowercase letter, append ``size`` strings made of that
        letter followed by three random lowercase letters; print the whole
        list and return it."""
        letters = string.ascii_lowercase
        for first in letters:
            for _ in range(size):
                tail = ''.join(random.choice(letters) for _ in range(3))
                self.syntheticData.append(first + tail)
        print(self.syntheticData) # delete when it's finished
        return self.syntheticData

# Try out the first-version generator: one string per lowercase letter in
# toGenerate, and ten random integers in toSearch (each call also prints
# its list).
toGenerate = TestDataGenerator()
toSearch = TestDataGenerator()

toGenerate.generateString(1)
toSearch.generateNum(10)