In [46]:
import random

# Generate the datasets for the relations R1, R2, and R3.
# The generator is seeded so that a fresh "Restart Kernel -> Run All" produces
# the same relations, and therefore the same join results, on every run.
SEED = 42
random.seed(SEED)

# R1(A1, A2): A1 runs over 1..100, A2 is a random value in [1, 5000].
R1 = [(i, random.randint(1, 5000)) for i in range(1, 101)]
# R2(A2, A3): A2 is a random value in [1, 5000], A3 runs over 1..100.
R2 = [(random.randint(1, 5000), j) for j in range(1, 101)]
# R3(A3, A4): the identity pairs (l, l) for l in 1..100.
R3 = [(l, l) for l in range(1, 101)]
# The relations in join order; both join algorithms take this list.
R = [R1, R2, R3]


## Result with Algorithm from Q2

In [47]:
class K_Line_Joins:
    """Line (chain) join of relations with a semijoin reduction phase.

    Relations are sequences of tuples. The driver methods ``k_line_join`` and
    ``process_and_join`` assume binary relations in which the second attribute
    of each relation joins with the first attribute of the next relation.
    """

    def semijoin(self, relation1, relation2, key1, key2):
        """Keep the tuples of ``relation1`` that have a partner in ``relation2``.

        Args:
            relation1: Relation to be reduced.
            relation2: Relation providing the admissible join values.
            key1: Position of the join attribute in the tuples of ``relation1``.
            key2: Position of the join attribute in the tuples of ``relation2``.

        Returns:
            A new list with the tuples of ``relation1`` (original order) whose
            ``key1`` value occurs as a ``key2`` value in ``relation2``.
        """
        filter_set = {t[key2] for t in relation2}
        return [t for t in relation1 if t[key1] in filter_set]

    def join(self, relation1, relation2, key1, key2):
        """Hash-join two relations on the given attribute positions.

        Args:
            relation1: Left relation (probe side).
            relation2: Right relation (an index is built on it).
            key1: Position of the join attribute in the tuples of ``relation1``.
            key2: Position of the join attribute in the tuples of ``relation2``.

        Returns:
            A list with one tuple per matching pair: the left tuple followed
            by the right tuple with its join attribute removed. Duplicates in
            either input yield one output tuple per matching combination.
        """
        # Index the second relation on its join attribute.
        index = {}
        for t in relation2:
            index.setdefault(t[key2], []).append(t)

        result = []
        for t1 in relation1:
            for t2 in index.get(t1[key1], ()):
                # Drop exactly the join attribute of the right tuple, wherever
                # it sits (a negative key2 is normalised to a position first).
                drop = key2 if key2 >= 0 else key2 + len(t2)
                result.append(t1 + t2[:drop] + t2[drop + 1:])

        return result

    def k_line_join(self, relations):
        """Join a list of binary relations left to right.

        Args:
            relations: Relations in chain order.

        Returns:
            The joined tuples, or ``[]`` when ``relations`` is empty. With a
            single relation, that relation is returned unchanged.
        """
        if not relations:
            return []

        # Start with the first relation
        result = relations[0]

        # After joining i binary relations the accumulated tuples have i + 1
        # attributes, so the attribute shared with relations[i] is at index i.
        for i in range(1, len(relations)):
            result = self.join(result, relations[i], i, 0)

        return result

    def process_and_join(self, R):
        """Reduce the relations with semijoins, then join them.

        The reduction runs from the last relation towards the first, and each
        relation is reduced against the *already reduced* relation to its
        right. After this pass, every remaining tuple can be extended all the
        way to the end of the chain, so the left-to-right join produces no
        dangling intermediate tuples.

        Args:
            R: Binary relations in chain order.

        Returns:
            The joined tuples, or ``[]`` when ``R`` is empty.
        """
        if not R:
            return []

        R_reduced = list(R)
        for i in range(len(R_reduced) - 2, -1, -1):
            R_reduced[i] = self.semijoin(R_reduced[i], R_reduced[i + 1], 1, 0)

        return self.k_line_join(R_reduced)

In [48]:
# Run the semijoin-reduced line join (Q2 algorithm) on the generated relations.
k_line_joiner = K_Line_Joins()
final_result2 = k_line_joiner.process_and_join(R)

# Show one joined tuple per line below the heading.
print("Final Result:")
if final_result2:
    print("\n".join(str(row) for row in final_result2))

Final Result:
(29, 2513, 51, 51)
(54, 1092, 63, 63)
(63, 2323, 52, 52)
(78, 220, 20, 20)


## Result with Algorithm from Q3

In [49]:
class Chain_Joins:
    """Plain left-to-right chain join of relations, without any reduction.

    Relations are sequences of tuples; the last attribute of the accumulated
    result is joined with the first attribute of the next relation.
    """

    def hash_join(self, R1, R2):
        """Hash-join ``R1`` with ``R2`` on ``R1``'s last and ``R2``'s first attribute.

        Args:
            R1: Left relation (probe side).
            R2: Right relation (a hash table is built on it).

        Returns:
            A list with one tuple per matching pair: the ``R1`` tuple extended
            by the last attribute of the matching ``R2`` tuple.
        """
        # Build a hash table for R2 keyed on its first attribute.
        hash_table = {}
        for tup in R2:
            hash_table.setdefault(tup[0], []).append(tup)

        # Probe with the last attribute of every R1 tuple.
        result = []
        for tup in R1:
            for r2_tup in hash_table.get(tup[-1], ()):
                result.append(tup + (r2_tup[-1],))
        return result

    def chain_join(self, relations):
        """Join the relations pairwise from left to right.

        Args:
            relations: Relations in chain order.

        Returns:
            The joined tuples, or ``[]`` when ``relations`` is empty. With a
            single relation, that relation is returned unchanged.
        """
        # An empty chain has an empty result (instead of an IndexError).
        if not relations:
            return []

        current_result = relations[0]
        for R in relations[1:]:
            current_result = self.hash_join(current_result, R)
        return current_result

In [50]:
# Run the plain pairwise chain join (Q3 algorithm) on the same relations.
chain_joiner = Chain_Joins()
final_result3 = chain_joiner.chain_join(R)

# Show one joined tuple per line below the heading.
print("Final Result:")
if final_result3:
    print("\n".join(str(row) for row in final_result3))

Final Result:
(29, 2513, 51, 51)
(54, 1092, 63, 63)
(63, 2323, 52, 52)
(78, 220, 20, 20)


In [51]:
# List equality: both algorithms must return the same tuples in the same order.
results_match = final_result2 == final_result3
print("The results are the same." if results_match else "The results are not the same.")


The results are the same.


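The algorithm from Problem 2 (`K_Line_Joins`), which first reduces the relations with semijoins and only then joins them, has the better time complexity of $O(n + \mathrm{OUT})$, where $n$ is the input size and $\mathrm{OUT}$ is the size of the join result. The algorithm from Problem 3 (`Chain_Joins`), which joins the relations pairwise without any reduction, has a higher worst-case time complexity of $O(k \cdot n^2)$, where $k$ is the number of relations, because its intermediate results can be much larger than the final output.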