In [1]:
import random

# Generating dataset for R1
R1 = [(i, 5) for i in range(1, 1001)] + [(i, 7) for i in range(1001, 2001)]
R1.append((2001, 2002))
random.shuffle(R1)  # Random permutation of tuples in R1

# Generating dataset for R2
R2 = [(5, i) for i in range(1, 1001)] + [(7, i) for i in range(1001, 2001)]
R2.append((2002, 8))
random.shuffle(R2)  # Random permutation of tuples in R2

# Generating dataset for R3
R3 = [(random.randint(2002, 3000), random.randint(1, 3000)) for _ in range(2000)]
R3.append((8, 30))
random.shuffle(R3)  # Random permutation of tuples in R3

R = [R1, R2, R3]


## Result with Algorithm from Q2

In [2]:
class K_Line_Joins:
    def semijoin(self, relation1, relation2, key1, key2):
        """ Reduce relation1 by performing a semijoin with relation2 on specified keys. """
        filter_set = {t[key2] for t in relation2}
        return [t for t in relation1 if t[key1] in filter_set]

    def join(self, relation1, relation2, key1, key2):
        """ Join two relations on specified keys, handling duplicates appropriately. """
        index = {}

        # Create an index for the second relation on the join key
        for t in relation2:
            if t[key2] in index:
                index[t[key2]].append(t)
            else:
                index[t[key2]] = [t]
        # print(index)

        # Perform the join
        result = []
        for t1 in relation1:
            if t1[key1] in index:
                for t2 in index[t1[key1]]:
                    # Concatenate tuples while removing the repeated join attribute from the second tuple
                    result.append(t1 + t2[1:])  # Skip the join attribute of the second tuple

        return result

    def k_line_join(self, relations):
        """ Perform a k-line join on a list of relations using the specified keys. """
        if not relations:
            return []

        # Start with the first relation
        result = relations[0]

        # Perform successive joins
        for i in range(1, len(relations)):
            result = self.join(result, relations[i], i, 0)
            # print(result)

        return result
    def process_and_join(self, R):
        R_reduced = []
        for i in range(len(R) - 1):
            reduced = self.semijoin(R[i], R[i+1], 1, 0)
            R_reduced.append(reduced)
        R_reduced.append(R[-1])
        final_result = self.k_line_join(R_reduced)
        return final_result

In [3]:
import cProfile
import pstats


In [4]:
def run_line_joins():
    k_line_joiner = K_Line_Joins()
    final_result2 = k_line_joiner.process_and_join(R)
    return final_result2

profile = cProfile.Profile()
profile.runcall(run_line_joins)
stats = pstats.Stats(profile)
stats.sort_stats('cumulative').print_stats(10)

         3155 function calls in 0.002 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.002    0.002 /var/folders/qs/cgg92xbs5dz6hzkr23vrjgc80000gn/T/ipykernel_19065/2761201456.py:1(run_line_joins)
        1    0.000    0.000    0.002    0.002 /var/folders/qs/cgg92xbs5dz6hzkr23vrjgc80000gn/T/ipykernel_19065/738718309.py:43(process_and_join)
        1    0.000    0.000    0.001    0.001 /var/folders/qs/cgg92xbs5dz6hzkr23vrjgc80000gn/T/ipykernel_19065/738718309.py:29(k_line_join)
        2    0.001    0.000    0.001    0.001 /var/folders/qs/cgg92xbs5dz6hzkr23vrjgc80000gn/T/ipykernel_19065/738718309.py:7(join)
        2    0.000    0.000    0.000    0.000 /var/folders/qs/cgg92xbs5dz6hzkr23vrjgc80000gn/T/ipykernel_19065/738718309.py:2(semijoin)
        2    0.000    0.000    0.000    0.000 /var/folders/qs/cgg92xbs5dz6hzkr23vrjgc80000gn/T/ipykernel_19065/738718309.py:5(<listcomp>)
        2    0.

<pstats.Stats at 0x1104740d0>

In [5]:
final_result2 = run_line_joins()
print("Final Result:")
for line in final_result2:
    print(line)

Final Result:
(898, 5, 8, 30)
(81, 5, 8, 30)
(995, 5, 8, 30)
(751, 5, 8, 30)
(494, 5, 8, 30)
(544, 5, 8, 30)
(225, 5, 8, 30)
(125, 5, 8, 30)
(990, 5, 8, 30)
(520, 5, 8, 30)
(308, 5, 8, 30)
(650, 5, 8, 30)
(805, 5, 8, 30)
(781, 5, 8, 30)
(408, 5, 8, 30)
(601, 5, 8, 30)
(964, 5, 8, 30)
(975, 5, 8, 30)
(240, 5, 8, 30)
(963, 5, 8, 30)
(411, 5, 8, 30)
(968, 5, 8, 30)
(499, 5, 8, 30)
(832, 5, 8, 30)
(36, 5, 8, 30)
(273, 5, 8, 30)
(175, 5, 8, 30)
(477, 5, 8, 30)
(231, 5, 8, 30)
(168, 5, 8, 30)
(770, 5, 8, 30)
(484, 5, 8, 30)
(893, 5, 8, 30)
(486, 5, 8, 30)
(884, 5, 8, 30)
(844, 5, 8, 30)
(259, 5, 8, 30)
(579, 5, 8, 30)
(547, 5, 8, 30)
(635, 5, 8, 30)
(933, 5, 8, 30)
(888, 5, 8, 30)
(277, 5, 8, 30)
(130, 5, 8, 30)
(343, 5, 8, 30)
(394, 5, 8, 30)
(430, 5, 8, 30)
(85, 5, 8, 30)
(101, 5, 8, 30)
(305, 5, 8, 30)
(632, 5, 8, 30)
(950, 5, 8, 30)
(229, 5, 8, 30)
(320, 5, 8, 30)
(143, 5, 8, 30)
(326, 5, 8, 30)
(167, 5, 8, 30)
(561, 5, 8, 30)
(249, 5, 8, 30)
(537, 5, 8, 30)
(603, 5, 8, 30)
(667, 5, 8, 3

## Result with Algorithm in Q3

In [6]:
class Chain_Joins:
    def hash_join(self, R1, R2):
        hash_table = {}
        # Building hash table for R2
        for tup in R2:
            key = tup[0]
            if key in hash_table:
                hash_table[key].append(tup)
            else:
                hash_table[key] = [tup]

        result = []
        # Perform the join
        for tup in R1:
            key = tup[-1]
            if key in hash_table:
                for r2_tup in hash_table[key]:
                    result.append(tup + (r2_tup[-1],))
        return result
    def chain_join(self, relations):
        current_result = relations[0]
        for R in relations[1:]:
            current_result = self.hash_join(current_result, R)
        return current_result

In [7]:
def run_chain_joins():
    chain_joiner = Chain_Joins()
    final_result3 = chain_joiner.chain_join(R)
    return final_result3

profile = cProfile.Profile()
profile.runcall(run_chain_joins)
stats = pstats.Stats(profile)
stats.sort_stats('cumulative').print_stats(10)

         2004141 function calls in 0.513 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.513    0.513 /var/folders/qs/cgg92xbs5dz6hzkr23vrjgc80000gn/T/ipykernel_19065/292647011.py:1(run_chain_joins)
        1    0.021    0.021    0.513    0.513 /var/folders/qs/cgg92xbs5dz6hzkr23vrjgc80000gn/T/ipykernel_19065/2912444394.py:20(chain_join)
        2    0.421    0.211    0.492    0.246 /var/folders/qs/cgg92xbs5dz6hzkr23vrjgc80000gn/T/ipykernel_19065/2912444394.py:2(hash_join)
  2004136    0.071    0.000    0.071    0.000 {method 'append' of 'list' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}




<pstats.Stats at 0x1100cdad0>

In [8]:
final_result3 = run_chain_joins()
print("Final Result:")
for line in final_result3:
    print(line)

Final Result:
(898, 5, 8, 30)
(81, 5, 8, 30)
(995, 5, 8, 30)
(751, 5, 8, 30)
(494, 5, 8, 30)
(544, 5, 8, 30)
(225, 5, 8, 30)
(125, 5, 8, 30)
(990, 5, 8, 30)
(520, 5, 8, 30)
(308, 5, 8, 30)
(650, 5, 8, 30)
(805, 5, 8, 30)
(781, 5, 8, 30)
(408, 5, 8, 30)
(601, 5, 8, 30)
(964, 5, 8, 30)
(975, 5, 8, 30)
(240, 5, 8, 30)
(963, 5, 8, 30)
(411, 5, 8, 30)
(968, 5, 8, 30)
(499, 5, 8, 30)
(832, 5, 8, 30)
(36, 5, 8, 30)
(273, 5, 8, 30)
(175, 5, 8, 30)
(477, 5, 8, 30)
(231, 5, 8, 30)
(168, 5, 8, 30)
(770, 5, 8, 30)
(484, 5, 8, 30)
(893, 5, 8, 30)
(486, 5, 8, 30)
(884, 5, 8, 30)
(844, 5, 8, 30)
(259, 5, 8, 30)
(579, 5, 8, 30)
(547, 5, 8, 30)
(635, 5, 8, 30)
(933, 5, 8, 30)
(888, 5, 8, 30)
(277, 5, 8, 30)
(130, 5, 8, 30)
(343, 5, 8, 30)
(394, 5, 8, 30)
(430, 5, 8, 30)
(85, 5, 8, 30)
(101, 5, 8, 30)
(305, 5, 8, 30)
(632, 5, 8, 30)
(950, 5, 8, 30)
(229, 5, 8, 30)
(320, 5, 8, 30)
(143, 5, 8, 30)
(326, 5, 8, 30)
(167, 5, 8, 30)
(561, 5, 8, 30)
(249, 5, 8, 30)
(537, 5, 8, 30)
(603, 5, 8, 30)
(667, 5, 8, 3

In [9]:
if final_result2 == final_result3:
    print("The results are the same.")
else:
    print("The results are not the same.")


The results are the same.


 the algorithm from Problem 2 (K_Line_Joins) has a better time complexity of O(n + OUT), while the algorithm from Problem 3 (Chain_Joins) has a higher time complexity of O(k * n^2), where k is the number of relations.