### Necessary Imports

In [1]:
import hashlib
import csv
import random
import time
import threading

### Function to Generate Input File Structure

The function generates a CSV input file that begins with a `Name,Hash Value` header row, followed by data lines with the following structure:

- Each line starts with a user ID in the format `User_i`, where `i` is a random number ranging from 1 to 1 million.
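- Following each user ID, there is the SHA-256 hex digest of a random integer between `min_value_to_hash` and `max_value_to_hash`, inclusive (1000 to 10000 by default; this notebook passes 1 to 1,000,000).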

Here's an example section of the generated file: 
```
User_392253,74ed65a2d22a92c3b4e013c15ff04d05f4954cd792d9cf56e9a0edad5f914ed1
User_852766,5901aee4bc888df51bde5904a4c56d0a68536fb5157e19001973286ceed51354
User_513257,abbfc2b6da87b49139e8a13ce2ebf510818cfa6bc42e8cec990d36235dbb99bc
User_646094,c8ace20a55c88e4d1fc94009b763c6690efa764f5e6497cc736acf069b1fbc82
```

In [2]:
# Number of data rows written to the generated source file.
NUM_LINES = 1_000_000
# Inclusive range of integers whose SHA-256 digests are written to the source
# file; the cracking functions below precompute digests over this same range.
MIN_VALUE_TO_HASH = 1
MAX_VALUE_TO_HASH = 1_000_000

In [3]:
def generate_input_file(file_name:str, num_lines=100, min_value_to_hash=1000, max_value_to_hash=10_000):
    """Write a CSV file of random user names paired with SHA-256 digests.

    The file starts with a ``Name,Hash Value`` header row. Each of the
    ``num_lines`` data rows holds ``User_<i>`` (``i`` drawn from 1 to
    1,000,000) and the SHA-256 hex digest of the decimal string of a random
    integer drawn from ``min_value_to_hash`` to ``max_value_to_hash``
    (both ends inclusive).

    Args:
        file_name: Path of the CSV file to create (overwritten if it exists).
        num_lines: Number of data rows to write after the header.
        min_value_to_hash: Smallest integer that may be hashed.
        max_value_to_hash: Largest integer that may be hashed.
    """
    def _random_row():
        # The user id is drawn before the secret value, one pair per row.
        user = 'User_' + str(random.randint(1, 1_000_000))
        secret = random.randint(min_value_to_hash, max_value_to_hash)
        digest = hashlib.sha256(str(secret).encode()).hexdigest()
        return user, digest

    # newline='' lets the csv module control line endings itself.
    with open(file_name, 'w', newline='') as csv_file:
        out = csv.writer(csv_file)
        out.writerow(('Name', 'Hash Value'))  # column headings
        out.writerows(_random_row() for _ in range(num_lines))

# Build the CSV consumed by the cracking functions, using the notebook-level settings.
generate_input_file(
    'source.csv',
    num_lines=NUM_LINES,
    min_value_to_hash=MIN_VALUE_TO_HASH,
    max_value_to_hash=MAX_VALUE_TO_HASH,
)

### Function to Reverse the Hash Values with a Precomputed Lookup Table (Rainbow-Table Style Attack)

In [5]:
def rainbow_password_hack(hashed_dict, input_file_name, output_file_name,
                          min_value=None, max_value=None):
    """Recover the hashed numbers in a CSV file using a precomputed lookup table.

    ``hashed_dict`` is filled in place with ``{sha256 hex digest: number}`` for
    every integer in the searched range. Each data row ``name,digest`` of the
    input file is then written to the output file as ``name,number``.

    Args:
        hashed_dict: Dictionary that receives the digest -> number table.
        input_file_name: CSV file with a header row followed by ``name,digest`` rows.
        output_file_name: File that receives the ``name,number`` rows (overwritten).
        min_value: Smallest number to precompute; defaults to ``MIN_VALUE_TO_HASH``.
        max_value: Largest number to precompute (inclusive); defaults to
            ``MAX_VALUE_TO_HASH``.

    Raises:
        KeyError: If a digest in the input is not in the precomputed range.
        ValueError: If a non-blank row does not have exactly two comma-separated fields.
    """
    # Fall back to the notebook-level range only when the caller gives none,
    # so the function can also be used with an explicit range.
    if min_value is None:
        min_value = MIN_VALUE_TO_HASH
    if max_value is None:
        max_value = MAX_VALUE_TO_HASH

    # Build the table {hash of a given number: number being hashed}.
    for num in range(min_value, max_value + 1):
        hash_value = hashlib.sha256(str(num).encode()).hexdigest()
        hashed_dict[hash_value] = num

    with open(input_file_name, 'r') as input_file:
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(input_file, None)
        # Blank lines (e.g. a trailing empty line) carry no record and are ignored.
        lines = [line for line in input_file if line.strip()]

    # The input is fully read and closed before the output is opened.
    with open(output_file_name, 'w') as output_file:
        for line in lines:
            name, hash_value = line.strip().split(',')
            output_file.write(f"{name},{hashed_dict[hash_value]}\n")

In [12]:
hashed_dict = {}
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is a wall clock that can be adjusted.
start_time = time.perf_counter()
rainbow_password_hack(hashed_dict, 'source.csv', 'result.csv')
end_time = time.perf_counter()
print(f'It took {end_time - start_time} seconds to finish the task without multithreading')

It took 1.9815020561218262 seconds to finish the task without multithreading


### Using Multithreading and Batch Processing

In [8]:
import threading
import hashlib

def rainbow_password_hack_multithreaded(hashed_dict, input_file_name, output_file_name,
                                        batch_size=100, num_threads=8,
                                        min_value=None, max_value=None):
    """Recover hashed numbers from a CSV file, resolving batches of rows on several threads.

    ``hashed_dict`` is filled in place with ``{sha256 hex digest: number}`` for
    every integer in the searched range. The data rows of the input file are
    split into batches, worker threads turn each batch into ``name,number``
    text, and the batches are written to the output file in input order.

    Args:
        hashed_dict: Dictionary that receives the digest -> number table.
        input_file_name: CSV file with a header row followed by ``name,digest`` rows.
        output_file_name: File that receives the ``name,number`` rows (overwritten).
        batch_size: Number of rows handled as one unit of work.
        num_threads: Maximum number of worker threads to start.
        min_value: Smallest number to precompute; defaults to ``MIN_VALUE_TO_HASH``.
        max_value: Largest number to precompute (inclusive); defaults to
            ``MAX_VALUE_TO_HASH``.

    Raises:
        ValueError: If ``batch_size`` or ``num_threads`` is less than 1, or a
            non-blank row does not have exactly two comma-separated fields.
        KeyError: If a digest in the input is not in the precomputed range.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if num_threads < 1:
        raise ValueError("num_threads must be at least 1")
    if min_value is None:
        min_value = MIN_VALUE_TO_HASH
    if max_value is None:
        max_value = MAX_VALUE_TO_HASH

    for num in range(min_value, max_value + 1):
        # convert the number to a string and compute its hash value using sha256 algorithm.
        hash_value = hashlib.sha256(str(num).encode()).hexdigest()
        hashed_dict[hash_value] = num

    with open(input_file_name, 'r') as input_file:
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(input_file, None)
        lines = [line for line in input_file if line.strip()]

    batches = [lines[i:i + batch_size] for i in range(0, len(lines), batch_size)]
    # One slot per batch. Each slot is written by exactly one worker, so no lock
    # is needed and the final output keeps the input order.
    results = [None] * len(batches)
    errors = []

    def process_batches(first_index):
        # Worker k handles batches k, k + num_threads, k + 2 * num_threads, ...
        try:
            for index in range(first_index, len(batches), num_threads):
                resolved = []
                for line in batches[index]:
                    name, hash_value = line.strip().split(',')
                    resolved.append(f"{name},{hashed_dict[hash_value]}\n")
                results[index] = ''.join(resolved)
        except Exception as exc:
            # Remember the failure so the caller sees it instead of a silently
            # truncated output file.
            errors.append(exc)

    # A fixed number of threads share the batches instead of one thread per batch.
    threads = [
        threading.Thread(target=process_batches, args=(k,))
        for k in range(min(num_threads, len(batches)))
    ]
    for t in threads:
        t.start()
    # Wait for all threads to finish
    for t in threads:
        t.join()

    if errors:
        raise errors[0]

    # Only the calling thread touches the output file.
    with open(output_file_name, 'w') as output_file:
        output_file.writelines(results)

In [9]:
hashed_dict_2 = {}
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is a wall clock that can be adjusted.
start_time = time.perf_counter()
rainbow_password_hack_multithreaded(hashed_dict_2, 'source.csv', 'result.csv')
end_time = time.perf_counter()
print(f'It took {end_time - start_time} seconds to finish the task with multithreading')

It took 2.671334981918335 seconds to finish the task with multithreading


### Parallel Hashing

In [18]:
import concurrent.futures
import hashlib

def rainbow_password_hack_parallel(hashed_dict, input_file_name, output_file_name, num_threads,
                                   batch_size=100, min_value=None, max_value=None):
    """Recover hashed numbers from a CSV file using a thread pool.

    ``hashed_dict`` is filled in place with ``{sha256 hex digest: number}`` for
    every integer in the searched range. The data rows of the input file are
    split into batches that the pool turns into ``name,number`` text; the
    calling thread writes the batches to the output file in input order.

    Args:
        hashed_dict: Dictionary that receives the digest -> number table.
        input_file_name: CSV file with a header row followed by ``name,digest`` rows.
        output_file_name: File that receives the ``name,number`` rows (overwritten).
        num_threads: ``max_workers`` of the thread pool.
        batch_size: Number of rows handled by one pool task.
        min_value: Smallest number to precompute; defaults to ``MIN_VALUE_TO_HASH``.
        max_value: Largest number to precompute (inclusive); defaults to
            ``MAX_VALUE_TO_HASH``.

    Raises:
        ValueError: If ``batch_size`` is less than 1, or a non-blank row does
            not have exactly two comma-separated fields.
        KeyError: If a digest in the input is not in the precomputed range.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if min_value is None:
        min_value = MIN_VALUE_TO_HASH
    if max_value is None:
        max_value = MAX_VALUE_TO_HASH

    for num in range(min_value, max_value + 1):
        hash_value = hashlib.sha256(str(num).encode()).hexdigest()
        hashed_dict[hash_value] = num

    with open(input_file_name, 'r') as input_file:
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(input_file, None)
        lines = [line for line in input_file if line.strip()]

    def resolve_batch(batch):
        # Runs on a pool thread: pure computation, no file access.
        resolved = []
        for line in batch:
            name, hash_value = line.strip().split(',')
            resolved.append(f"{name},{hashed_dict[hash_value]}\n")
        return ''.join(resolved)

    batches = (lines[i:i + batch_size] for i in range(0, len(lines), batch_size))

    # A single handle, written only by the calling thread, replaces reopening
    # the file in append mode from every task.
    with open(output_file_name, 'w') as output_file:
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
            # map() yields results in submission order and re-raises any
            # exception from a task when its result is retrieved.
            for text in executor.map(resolve_batch, batches):
                output_file.write(text)

    # All tasks have completed at this point
    print("All tasks completed!")

def process_batch(lines, hashed_dict, output_file_name):
    """Append one ``name,number`` row per entry of ``lines`` to ``output_file_name``.

    Each entry is a ``name,digest`` text line; the digest is looked up in
    ``hashed_dict`` to obtain the number written next to the name.

    Args:
        lines: Iterable of ``name,digest`` strings (surrounding whitespace is stripped).
        hashed_dict: Mapping from digest to the number that produced it.
        output_file_name: File opened in append mode for the resulting rows.
    """
    with open(output_file_name, 'a') as sink:
        # Rows are produced lazily and written one after another, in order.
        sink.writelines(
            f"{user},{hashed_dict[digest]}\n"
            for user, digest in (row.strip().split(',') for row in lines)
        )

In [20]:
hashed_dict_3 = {}
# perf_counter() is the monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is a wall clock that can be adjusted.
start_time = time.perf_counter()
rainbow_password_hack_parallel(hashed_dict_3, 'source.csv', 'result.csv', 4)
end_time = time.perf_counter()
print(f'It took {end_time - start_time} seconds to finish the task with multithreading')

All tasks completed!
It took 2.7446000576019287 seconds to finish the task with multithreading
