In [3]:
import pandas as pd
import os
import threading
from queue import Queue
import time

# Directory holding the CSV inputs; being relative, it is resolved against the
# current working directory.
BASE_PATH = '../../../data/'
# Locations of the two log files read by the loading benchmark.
user_log_path = os.path.join(BASE_PATH, 'USER_LOG.csv')
activity_log_path = os.path.join(BASE_PATH, 'ACTIVITY_LOG.csv')

# Non-threaded version
def load_data_non_threaded(file_path):
    """Read a CSV file into a DataFrame on the calling thread.

    Args:
        file_path: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` if reading raised an exception
        (the error is printed rather than propagated).
    """
    try:
        return pd.read_csv(file_path)
    except Exception as e:
        # Best-effort load: report the problem and fall through to None.
        print(f"Error loading {file_path}: {e}")
    return None

def measure_non_threaded():
    """Time loading both CSV files one after the other on the calling thread.

    The loaded frames are discarded; only the elapsed time is of interest.

    Returns:
        float: Elapsed time in seconds for the two sequential loads.
    """
    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed by system clock adjustments the way a
    # time.time() difference can.
    start_time = time.perf_counter()
    load_data_non_threaded(activity_log_path)
    load_data_non_threaded(user_log_path)
    return time.perf_counter() - start_time

# Threaded version
def load_data_threaded(file_path, output_queue):
    """Read a CSV file and hand the result to a queue (thread target).

    Args:
        file_path: Path of the CSV file, passed straight to ``pd.read_csv``.
        output_queue: Object with a ``put`` method that receives the result.

    The queue receives the loaded DataFrame, or ``None`` after an error has
    been printed. Nothing is returned.
    """
    try:
        # Keep the put inside the try so any failure here is reported below.
        output_queue.put(pd.read_csv(file_path))
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        # Still publish a placeholder so a consumer is not left waiting.
        output_queue.put(None)

def measure_threaded():
    """Time loading both CSV files concurrently, one thread per file.

    The timed interval covers thread creation, start, join and draining the
    result queue. The loaded frames are discarded.

    Returns:
        float: Elapsed time in seconds for the threaded load.
    """
    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed by system clock adjustments the way a
    # time.time() difference can.
    start_time = time.perf_counter()
    queue = Queue()
    threads = [
        threading.Thread(target=load_data_threaded, args=(path, queue))
        for path in (activity_log_path, user_log_path)
    ]

    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    # All workers have finished, so empty() is a reliable "done" signal here.
    while not queue.empty():
        queue.get()
    return time.perf_counter() - start_time

# Main execution
# Each benchmark is run exactly once; the non-threaded loader is timed first.
# NOTE(review): the second run may benefit from files already cached by the OS
# during the first run — consider a warm-up read or repeated runs before
# comparing the two numbers.
non_threaded_time = measure_non_threaded()
threaded_time = measure_threaded()

# Results
# Elapsed times are reported in seconds with six decimal places.
print(f"Non-threaded loading time: {non_threaded_time:.6f} seconds")
print(f"Threaded loading time: {threaded_time:.6f} seconds")

Non-threaded loading time: 0.126739 seconds
Threaded loading time: 0.079921 seconds


In [9]:
import pandas as pd
import threading
from queue import Queue
import time
import os

# Directory holding the CSV inputs; being relative, it is resolved against the
# current working directory.
BASE_PATH = '../../../data/'
# Locations of the two log files that are loaded and merged below.
user_log_path = os.path.join(BASE_PATH, 'USER_LOG.csv')
activity_log_path = os.path.join(BASE_PATH, 'ACTIVITY_LOG.csv')

def load_data(file_path):
    """Read a CSV file into a DataFrame.

    Args:
        file_path: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` if reading raised an exception.
    """
    try:
        return pd.read_csv(file_path)
    except Exception as e:
        # Report the failure instead of swallowing it silently, so a later
        # "data frames are None" message can be traced to its cause. The
        # wording matches the other loaders in this notebook.
        print(f"Error loading {file_path}: {e}")
        return None

def merge_data(activity_log, user_log):
    """Inner-join the activity log with the user log on the user-name column.

    Args:
        activity_log: Left-hand DataFrame, or ``None``.
        user_log: Right-hand DataFrame, or ``None``.

    Returns:
        The merged DataFrame, or ``None`` (after printing an error) when
        either input is ``None``.
    """
    # Guard clause: refuse to merge when either input is missing.
    if activity_log is None or user_log is None:
        print("Error: One or more data frames are None, cannot proceed with merge.")
        return None
    return pd.merge(
        activity_log,
        user_log,
        on='User Full Name *Anonymized',
        how='inner',
    )

def measure_non_threaded_merge():
    """Time the merge of the two logs after loading them sequentially.

    Both files are loaded before the timer starts, so only the
    ``merge_data`` call is measured.

    Returns:
        float: Elapsed merge time in seconds.
    """
    activity_log = load_data(activity_log_path)
    user_log = load_data(user_log_path)
    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed by system clock adjustments the way a
    # time.time() difference can.
    start_time = time.perf_counter()
    merge_data(activity_log, user_log)
    return time.perf_counter() - start_time


# Threaded version
# The ``def`` line below was missing, which left this body as unreachable code
# after the ``return`` above and made measure_threaded_merge depend on a
# definition from an earlier cell.
def load_data_threaded(file_path, output_queue):
    """Read a CSV file and hand the result to a queue (thread target).

    Args:
        file_path: Path of the CSV file, passed straight to ``pd.read_csv``.
        output_queue: Object with a ``put`` method that receives the result.

    The queue receives the loaded DataFrame, or ``None`` after an error has
    been printed. Nothing is returned.
    """
    try:
        data = pd.read_csv(file_path)
        output_queue.put(data)
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        # Still publish a placeholder so a consumer is not left waiting.
        output_queue.put(None)

def measure_threaded_merge():
    """Time the merge of the two logs after loading them in parallel threads.

    Both files are loaded (one thread each) before the timer starts, so only
    the ``merge_data`` call is measured.

    Returns:
        float: Elapsed merge time in seconds.
    """
    # One queue per loader thread. With a single shared queue the results
    # arrive in completion order, so the activity and user frames could be
    # swapped depending on which thread happened to finish first.
    activity_queue = Queue()
    user_queue = Queue()
    threads = [
        threading.Thread(target=load_data_threaded, args=(activity_log_path, activity_queue)),
        threading.Thread(target=load_data_threaded, args=(user_log_path, user_queue)),
    ]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    activity_log = activity_queue.get()
    user_log = user_queue.get()

    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed by system clock adjustments the way a
    # time.time() difference can.
    start_time = time.perf_counter()
    merge_data(activity_log, user_log)
    return time.perf_counter() - start_time


# MAIN
def main():
    """Run the sequential and threaded merge benchmarks and print both timings."""
    # Tuple elements are evaluated left to right, so the non-threaded
    # benchmark still runs before the threaded one.
    non_threaded_time, threaded_time = (
        measure_non_threaded_merge(),
        measure_threaded_merge(),
    )
    print("\n--- Results ---")
    print(f"Non-threaded merge time: {non_threaded_time:.6f} seconds")
    print(f"Threaded merge time: {threaded_time:.6f} seconds")


main()

Loaded data from ../../../data/ACTIVITY_LOG.csvLoaded data from ../../../data/USER_LOG.csv


--- Results ---
Non-threaded merge time: 45.338281 seconds
Threaded merge time: 42.648805 seconds
