In [1]:
%load_ext autoreload
%autoreload 2

In [9]:
from k_popular_words import KMostPopularWords
from single_task import KMostSingle
from utils import ExperimentRunner
import pandas as pd

In [3]:
files = ["data/data_300MB.txt", "data/data_2.5GB.txt", "data/data_16GB.txt"]
experiment_runner = ExperimentRunner()

k = 10

In [4]:
result = []

# Case 1: 
- Run as single task (without multi-thread and chunking)
- Sorting Algorithm: python sort (Tim Sort)

In [5]:
for file_path in files:
    try: 
        k_most = KMostPopularWords(file_path)
        baseline_result = experiment_runner.run_experiment("Baseline"+file_path, k_most.get_top_k_words_baseline, k)
        print(baseline_result)
        result.append(["1", file_path, experiment_runner.wall_time, experiment_runner.cpu_time, experiment_runner.peak_memory_usage/ 1024 / 1024])
    except ValueError:
        result.append(["1", file_path, None, None, None])


Running experiment: Baselinedata/data_300MB.txt
Wall Time: 32.92 seconds
CPU Time: 32.83 seconds
Peak Memory Usage: 4375.38 MB
[('european', 318532), ('mr', 210638), ('would', 181905), ('also', 180117), ('commission', 172768), ('must', 156850), ('president', 152132), ('union', 130292), ('states', 129472), ('member', 126221)]
Running experiment: Baselinedata/data_2.5GB.txt
Running experiment: Baselinedata/data_16GB.txt


In [6]:
result

[['1',
  'data/data_300MB.txt',
  32.917779207229614,
  32.82740000000001,
  4375.37620639801],
 ['1', 'data/data_2.5GB.txt', None, None, None],
 ['1', 'data/data_16GB.txt', None, None, None]]

# Case 2: 
- Run as single task (without multi-thread and chunking
- Sorting Algorithm: python heapq nlargest (Heap Sort)

In [7]:
for file_path in files:
    try: 
        k_most = KMostPopularWords(file_path, "heap_sort")
        baseline_result = experiment_runner.run_experiment("Baseline"+file_path, k_most.get_top_k_words_baseline, k)
        print(baseline_result)
        result.append(["2", file_path, experiment_runner.wall_time, experiment_runner.cpu_time, experiment_runner.peak_memory_usage/ 1024 / 1024])
    except ValueError:
        result.append(["2", file_path, None, None, None])

Running experiment: Baselinedata/data_300MB.txt
Wall Time: 32.70 seconds
CPU Time: 32.63 seconds
Peak Memory Usage: 4375.47 MB
[('european', 318532), ('mr', 210638), ('would', 181905), ('also', 180117), ('commission', 172768), ('must', 156850), ('president', 152132), ('union', 130292), ('states', 129472), ('member', 126221)]
Running experiment: Baselinedata/data_2.5GB.txt
Running experiment: Baselinedata/data_16GB.txt


In [11]:
pd.DataFrame(result).to_csv('result.csv')
result

[['1',
  'data/data_300MB.txt',
  32.917779207229614,
  32.82740000000001,
  4375.37620639801],
 ['1', 'data/data_2.5GB.txt', None, None, None],
 ['1', 'data/data_16GB.txt', None, None, None],
 ['2',
  'data/data_300MB.txt',
  32.70103096961975,
  32.62513099999995,
  4375.474266052246],
 ['2', 'data/data_2.5GB.txt', None, None, None],
 ['2', 'data/data_16GB.txt', None, None, None]]

# Case 3:
- Run as multi-tasks (with chunking)
- Sorting Algorithm python sort (Tim Sort)
- Different chunk sizes

In [12]:
# test 1
chunk_sizes = [400*1024*1024, 200*1024*1024, 100*1024*1024]

In [None]:
for i in range(3):
    chunk_size = chunk_sizes[i]
    for file_path in files:
        try: 
            k_most = KMostPopularWords(file_path)
            baseline_result = experiment_runner.run_experiment("Chunking-Tim" + file_path, k_most.get_top_k_words_chunk, k, chunk_size)
            print(baseline_result)
            result.append(["3-" + str(i), file_path, experiment_runner.wall_time, experiment_runner.cpu_time, experiment_runner.peak_memory_usage/ 1024 / 1024])
        except ValueError:
            result.append(["3-" + str(i), file_path, None, None, None])

Running experiment: Chunking-Timdata/data_300MB.txt
Wall Time: 33.72 seconds
CPU Time: 33.61 seconds
Peak Memory Usage: 5021.16 MB
[('european', 318532), ('mr', 210638), ('would', 181905), ('also', 180117), ('commission', 172768), ('must', 156850), ('president', 152132), ('union', 130292), ('states', 129472), ('member', 126221)]
Running experiment: Chunking-Timdata/data_2.5GB.txt
Wall Time: 307.15 seconds
CPU Time: 296.66 seconds
Peak Memory Usage: 6635.57 MB
[('said', 2616235), ('one', 949399), ('would', 917205), ('new', 852788), ('also', 727930), ('last', 700206), ('people', 688742), ('mr', 659104), ('us', 643771), ('de', 643048)]
Running experiment: Chunking-Timdata/data_16GB.txt
Wall Time: 2368.40 seconds
CPU Time: 2004.32 seconds
Peak Memory Usage: 11324.80 MB
[('said', 16980655), ('would', 5822354), ('one', 5794014), ('new', 5609010), ('also', 4616671), ('us', 4496155), ('people', 4255373), ('last', 4089050), ('two', 3956855), ('first', 3832029)]
Running experiment: Chunking-Timd

In [None]:
pd.DataFrame(result).to_csv('result.csv')
result

# Case 4
- Run as multi-tasks (witt chunking)
- Sorting Algorithm: python heapq nlargest (Heap Sort)

In [None]:
chunk_size = 200*1024*1024

In [None]:
for file_path in files:
    try: 
        k_most = KMostPopularWords(file_path, "heap_sort")
        baseline_result = experiment_runner.run_experiment("Chunking-heap" + file_path, k_most.get_top_k_words_chunk, k, chunk_size)
        print(baseline_result)
        result.append(["4", file_path, experiment_runner.wall_time, experiment_runner.cpu_time, experiment_runner.peak_memory_usage/ 1024 / 1024])
    except ValueError:
        result.append(["4", file_path, None, None, None])

In [None]:
pd.DataFrame(result).to_csv('result.csv')
result

# Case 5
- Run as multi-tasks (with multi-threads and chunking)
- Sorting Algorithm: python sort (Tim Sort)

In [None]:
chunk_size = 200*1024*1024
num_thread = 8

In [None]:
for file_path in files:
    try: 
        k_most = KMostPopularWords(file_path)
        baseline_result = experiment_runner.run_experiment("multi-thread" + file_path, k_most.get_top_k_words_chunk_mt, k, num_thread, chunk_size)
        print(baseline_result)
        result.append(["5", file_path, experiment_runner.wall_time, experiment_runner.cpu_time, experiment_runner.peak_memory_usage/ 1024 / 1024])
    except ValueError:
        result.append(["5", file_path, None, None, None])

In [None]:
pd.DataFrame(result).to_csv('result.csv')
result

# Case 6
- Run as multi-tasks (with multi-process and chunking)
- Sorting Algorithm: python sort (Tim Sort)

In [None]:
chunk_size = 200*1024*1024
num_process = 8

In [None]:
for file_path in files:
    try: 
        k_most = KMostPopularWords(file_path)
        baseline_result = experiment_runner.run_experiment("multi-thread" + file_path, k_most.get_top_k_words_chunk_mp, k, num_process, chunk_size)
        print(baseline_result)
        result.append(["6", file_path, experiment_runner.wall_time, experiment_runner.cpu_time, experiment_runner.peak_memory_usage/ 1024 / 1024])
    except ValueError:
        result.append(["6", file_path, None, None, None])

In [None]:
pd.DataFrame(result).to_csv('result.csv')
result

# Summary

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# log Y axis
def result_plot(df, title, y_title):
    x = np.log(df['DataSize'])
    plt.plot(x, np.log(df['Case1']), marker='o', label='Case 1')
    plt.plot(x, np.log(df['Case2']), marker='o', label='Case 2')
    plt.plot(x, np.log(df['Case3_1']), marker='o', label='Case 3')
    
    plt.title(title + ' vs Data Size for different algorithms')
    plt.xlabel('Log Data Size in MB')
    plt.ylabel(y_title)
    plt.legend()

    # Display the plot
    plt.show()

# without log Y axis
def result_plot(df, title, y_title):
    x = np.log(df['DataSize'])
    plt.plot(x, df['Case1'], marker='o', label='Case 1')
    plt.plot(x, df['Case2'], marker='o', label='Case 2')
    plt.plot(x, df['Case3'], marker='o', label='Case 3')
    
    plt.title(title + ' vs Data Size for different algorithms')
    plt.xlabel('Log Data Size in MB')
    plt.ylabel(y_title)
    plt.legend()

    # Display the plot
    plt.show()

In [None]:
result3_1

## Speed

In [None]:
speed = {'data/data_300MB.txt': [300],
 'data/data_2.5GB.txt': [2.5*1024]
 # 'data/data_16GB.txt': [16*1024]
        }
results = [result1, result2, result3_1]
for result in results:
    for key, value in result.items():
        speed.setdefault(key, []).append(value[0])
df_speed = pd.DataFrame(speed.values(), columns = ['DataSize', 'Case1', 'Case2', 'Case3_1'])
df_speed

In [None]:
result_plot(df_speed, 'Runtime', 'Log Runtime in Second')

## CPU utilization

In [None]:
cpu = {'data/data_300MB.txt': [300],
 'data/data_2.5GB.txt': [2.5*1024]
 # 'data/data_16GB.txt': [16*1024]
      }
for result in results:
    for key, value in result.items():
        cpu.setdefault(key, []).append(value[2])
df_cpu = pd.DataFrame(cpu.values(), columns = ['DataSize', 'Case1', 'Case2', 'Case3_1'])
df_cpu

## Ram Utilization

In [None]:
ram = {'data/data_300MB.txt': [300],
 'data/data_2.5GB.txt': [2.5*1024],
 'data/data_16GB.txt': [16*1024]}
for result in results:
    for key, value in result.items():
        ram.setdefault(key, []).append(value[2])
        
        

In [None]:
df_ram = pd.DataFrame(ram.values(), columns = ['DataSize', 'Case1', 'Case2', 'Case3_1', 'Case3_2', 'Case4'])
df_ram.to_csv('ram.csv')

In [None]:
result_plot(df_ram, 'Ram Usage', 'Log Ram Usage in MB')