In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the local project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from k_popular_words import KMostPopularWords
from single_task import KMostSingle
from utils import ExperimentRunner

In [3]:
# Input files for every experiment, listed from smallest to largest
# (sizes as stated in the file names).
files = [
    "data/data_300MB.txt",
    "data/data_2.5GB.txt",
    "data/data_16GB.txt",
]
experiment_runner = ExperimentRunner()

# Number of top words requested from every experiment.
k = 10
# Chunk size in bytes (400 MiB); passed to the chunked top-k method.
chunk_size = 400 * 1024 ** 2
# NOTE(review): not referenced by any cell visible here — confirm it is still used.
num_threads = 8


# Case 1: 
- Run as a single task (without multi-threading or chunking)
- Sorting algorithm: Python's built-in sort (Timsort)

In [4]:
# Case 1 measurements, keyed by input file path (filled in by the loop cell below).
result1 = {}

In [5]:
# Case 1 sweep: run the baseline top-k method on every input file and store the
# runner's measurements per file. A file whose run raises ValueError gets an
# empty list so the remaining files are still processed.
for file_path in files:
    try:
        k_most = KMostPopularWords(file_path)
        baseline_result = experiment_runner.run_experiment(
            "Baseline" + file_path, k_most.get_top_k_words_baseline, k
        )
    except ValueError as err:
        # Report why the run produced no measurements instead of silently
        # recording an empty entry.
        print(f"No measurements for {file_path}: {err!r}")
        result1[file_path] = []
    else:
        print(experiment_runner.measure)
        print(baseline_result)
        result1[file_path] = experiment_runner.measure


Running experiment: Baselinedata/data_300MB.txt
Runtime: 32.20 seconds
CPU Utilization: 2.40%
Memory Usage: 974.57 MB
[32.20485997200012, 2.4, 974.57421875]
[('european', 318532), ('mr', 210638), ('would', 181905), ('also', 180117), ('commission', 172768), ('must', 156850), ('president', 152132), ('union', 130292), ('states', 129472), ('member', 126221)]
Running experiment: Baselinedata/data_2.5GB.txt
Runtime: 413.93 seconds
CPU Utilization: 2.00%
Memory Usage: 2009.16 MB
[413.9263937473297, 2.0, 2009.16015625]
[('said', 2616235), ('one', 949399), ('would', 917205), ('new', 852788), ('also', 727930), ('last', 700206), ('people', 688742), ('mr', 659104), ('us', 643771), ('de', 643048)]
Running experiment: Baselinedata/data_16GB.txt


In [6]:
# Per-file measurements as printed by the runner: [runtime in seconds,
# CPU utilization in %, memory usage in MB]. An empty list marks a run that
# raised ValueError.
result1

{'data/data_300MB.txt': [32.20485997200012, 2.4, 974.57421875],
 'data/data_2.5GB.txt': [413.9263937473297, 2.0, 2009.16015625],
 'data/data_16GB.txt': []}

# Case 2: 
- Run as a single task (without multi-threading or chunking)
- Sorting algorithm: Python `heapq.nlargest` (heap-based top-k selection)

In [7]:
# Case 2 measurements, keyed by input file path (filled in by the loop cell below).
result2 = {}

In [8]:
# Case 2 sweep: run the get_top_k_words_baseline2 variant on every input file
# and store the runner's measurements per file. A file whose run raises
# ValueError gets an empty list so the remaining files are still processed.
# NOTE(review): the experiment label reuses the "Baseline" prefix of Case 1, so
# the log lines of both cases look identical — consider a distinct label.
for file_path in files:
    try:
        k_most = KMostPopularWords(file_path)
        baseline_result = experiment_runner.run_experiment(
            "Baseline" + file_path, k_most.get_top_k_words_baseline2, k
        )
    except ValueError as err:
        # Report why the run produced no measurements instead of silently
        # recording an empty entry.
        print(f"No measurements for {file_path}: {err!r}")
        result2[file_path] = []
    else:
        print(experiment_runner.measure)
        print(baseline_result)
        result2[file_path] = experiment_runner.measure

Running experiment: Baselinedata/data_300MB.txt
Runtime: 34.83 seconds
CPU Utilization: 2.30%
Memory Usage: 2097.35 MB
[34.83403706550598, 2.3, 2097.34765625]
[('european', 318532), ('mr', 210638), ('would', 181905), ('also', 180117), ('commission', 172768), ('must', 156850), ('president', 152132), ('union', 130292), ('states', 129472), ('member', 126221)]
Running experiment: Baselinedata/data_2.5GB.txt
Running experiment: Baselinedata/data_16GB.txt


In [9]:
# Per-file measurements as printed by the runner: [runtime in seconds,
# CPU utilization in %, memory usage in MB]. An empty list marks a run that
# raised ValueError.
result2

{'data/data_300MB.txt': [34.83403706550598, 2.3, 2097.34765625],
 'data/data_2.5GB.txt': [],
 'data/data_16GB.txt': []}

# Case 3:
- Run as multiple tasks (with multi-threading and chunking)
- Sorting algorithm: Python's built-in sort (Timsort)

In [10]:
# Set up a one-off run on the smallest input file before sweeping all files.
file_path = "data/data_300MB.txt"
k_most = KMostPopularWords(file_path)

In [11]:
# One-off chunked run on the file selected in the previous cell; the last line
# displays the returned top-k list.
# NOTE(review): the recorded output of this cell contains tokens such as '-' and
# 'president,' and slightly different counts than the baseline output for the
# same file — looks like the chunked path tokenizes differently; confirm in
# k_popular_words.
chunking_result = experiment_runner.run_experiment("Chunking", k_most.get_top_k_words_chunk, k, chunk_size)
chunking_result

Running experiment: Chunking
Runtime: 17.15 seconds
CPU Utilization: 2.40%
Memory Usage: 2280.12 MB


[('european', 316722),
 ('mr', 210160),
 ('would', 179735),
 ('also', 175907),
 ('-', 162852),
 ('must', 153791),
 ('commission', 138407),
 ('president,', 125700),
 ('member', 124360),
 ('like', 108992)]

In [12]:
# Case 3 measurements, keyed by input file path (filled in by the loop cell below).
result3 = {}

In [None]:
# Case 3 sweep: run the chunked top-k method on every input file with the
# configured chunk size and store the runner's measurements per file. A file
# whose run raises ValueError gets an empty list so the remaining files are
# still processed.
# NOTE(review): `baseline_result` holds the chunked result here; the name is
# kept because other cells may read it.
for file_path in files:
    try:
        k_most = KMostPopularWords(file_path)
        baseline_result = experiment_runner.run_experiment(
            "Chunking-" + file_path, k_most.get_top_k_words_chunk, k, chunk_size
        )
    except ValueError as err:
        # Report why the run produced no measurements instead of silently
        # recording an empty entry.
        print(f"No measurements for {file_path}: {err!r}")
        result3[file_path] = []
    else:
        print(experiment_runner.measure)
        print(baseline_result)
        result3[file_path] = experiment_runner.measure

Running experiment: Chunking-data/data_300MB.txt
Runtime: 17.17 seconds
CPU Utilization: 2.40%
Memory Usage: 2846.54 MB
[17.17476201057434, 2.4, 2846.53515625]
[('european', 316722), ('mr', 210160), ('would', 179735), ('also', 175907), ('-', 162852), ('must', 153791), ('commission', 138407), ('president,', 125700), ('member', 124360), ('like', 108992)]
Running experiment: Chunking-data/data_2.5GB.txt
Runtime: 175.43 seconds
CPU Utilization: 2.30%
Memory Usage: 4650.02 MB
[175.4345681667328, 2.3, 4650.0234375]
[('said', 1575360), ('-', 1473479), ('would', 908990), ('one', 878036), ('new', 833235), ('said.', 726396), ('also', 716496), ('last', 688667), ('de', 640307), ('two', 615818)]
Running experiment: Chunking-data/data_16GB.txt


# Summary

## Speed