# Tests

In this notebook several tests are performed to analyze the scalability and fault tolerance of the application.

To achieve comparable test results, all tests (if not stated otherwise) are executed on a local spark cluster with the following allocation of resources:

| Item      | Resources        | Total Resources (in cluster) |
|-----------|-------------------|-------------------------------|
| Workers   | 3                 | 3                             |
| Executors | 2 per Worker      | 6                             |
| RAM         | 3 GB per Executor | 18 GB                         |
| Cores     | 1 per Executor    | 6                             |



In [31]:
import os

# Root folder for all data handled by this notebook (relative to the current working directory)
local_path = os.path.join(os.getcwd(), 'data')
# Folder that holds one parquet dataset per month (written by the download/convert cell)
parquet_path = os.path.join(local_path, 'parquet_test')
# Alternative result folder for fresh test runs (disabled):
# result_folder_path = os.path.join(os.getcwd(), 'test_results_new')
# The following directory was used to store the test results, delivered with the project
result_folder_path = os.path.join(os.getcwd(), 'test_results_delivered')

# Base url of the spark rest api; used below to look up the application id,
# the stage input/output bytes and the cache (storage) information
spark_rest_api_url = "http://localhost:4040/api/v1/applications"

# The same sql query, which is used by superset to display the heatmap (aggregated version)
# Reads the global temp view GDELT_AGGR and derives the average from the pre-computed sum and count columns
SQL_REQUEST_AGGREGATED = """
        SELECT ActionGeo_CountryCode AS ActionGeo_CountryCode,
               SUM(GoldsteinScaleSum)/SUM(EventCount) AS GoldsteinScaleAvg
        FROM
          (SELECT *
           FROM global_temp.GDELT_AGGR) AS virtual_table
        GROUP BY ActionGeo_CountryCode
    """

# The same sql query, which is used by superset to display the heatmap (non-aggregated version)
# Reads the global temp view GDELT and filters null values inside the query itself
SQL_REQUEST_NON_AGGREGATED = """
        SELECT ActionGeo_CountryCode AS ActionGeo_CountryCode,
               AVG(GoldsteinScale) AS GoldsteinScaleAvg
        FROM
          (SELECT *
           FROM global_temp.GDELT
           WHERE ActionGeo_CountryCode IS NOT NULL
             AND GoldsteinScale IS NOT NULL) AS virtual_table
        GROUP BY ActionGeo_CountryCode
    """

In [2]:
from datetime import datetime, timedelta

# Time period for which the data is downloaded (both bounds are inclusive)
start_date = datetime.strptime('2015-07-01', '%Y-%m-%d')
end_date = datetime.strptime('2023-12-31', '%Y-%m-%d')

# One datetime (at midnight) per calendar day of the period
number_of_days = (end_date - start_date).days + 1
date_list = [start_date + timedelta(days=day_offset) for day_offset in range(number_of_days)]

In [3]:
import urllib.request

# Build the download urls: a nested list with one inner list per month,
# each holding the urls of all files of that month
base_url = 'http://data.gdeltproject.org/gdeltv2/'
url_list = [[]]
index = 0
month = date_list[0].month

for date in date_list:
    # A change of month opens a new (empty) inner list
    if date.month != month:
        url_list.append([])
        index = len(url_list) - 1
        month = date.month

    # One url per 15-minute step of the day (96 per day), in chronological order
    for minutes_since_midnight in range(0, 24 * 60, 15):
        date_tmp = date + timedelta(minutes=minutes_since_midnight)
        url_list[index].append(base_url + date_tmp.strftime('%Y%m%d%H%M%S') + '.export.CSV.zip')

In [32]:
# Create the local directories if they don't exist yet.
# os.makedirs(..., exist_ok=True) replaces the check-then-create pattern: it has no gap between
# the existence check and the creation, and it also creates missing parent directories.
# local_path comes first because parquet_path is located inside of it.
for directory in (local_path, parquet_path, result_folder_path):
    os.makedirs(directory, exist_ok=True)

In [5]:
from pyspark.sql import SparkSession

# Start a spark session, or reuse an already running one (see config folder for spark config)
spark = (
    SparkSession.builder
    .appName('Big Data Project')
    .enableHiveSupport()
    .getOrCreate()
)

In [6]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType

# Define original data schema for csv files
# The csv files are read without a header row, so the fields are matched by position:
# the order of the entries below must not be changed.
# All fields are declared nullable (third argument is True).
schema = StructType([
    # Event id and date columns ("Day" is parsed with dateFormat='yyyyMMdd' when the csv files are read)
    StructField("GlobalEventID", IntegerType(), True),
    StructField("Day", DateType(), True),
    StructField("MonthYear", IntegerType(), True),
    StructField("Year", IntegerType(), True),
    StructField("FractionDate", FloatType(), True),
    # Actor1 columns
    StructField("Actor1Code", StringType(), True),
    StructField("Actor1Name", StringType(), True),
    StructField("Actor1CountryCode", StringType(), True),
    StructField("Actor1KnownGroupCode", StringType(), True),
    StructField("Actor1EthnicCode", StringType(), True),
    StructField("Actor1Religion1Code", StringType(), True),
    StructField("Actor1Religion2Code", StringType(), True),
    StructField("Actor1Type1Code", StringType(), True),
    StructField("Actor1Type2Code", StringType(), True),
    StructField("Actor1Type3Code", StringType(), True),
    # Actor2 columns (same layout as the Actor1 columns)
    StructField("Actor2Code", StringType(), True),
    StructField("Actor2Name", StringType(), True),
    StructField("Actor2CountryCode", StringType(), True),
    StructField("Actor2KnownGroupCode", StringType(), True),
    StructField("Actor2EthnicCode", StringType(), True),
    StructField("Actor2Religion1Code", StringType(), True),
    StructField("Actor2Religion2Code", StringType(), True),
    StructField("Actor2Type1Code", StringType(), True),
    StructField("Actor2Type2Code", StringType(), True),
    StructField("Actor2Type3Code", StringType(), True),
    # Event columns ("GoldsteinScale" is the value averaged per country for the heatmap)
    StructField("IsRootEvent", IntegerType(), True),
    StructField("EventCode", StringType(), True),
    StructField("EventBaseCode", StringType(), True),
    StructField("EventRootCode", StringType(), True),
    StructField("QuadClass", IntegerType(), True),
    StructField("GoldsteinScale", FloatType(), True),
    StructField("NumMentions", IntegerType(), True),
    StructField("NumSources", IntegerType(), True),
    StructField("NumArticles", IntegerType(), True),
    StructField("AvgTone", FloatType(), True),
    # Actor1Geo columns
    StructField("Actor1Geo_Type", IntegerType(), True),
    StructField("Actor1Geo_FullName", StringType(), True),
    StructField("Actor1Geo_CountryCode", StringType(), True),
    StructField("Actor1Geo_ADM1Code", StringType(), True),
    StructField("Actor1Geo_ADM2Code", StringType(), True),
    StructField("Actor1Geo_Lat", FloatType(), True),
    StructField("Actor1Geo_Long", FloatType(), True),
    StructField("Actor1Geo_FeatureID", StringType(), True),
    # Actor2Geo columns
    StructField("Actor2Geo_Type", IntegerType(), True),
    StructField("Actor2Geo_FullName", StringType(), True),
    StructField("Actor2Geo_CountryCode", StringType(), True),
    StructField("Actor2Geo_ADM1Code", StringType(), True),
    StructField("Actor2Geo_ADM2Code", StringType(), True),
    StructField("Actor2Geo_Lat", FloatType(), True),
    StructField("Actor2Geo_Long", FloatType(), True),
    StructField("Actor2Geo_FeatureID", StringType(), True),
    # ActionGeo columns ("ActionGeo_CountryCode" is the column that is mapped and grouped by in the tests)
    StructField("ActionGeo_Type", IntegerType(), True),
    StructField("ActionGeo_FullName", StringType(), True),
    StructField("ActionGeo_CountryCode", StringType(), True),
    StructField("ActionGeo_ADM1Code", StringType(), True),
    StructField("ActionGeo_ADM2Code", StringType(), True),
    StructField("ActionGeo_Lat", FloatType(), True),
    StructField("ActionGeo_Long", FloatType(), True),
    StructField("ActionGeo_FeatureID", StringType(), True),
    # Record metadata (both kept as plain strings)
    StructField("DATEADDED", StringType(), True),
    StructField("SOURCEURL", StringType(), True),
])

In [7]:
import zipfile

def download_file(url):
    """Download one zipped csv file and extract it into its year/month folder.

    The target folder is derived from the file name: the first four characters
    are used as the year folder and the following two as the month folder below
    ``local_path``. The folder itself is expected to exist already.

    Errors are reported via ``print`` and not raised, so that a single failing
    file does not abort a parallel download of a whole month.

    :param url: Download url of a single ``*.export.CSV.zip`` file.
    :return: None
    """
    fname = url.split('/')[-1]
    folder_location = os.path.join(local_path, fname[:4], fname[4:6])
    zip_file_path = os.path.join(folder_location, fname)
    csv_file_path = zip_file_path.replace(".zip", "")

    # Nothing to do, if the extracted csv file exists already
    if os.path.isfile(csv_file_path):
        print('File ' + fname + ' already exists')
        return

    try:
        urllib.request.urlretrieve(url, zip_file_path)

        # Unzip zip file
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(folder_location)

    except Exception as e:
        print(f"An error occurred with file {fname}: {e}")

    finally:
        # Delete the zip file in every case: a partially downloaded or corrupt archive must not stay
        # in the month folder, because the whole folder is read as csv input by spark afterwards
        try:
            os.remove(zip_file_path)
        except FileNotFoundError:
            # The download failed before any file was created
            pass
        except OSError as e:
            print(f"An error occurred with file {fname}: {e}")

In [8]:
import shutil
from concurrent.futures import ThreadPoolExecutor

# Download the csv files of each month in parallel and convert them into one parquet dataset per month.
# Working month by month allows simple addition of new months to already existing data.
for i, month_list in enumerate(url_list):
    parquet_file = os.path.join(parquet_path, str(i) + ".parquet")

    # Months that were converted in an earlier run are skipped
    if os.path.exists(parquet_file):
        continue

    # Year and month are taken from the name of the first file of the month
    first_file_name = month_list[0].split('/')[-1]
    year_folder = os.path.join(local_path, first_file_name[:4])
    month_folder = os.path.join(year_folder, first_file_name[4:6])

    for folder in (year_folder, month_folder):
        if not os.path.isdir(folder):
            os.mkdir(folder)

    # Download all files of the month in parallel (default number of worker threads of the executor);
    # leaving the with-block waits until all downloads are finished
    with ThreadPoolExecutor() as executor:
        executor.map(download_file, month_list)

    # Read all csv files of the month into a spark dataframe
    df = spark.read.csv(month_folder, sep='\t', header=False, schema=schema, dateFormat='yyyyMMdd')

    # Write the data of the month into a parquet file
    df.write.parquet(parquet_file, mode='overwrite')

    # Delete the csv files to free up disk space
    shutil.rmtree(month_folder)

In [9]:
import pandas as pd
import requests

# Get the csv and parquet file sizes via the input/output bytes of the stages from the spark rest api
result_file_path = os.path.join(result_folder_path, 'data_size.csv')

if not os.path.isdir(result_folder_path):
    os.mkdir(result_folder_path)

# The application id is taken from the first entry of the application list
apps_response = requests.get(spark_rest_api_url)
apps = apps_response.json()
app_id = apps[0]['id']

# Get stages information for the application (1 stage per parquet file write)
stages_response = requests.get(f"{spark_rest_api_url}/{app_id}/stages")
stages_data = stages_response.json()

# Keep only the necessary information of each stage, keyed by the stage id
stage_result = {
    stage['stageId']: {
        'status': stage['status'],
        'input_data': stage['inputBytes'],
        'output_data': stage['outputBytes']
    }
    for stage in stages_data
}

df_result = pd.DataFrame.from_dict(stage_result, orient='index')

# Persist the sizes for the later analysis; an already existing file is never overwritten
if not os.path.isfile(result_file_path):
    df_result.to_csv(result_file_path, sep=';')

In [10]:
from pyspark.sql.functions import broadcast
from pyspark.sql import functions as F


# Method to run the non-aggregated version of the program repeatedly in the test loop
def run_non_aggregated(df_base):
    """Run the non-aggregated version of the program on the given data.

    Replaces the values of ``ActionGeo_CountryCode`` with the mapped 'ISO 3166-1' codes
    (necessary for the superset heatmap), caches the result and publishes it as the
    global temp view ``GDELT``.

    :param df_base: Spark dataframe with the event data (must contain ``ActionGeo_CountryCode``).
    :return: The cached, mapped spark dataframe.
    """
    # CSV file containing a mapping from FIPS10-4 country codes to ISO 3166-1 alpha-2 country codes
    mapping_file_path = os.path.join(os.getcwd(), 'util', 'country_code_mapping.csv')

    # The mapping file is read with spark; only the two mapping columns are kept
    df_mapping = spark.read.csv(mapping_file_path, sep=';', header=True, inferSchema=True) \
        .select(F.col('FIPS 10-4'), F.col('ISO 3166-1'))

    # Left outer join, so events without a matching country code are kept (their code becomes null).
    # The small mapping table is marked for broadcasting.
    join_condition = df_base['ActionGeo_CountryCode'] == df_mapping['FIPS 10-4']
    df_non_aggregated = (
        df_base
        .join(broadcast(df_mapping), join_condition, 'left_outer')
        .withColumn('ActionGeo_CountryCode', F.col('ISO 3166-1'))
        .drop('ISO 3166-1')
        .drop('FIPS 10-4')
    )

    # Mark for caching, trigger the caching with an action and create a global temp view
    # (as it would be necessary to use the data with superset)
    df_non_aggregated.cache()
    df_non_aggregated.count()
    df_non_aggregated.createOrReplaceGlobalTempView("GDELT")

    return df_non_aggregated

In [11]:
# Method to run the aggregated version of the program repeatedly in the test loop
def run_aggregated(df_base):
    """Run the aggregated version of the program on the given data.

    Reduces the data to one row per day and country (sum of the Goldstein scale values and
    number of events), replaces the country codes with the mapped 'ISO 3166-1' codes
    (necessary for the superset heatmap), caches the result and publishes it as the
    global temp view ``GDELT_AGGR``.

    :param df_base: Spark dataframe with the event data (must contain ``Day``,
        ``ActionGeo_CountryCode`` and ``GoldsteinScale``).
    :return: The cached, aggregated spark dataframe.
    """
    # Keep only the columns relevant for the aggregation and remove rows that contain
    # null values, which would distort the aggregation results
    df_selection = df_base \
        .select(F.col('Day'), F.col('ActionGeo_CountryCode'), F.col('GoldsteinScale')) \
        .na.drop()

    # One row per country per day, holding the sum and the number of the aggregated values
    df_aggregated = df_selection.groupBy('Day', 'ActionGeo_CountryCode').agg(
        F.sum('GoldsteinScale').alias('GoldsteinScaleSum'),
        F.count('*').alias('EventCount')
    )

    # CSV file containing a mapping from FIPS10-4 country codes to ISO 3166-1 alpha-2 country codes
    mapping_file_path = os.path.join(os.getcwd(), 'util', 'country_code_mapping.csv')

    # The mapping file is read with spark; only the two mapping columns are kept
    df_mapping = spark.read.csv(mapping_file_path, sep=';', header=True, inferSchema=True) \
        .select(F.col('FIPS 10-4'), F.col('ISO 3166-1'))

    # Left outer join, so rows without a matching country code are kept (their code becomes null).
    # The small mapping table is marked for broadcasting.
    join_condition = df_aggregated['ActionGeo_CountryCode'] == df_mapping['FIPS 10-4']
    df_aggregated = (
        df_aggregated
        .join(broadcast(df_mapping), join_condition, 'left_outer')
        .withColumn('ActionGeo_CountryCode', F.col('ISO 3166-1'))
        .drop('ISO 3166-1')
        .drop('FIPS 10-4')
    )

    # Mark for caching, trigger the caching with an action and create a global temp view
    # (as it would be necessary to use the data with superset)
    df_aggregated.cache()
    df_aggregated.count()
    df_aggregated.createOrReplaceGlobalTempView("GDELT_AGGR")

    return df_aggregated

In [12]:
import requests

# Method to get the current cache information from the spark rest api
def get_cache_information():
    """Read the current cache usage from the spark rest api.

    :return: Tuple ``(memoryUsed, diskUsed)`` of the first entry of the storage list.
    """
    # The application id is taken from the first entry of the application list
    app_id = requests.get(spark_rest_api_url).json()[0]['id']

    # Storage information of the application
    storage_url = f"{spark_rest_api_url}/{app_id}/storage/rdd"
    storage_data = requests.get(storage_url).json()

    # Only one dataframe is cached at a time, so only the first entry is relevant
    first_entry = storage_data[0]
    return first_entry['memoryUsed'], first_entry['diskUsed']

In [13]:
import time
from pyhive import hive
import pandas as pd

def average_sql_request_response_time(sql, n=10):
    """Measure the average response time of a sql query sent to the thrift server.

    The query is executed ``n`` times sequentially over one connection. Establishing
    the connection is not part of the measured time.

    :param sql: Sql statement to execute.
    :param n: Number of sequential executions.
    :return: Average time in seconds per execution (including fetching all result rows).
    """
    # Create connection to thrift server
    with hive.connect(host='localhost', port=10000, username='spark') as connection:
        request_start_time = time.time()

        for _ in range(n):
            # Send the request and fetch the complete result, so the full round trip is measured
            cursor = connection.cursor()
            cursor.execute(sql)
            cursor.fetchall()

        elapsed_time = time.time() - request_start_time

    return elapsed_time / n

In [14]:
import os
import pandas as pd

def write_result_to_csv(result_dict, file_name):
    """Persist one test result as a row of a csv file in the result folder.

    A new file is created together with a header row; an existing file gets
    the row appended without a header.

    :param result_dict: Test result, the keys are used as column names.
    :param file_name: Name of the csv file inside ``result_folder_path``.
    :return: None
    """
    result_file_path = os.path.join(result_folder_path, file_name)

    # Append to an existing file, otherwise create it (the header is only written on creation)
    if os.path.isfile(result_file_path):
        write_mode, write_header = 'a', False
    else:
        write_mode, write_header = 'w', True

    pd.DataFrame([result_dict]).to_csv(result_file_path, sep=';', mode=write_mode, header=write_header)

In [15]:
from py4j.java_gateway import java_import

# Start the thrift server inside the JVM of the current spark session.
# It is the server that the tests below connect to via pyhive to send their sql queries.

# Retrieve the spark context from the current spark session
sc = spark.sparkContext

# Import the HiveThriftServer2 class using the JVM instance of the spark context
java_import(sc._jvm, "org.apache.spark.sql.hive.thriftserver.HiveThriftServer2")

# Dummy java arguments for main method (an empty java String array)
java_args = sc._gateway.new_array(sc._gateway.jvm.java.lang.String, 0)

# Start the thrift server by calling the main method of the imported class
sc._jvm.org.apache.spark.sql.hive.thriftserver.HiveThriftServer2.main(java_args)

## Scaling Data Volume

The following tests are performed to analyze how increasing the data volume affects the performance of the application.

Both versions of the program (aggregated and non-aggregated) are repeatedly executed in a loop with an increasing amount of data. In each test run, performance metrics and cache information are collected and persisted to a csv file, to be used later for analysis.

In contrast to `main.py`, the aggregated and non-aggregated versions are completely separated to test the individual performance of both versions.


### Data Volume
The data volume is increased by incrementing the number of parquet files (each representing a month of data) in every test run.

There are a total of 102 parquet files (8.5 years of data).

Up to a full year of data (12 parquet files), tests are run in increments of 1 month (1 parquet file) to get a more granular view of the performance effects.
Afterward, tests are run with increasing increments to analyze the performance effects at scale, while keeping the number of test runs low.
The increment starts at 6 months and increases by 6 months with every test run (+6, +12, +18, ... parquet files), until the maximum of 8.5 years of data (102 parquet files) is reached.

The tests are therefore conducted with the following numbers of parquet files: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18, 30, 48, 72, 102.

### Performance
The effect on the performance is analyzed by measuring different metrics:
- **Pre-processing turnaround time**: The amount of time it takes to load the data into spark, conduct the necessary pre-processing & cache the results.
- **Query response time**: The amount of time it takes to process and return the results of a single sql query, which is sent to the thrift server. An average of 10 sequential queries is calculated for more stable results.

In [16]:
# Paths of all parquet files in the directory (102 files, named after their index)
parquet_path_list = [os.path.join(parquet_path, str(file_index) + ".parquet") for file_index in range(0, 102)]

# Numbers of parquet files used in the individual test runs
parquet_file_test_cases = [*range(1, 13), 18, 30, 48, 72, 102]

In [17]:
import time

exception_occurred = False

# Dataframe cached by the current run. It starts as None, so a failure in the very first run
# (before anything was cached) does not end in a NameError when the cache is released below.
df_aggregated = None

# Run the aggregated version of the main logic (data loading & pre-processing) in a loop with increasing data volume
# After the first failed run, all remaining test cases are recorded as FAILED without being executed
for i in parquet_file_test_cases:
    if not exception_occurred:
        try:
            # Start timer
            start_time = time.time()

            # Read in the parquet files relevant for the current run
            df = spark.read.parquet(*parquet_path_list[:i])

            # Run the aggregated version of the main logic
            df_aggregated = run_aggregated(df)

            # Stop timer
            end_time = time.time()

            # Calculate duration of the run
            duration = end_time - start_time

            # Get cache information
            memory_usage, disk_usage = get_cache_information()

            # Get the average response time of an sql query
            avg_req_time = average_sql_request_response_time(SQL_REQUEST_AGGREGATED)

            # Combine test results of the current run
            result = {
                'status': 'COMPLETE',
                'last_file_index': i-1, # This is necessary to calculate the size of the data used for this test run
                'duration': duration,
                'memory_usage': memory_usage,
                'disk_usage': disk_usage,
                'avg_req_time': avg_req_time
            }

            print(f"{i} parquet files: Duration: {duration}s, memory usage: {memory_usage}B, disk usage: {disk_usage}B, average response time: {avg_req_time}s")

        # If an exception occurs, the test run should fail
        except Exception as e:
            print(f"An error occurred: {e}")
            exception_occurred = True

    if exception_occurred:
        result = {
            'status': 'FAILED',
            'last_file_index': i-1,
            'duration': 0,
            'memory_usage': 0,
            'disk_usage': 0,
            'avg_req_time': 0
        }

    # Test results are persisted every run to avoid losing them in case of a crash
    write_result_to_csv(result, 'test_data_volume_aggregated.csv')

    # Remove from cache to prevent interference with the next run
    # (only if a dataframe was cached and not released yet)
    if df_aggregated is not None:
        df_aggregated.unpersist()
        df_aggregated = None

1 parquet files: Duration: 14.413100719451904s, memory usage: 212952B, disk usage: 0B, average response time: 0.2546949625015259s
2 parquet files: Duration: 12.536877870559692s, memory usage: 392008B, disk usage: 0B, average response time: 0.25118160247802734s
3 parquet files: Duration: 14.607834815979004s, memory usage: 570016B, disk usage: 0B, average response time: 0.2640686988830566s
4 parquet files: Duration: 15.318191766738892s, memory usage: 739032B, disk usage: 0B, average response time: 0.15903708934783936s
5 parquet files: Duration: 16.810499906539917s, memory usage: 902976B, disk usage: 0B, average response time: 0.13215315341949463s
6 parquet files: Duration: 19.658199548721313s, memory usage: 1078560B, disk usage: 0B, average response time: 0.11190409660339355s
7 parquet files: Duration: 23.625364780426025s, memory usage: 1233000B, disk usage: 0B, average response time: 0.1268925905227661s
8 parquet files: Duration: 25.880151510238647s, memory usage: 1387568B, disk usage: 

In [18]:
import time

exception_occurred = False

# Dataframe cached by the current run. It starts as None, so a failure in the very first run
# (before anything was cached) does not end in a NameError when the cache is released below.
df_non_aggregated = None

# Run the non-aggregated version of the main logic (data loading & pre-processing) in a loop with increasing data volume
# After the first failed run, all remaining test cases are recorded as FAILED without being executed
for i in parquet_file_test_cases:
    if not exception_occurred:
        try:
            # Start timer
            start_time = time.time()

            # Read in the parquet files relevant for the current run
            df = spark.read.parquet(*parquet_path_list[:i])

            # Run the non-aggregated version of the main logic
            df_non_aggregated = run_non_aggregated(df)

            # Stop timer
            end_time = time.time()

            # Calculate duration of the run
            duration = end_time - start_time

            # Get cache information
            memory_usage, disk_usage = get_cache_information()

            # Get the average response time of an sql query
            avg_req_time = average_sql_request_response_time(SQL_REQUEST_NON_AGGREGATED)

            # Combine test results of the current run
            result = {
                'status': 'COMPLETE',
                'last_file_index': i - 1,  # This is necessary to calculate the size of the data used for this test run
                'duration': duration,
                'memory_usage': memory_usage,
                'disk_usage': disk_usage,
                'avg_req_time': avg_req_time
            }

            print(f"{i} parquet files: Duration: {duration}s, memory usage: {memory_usage}B, disk usage: {disk_usage}B, average response time: {avg_req_time}s")

        # If an exception occurs, the test run should fail
        except Exception as e:
            print(f"An error occurred: {e}")
            exception_occurred = True

    if exception_occurred:
        result = {
            'status': 'FAILED',
            'last_file_index': i - 1,
            'duration': 0,
            'memory_usage': 0,
            'disk_usage': 0,
            'avg_req_time': 0
        }

    # Test results are persisted every run to avoid losing them in case of a crash
    write_result_to_csv(result, 'test_data_volume_non_aggregated.csv')

    # Remove from cache to prevent interference with the next run
    # (only if a dataframe was cached and not released yet)
    if df_non_aggregated is not None:
        df_non_aggregated.unpersist()
        df_non_aggregated = None

1 parquet files: Duration: 34.120455265045166s, memory usage: 1273540248B, disk usage: 0B, average response time: 0.3191996574401855s
2 parquet files: Duration: 49.46705436706543s, memory usage: 2475682816B, disk usage: 0B, average response time: 0.4143723964691162s
3 parquet files: Duration: 61.652783155441284s, memory usage: 3727293664B, disk usage: 0B, average response time: 0.5487101793289184s
4 parquet files: Duration: 77.62182712554932s, memory usage: 4952817680B, disk usage: 0B, average response time: 0.6659678459167481s
5 parquet files: Duration: 96.44000601768494s, memory usage: 6191491184B, disk usage: 0B, average response time: 0.8035871982574463s
6 parquet files: Duration: 113.14305400848389s, memory usage: 7367461568B, disk usage: 0B, average response time: 0.934669041633606s
7 parquet files: Duration: 130.55651116371155s, memory usage: 8589660192B, disk usage: 0B, average response time: 1.0509275913238525s
8 parquet files: Duration: 153.46874284744263s, memory usage: 9717

## Scaling Load

The following tests are performed to analyze how an increased load influences the performance of the application.

In the given use case, an increasing number of users using the dashboard at the same time or an increasing number of dashboard elements (e.g. charts, tables, etc.) could be considered an increase in load. In both cases the number of concurrent queries that are sent from superset to the thrift server would increase. For that reason, we define load as the number of queries that are sent to the thrift server in parallel.

Once the program is executed and the data is cached, an incrementing number of queries are sent to the thrift server in parallel. In each test run performance metrics are measured and persisted to a csv file. The tests are executed for both versions of the program (aggregated and non-aggregated) separately.


### Load
The number of queries sent to the thrift server in parallel is increased in increments of 10, starting at 10 up to 100. Afterward, it is increased in increments of 50, starting at 150 up to 500. Finally, the number of queries is increased in increments of 100, starting at 600 up to 1000.

To simulate queries being sent in parallel (e.g. by different users), a separate thread is started for each query. Every thread establishes an isolated connection to the thrift server and sends a single query.


### Performance
The effect on the performance is analyzed by measuring the following metric:
- **Average query response time**: The amount of time it takes to process and return the results of an sql query sent to the thrift server under the given load.

### Data Volume
The data volume is kept constant to test the effect of additional load in an isolated way. 1 parquet file (1 month of data) is used as a baseline for all test runs to ensure that the cached data fits completely into the memory of the spark cluster and to reduce the runtime of the tests.

In [19]:
import threading
import time
import queue


def run_query(sql, data_queue, stop_event):
    """Send a single query to the thrift server and report its response time.

    Intended to be used as a thread target: the response time (or the exception,
    if the query fails) is put into ``data_queue``.

    :param sql: Sql statement to execute.
    :param data_queue: Queue that collects the response times / exceptions of all threads.
    :param stop_event: Event that is set by the first failing thread; threads that see
        it set do not send their query anymore.
    :return: None
    """
    # Another thread failed already, so this query is not sent anymore
    if stop_event.is_set():
        return

    try:
        data_queue.put(average_sql_request_response_time(sql, 1))
    except Exception as e:
        # Save the exception for the caller and stop the threads that did not start their query yet
        data_queue.put(e)
        stop_event.set()


def run_queries_parallel(sql, n):
    """Send ``n`` queries to the thrift server in parallel and average their response times.

    :param sql: Sql statement to execute.
    :param n: Number of queries (one thread per query).
    :return: Sum of the collected response times divided by ``n`` (in seconds).
    :raises Exception: Re-raises the exception of a failed query, so the test run fails.
    """
    data_queue = queue.Queue()
    stop_event = threading.Event()
    workers = []

    # One thread per query; each thread is started right after its creation
    for _ in range(n):
        worker = threading.Thread(target=run_query, args=(sql, data_queue, stop_event))
        workers.append(worker)
        worker.start()

    # Wait until all threads are finished
    for worker in workers:
        worker.join()

    # Drain the queue; all threads are finished, so no further entries are added
    response_times = []
    while True:
        try:
            entry = data_queue.get_nowait()
        except queue.Empty:
            break

        # If an exception occurred in one of the threads, the test run should fail
        if isinstance(entry, Exception):
            raise entry
        response_times.append(entry)

    return sum(response_times) / n

In [20]:
# Number of parquet files used in the test
files = 1
# Numbers of queries sent in parallel: 10-100 (steps of 10), 150-500 (steps of 50), 600-1000 (steps of 100)
query_test_cases = [*range(10, 101, 10), *range(150, 501, 50), *range(600, 1001, 100)]

In [21]:
# Run the aggregated version of the program once as measurement object for the load tests
df = spark.read.parquet(*parquet_path_list[:files])
df_aggregated = run_aggregated(df)

exception_occurred = False

# After the first failed run, all remaining test cases are recorded as FAILED without being executed
for i in query_test_cases:
    # Start with the record of a failed run; it is completed below, if the run succeeds
    result = {
        'status': 'FAILED',
        'last_file_index': files-1,
        'parallel_queries': i,
        'avg_req_time': 0
    }

    if not exception_occurred:
        # Execute n queries in parallel and calculate the average response time
        try:
            avg_req_time = run_queries_parallel(SQL_REQUEST_AGGREGATED, i)
            print(f"Average response time with {i} parallel queries: {avg_req_time}s")
            result.update(status='COMPLETE', avg_req_time=avg_req_time)

        # If an exception occurs, the test run should fail
        except Exception as e:
            print(f"An error occurred: {e}")
            exception_occurred = True

    # Test results are persisted every run to avoid losing them in case of a crash
    write_result_to_csv(result, 'test_load_aggregated.csv')

df_aggregated.unpersist()

Average response time with 10 parallel queries: 0.21779217720031738s
Average response time with 20 parallel queries: 0.36007262468338014s
Average response time with 30 parallel queries: 0.29999205271402996s
Average response time with 40 parallel queries: 0.4859184265136719s
Average response time with 50 parallel queries: 0.5870481777191162s
Average response time with 60 parallel queries: 0.7180324633916219s
Average response time with 70 parallel queries: 0.6845898219517299s
Average response time with 80 parallel queries: 0.7939853012561798s
Average response time with 90 parallel queries: 1.063152543703715s
Average response time with 100 parallel queries: 1.24916330575943s
Average response time with 150 parallel queries: 1.6525200843811034s
Average response time with 200 parallel queries: 2.004344297647476s
Average response time with 250 parallel queries: 2.4741074285507203s
Average response time with 300 parallel queries: 3.037146798769633s
Average response time with 350 parallel queri

DataFrame[Day: date, ActionGeo_CountryCode: string, GoldsteinScaleSum: double, EventCount: bigint]

In [22]:
# Run the non-aggregated version of the program once as measurement object for the load tests
df = spark.read.parquet(*parquet_path_list[:files])
df_non_aggregated = run_non_aggregated(df)

exception_occurred = False

# After the first failed run, all remaining test cases are recorded as FAILED without being executed
for i in query_test_cases:
    # Start with the record of a failed run; it is completed below, if the run succeeds
    result = {
        'status': 'FAILED',
        'last_file_index': files-1,
        'parallel_queries': i,
        'avg_req_time': 0
    }

    if not exception_occurred:
        # Execute n queries in parallel and calculate the average response time
        try:
            avg_req_time = run_queries_parallel(SQL_REQUEST_NON_AGGREGATED, i)
            print(f"Average response time with {i} parallel queries: {avg_req_time}s")
            result.update(status='COMPLETE', avg_req_time=avg_req_time)

        # If an exception occurs, the test run should fail
        except Exception as e:
            print(f"An error occurred: {e}")
            exception_occurred = True

    # Test results are persisted every run to avoid losing them in case of a crash
    write_result_to_csv(result, 'test_load_non_aggregated.csv')

df_non_aggregated.unpersist()

Average response time with 10 parallel queries: 1.5785735368728637s
Average response time with 20 parallel queries: 3.267963206768036s
Average response time with 30 parallel queries: 4.547449461619059s
Average response time with 40 parallel queries: 5.626710474491119s
Average response time with 50 parallel queries: 6.816288423538208s
Average response time with 60 parallel queries: 8.923525830109915s
Average response time with 70 parallel queries: 12.24931754044124s
Average response time with 80 parallel queries: 14.468654876947403s
Average response time with 90 parallel queries: 17.621626607577006s
Average response time with 100 parallel queries: 20.29620223760605s
Average response time with 150 parallel queries: 38.75385047276815s
Average response time with 200 parallel queries: 54.13535201430321s
Average response time with 250 parallel queries: 65.63901019477844s
Average response time with 300 parallel queries: 97.21682971000672s
An error occurred: TExecuteStatementResp(status=TStatu

DataFrame[GlobalEventID: int, Day: date, MonthYear: int, Year: int, FractionDate: float, Actor1Code: string, Actor1Name: string, Actor1CountryCode: string, Actor1KnownGroupCode: string, Actor1EthnicCode: string, Actor1Religion1Code: string, Actor1Religion2Code: string, Actor1Type1Code: string, Actor1Type2Code: string, Actor1Type3Code: string, Actor2Code: string, Actor2Name: string, Actor2CountryCode: string, Actor2KnownGroupCode: string, Actor2EthnicCode: string, Actor2Religion1Code: string, Actor2Religion2Code: string, Actor2Type1Code: string, Actor2Type2Code: string, Actor2Type3Code: string, IsRootEvent: int, EventCode: string, EventBaseCode: string, EventRootCode: string, QuadClass: int, GoldsteinScale: float, NumMentions: int, NumSources: int, NumArticles: int, AvgTone: float, Actor1Geo_Type: int, Actor1Geo_FullName: string, Actor1Geo_CountryCode: string, Actor1Geo_ADM1Code: string, Actor1Geo_ADM2Code: string, Actor1Geo_Lat: float, Actor1Geo_Long: float, Actor1Geo_FeatureID: string

## Scaling Resources

The following tests are performed to analyze how increasing the resources of the spark cluster influences the performance of the application.

In the given case, the relevant resources are the number of CPU cores and the amount of RAM, which the spark cluster can use to process & cache the data.

Scaling resources in the big data context is usually achieved by adding additional nodes consisting of commodity hardware to a cluster (horizontal scaling). To simulate the effect of adding additional nodes to the cluster, both versions of the program (aggregated and non-aggregated) are executed with an increasing number of worker nodes. In each test run, performance metrics and cache information are collected and persisted to a csv file.

### Resources
The resources are scaled by manually changing the number of worker nodes in `docker-compose.yaml` and adjusting the total number of executors in the cluster accordingly in `spark-defaults.conf`. After changing the configuration the setup is restarted and the tests are executed again with the adapted spark cluster.
 
The tests are conducted with the following resource configurations:


| No. Workers               | Total No. Executors (in cluster) | Total RAM (in cluster) | Total No. Cores (in cluster)       |
|---------------------------|----------------------------------|------------------------------|------------------------------------|
| 1                         | 2                                | 6 GB                         | 2                                  |
| 2                         | 4                                | 12 GB                        | 4                                  |
| 3                         | 6                                | 18 GB                        | 6                                  |

### Performance
The measured performance metrics are the same as in the data volume scalability tests:
- **Pre-processing turnaround time**
- **Query response time**

### Data Volume
The data volume is kept constant to test the effect of adding additional resources in an isolated way.
 
For both versions of the program, a different number of parquet files are included in the test runs, so the amount of included data is aligned with the characteristics of the different versions. This is necessary to ensure that the pre-processed data is partitioned and cached across all nodes, so it can be processed in parallel by the executors (this is not the case for 1 parquet file). Furthermore the resulting data must fit into memory, so the result is not distorted by the additional overhead of swapping data between memory and disk.

Therefore, the following number of parquet files are used in the test runs:
- **Aggregated version**: 6 parquet files (6 months of data) are used, so the cached result of the pre-processing is partitioned across all nodes.
- **Non-aggregated version**: 2 parquet files (2 months of data) are used because it's the maximum amount which fits into memory of 1 worker node with 6 GB of RAM.


In [23]:
# Number of worker nodes of the spark cluster the tests are currently executed on.
# Manually adjusted every run; the value is stored under the 'workers' key of the test results.
# NOTE(review): 0 is not one of the resource configurations listed above (1, 2, 3 workers) -
# confirm the value matches the running cluster before executing the following cells
workers = 0

In [24]:
import time
# Number of parquet files used in the test
files = 6

# Initialised before the try block, so the clean-up at the end of the cell never touches an
# undefined (or stale, from a previous cell) DataFrame if loading or pre-processing fails
df_aggregated = None

# Run the aggregated version of the main logic (data loading & pre-processing) with the current number of workers
try:
    # Start timer (perf_counter is monotonic, so the measured duration is not affected by system clock adjustments)
    start_time = time.perf_counter()

    # Read in the parquet files
    df = spark.read.parquet(*parquet_path_list[:files])

    # Run the aggregated version of the main logic
    df_aggregated = run_aggregated(df)

    # Stop timer
    end_time = time.perf_counter()

    # Calculate duration of the run (in seconds)
    duration = end_time - start_time

    # Get cache information (reported in bytes)
    memory_usage, disk_usage = get_cache_information()

    # Get the average response time of an sql query (in seconds)
    avg_req_time = average_sql_request_response_time(SQL_REQUEST_AGGREGATED)

    # Combine test results of the current run
    result = {
        'status': 'COMPLETE',
        'workers': workers,
        # The slice [:files] is loaded, so the last included file has the index files - 1
        'last_file_index': files - 1,
        'duration': duration,
        'memory_usage': memory_usage,
        'disk_usage': disk_usage,
        'avg_req_time': avg_req_time
    }

    print(f"{workers} workers: Duration: {duration}s, memory usage: {memory_usage}B, disk usage: {disk_usage}B, average response time: {avg_req_time}s")

# If an exception occurs, the test run should fail 
except Exception as e:
    print(f"An error occurred: {e}")
    result = {
        'status': 'FAILED',
        'workers': workers,
        'last_file_index': files - 1,
        'duration': 0,
        'memory_usage': 0,
        'disk_usage': 0,
        'avg_req_time': 0
    }

# Test results are persisted every run to avoid losing them in case of a crash
write_result_to_csv(result, 'test_resources_aggregated.csv')

# Remove from cache to prevent interference with the next run
# (skipped if the pre-processing failed before a DataFrame was created)
if df_aggregated is not None:
    df_aggregated.unpersist()

0 workers: Duration: 20.550599336624146s, memory usage: 1078536B, disk usage: 0B, average response time: 0.10387980937957764s


DataFrame[Day: date, ActionGeo_CountryCode: string, GoldsteinScaleSum: double, EventCount: bigint]

In [25]:
import time
# Number of parquet files used in the test
files = 2

# Initialised before the try block, so the clean-up at the end of the cell never touches an
# undefined (or stale, from a previous cell) DataFrame if loading or pre-processing fails
df_non_aggregated = None

# Run the non-aggregated version of the main logic (data loading & pre-processing) with the current number of workers
try:
    # Start timer (perf_counter is monotonic, so the measured duration is not affected by system clock adjustments)
    start_time = time.perf_counter()

    # Read in the parquet files
    df = spark.read.parquet(*parquet_path_list[:files])

    # Run the non-aggregated version of the main logic
    df_non_aggregated = run_non_aggregated(df)

    # Stop timer
    end_time = time.perf_counter()

    # Calculate duration of the run (in seconds)
    duration = end_time - start_time

    # Get cache information (reported in bytes)
    memory_usage, disk_usage = get_cache_information()

    # Get the average response time of an sql query (in seconds)
    avg_req_time = average_sql_request_response_time(SQL_REQUEST_NON_AGGREGATED)

    # Combine test results of the current run
    result = {
        'status': 'COMPLETE',
        'workers': workers,
        # The slice [:files] is loaded, so the last included file has the index files - 1
        'last_file_index': files - 1,
        'duration': duration,
        'memory_usage': memory_usage,
        'disk_usage': disk_usage,
        'avg_req_time': avg_req_time
    }

    print(f"{workers} workers: Duration: {duration}s, memory usage: {memory_usage}B, disk usage: {disk_usage}B, average response time: {avg_req_time}s")

# If an exception occurs, the test run should fail 
except Exception as e:
    print(f"An error occurred: {e}")
    result = {
        'status': 'FAILED',
        'workers': workers,
        'last_file_index': files - 1,
        'duration': 0,
        'memory_usage': 0,
        'disk_usage': 0,
        'avg_req_time': 0
    }

# Test results are persisted every run to avoid losing them in case of a crash
write_result_to_csv(result, 'test_resources_non_aggregated.csv')

# Remove from cache to prevent interference with the next run
# (skipped if the pre-processing failed before a DataFrame was created)
if df_non_aggregated is not None:
    df_non_aggregated.unpersist()

0 workers: Duration: 47.30111622810364s, memory usage: 2475682816B, disk usage: 0B, average response time: 0.3795547723770142s


DataFrame[GlobalEventID: int, Day: date, MonthYear: int, Year: int, FractionDate: float, Actor1Code: string, Actor1Name: string, Actor1CountryCode: string, Actor1KnownGroupCode: string, Actor1EthnicCode: string, Actor1Religion1Code: string, Actor1Religion2Code: string, Actor1Type1Code: string, Actor1Type2Code: string, Actor1Type3Code: string, Actor2Code: string, Actor2Name: string, Actor2CountryCode: string, Actor2KnownGroupCode: string, Actor2EthnicCode: string, Actor2Religion1Code: string, Actor2Religion2Code: string, Actor2Type1Code: string, Actor2Type2Code: string, Actor2Type3Code: string, IsRootEvent: int, EventCode: string, EventBaseCode: string, EventRootCode: string, QuadClass: int, GoldsteinScale: float, NumMentions: int, NumSources: int, NumArticles: int, AvgTone: float, Actor1Geo_Type: int, Actor1Geo_FullName: string, Actor1Geo_CountryCode: string, Actor1Geo_ADM1Code: string, Actor1Geo_ADM2Code: string, Actor1Geo_Lat: float, Actor1Geo_Long: float, Actor1Geo_FeatureID: string

## Fault Tolerance

The following tests are performed to analyze how the application behaves in case of a failure and how it affects the performance.

In the given case, a failure could be caused by a node in the spark cluster failing or by a network partition which prevents the nodes from communicating with each other, effectively making the node unavailable to be used in the cluster. To simulate a node failure, both versions of the program (aggregated and non-aggregated) are executed until the data is cached and then one or more nodes are stopped, so the cached partitions on these nodes are lost and the data has to be re-loaded from disk and re-calculated once it's needed. To trigger this, an sql request is sent to the thrift server, which contains an aggregation that requires processing of the complete dataset. In each test run, performance metrics are collected and persisted to a csv file.

### Failures
The failure of nodes is simulated by manually stopping the worker nodes in docker once the data has been cached. The following failure scenarios are tested:
- **0 Nodes fail**: No nodes are stopped after the data is cached.
- **1 Node fails**: One node is stopped after the data is cached.
- **2 Nodes fail**: Two nodes are stopped after the data is cached.

After every test run, the setup is restarted and the tests are executed again with the next failure scenario.

### Performance
The effect on the performance is analyzed by measuring the following metric:
- **Query response time**: The amount of time it takes to process and return the results of an sql query under the condition that one or more nodes are unavailable.

### Data Volume
The data volume is kept constant to test the effect of a node failure in an isolated way. 

The amount of data must be chosen carefully to ensure that the result of the pre-processing is partitioned and cached across all nodes, so the failure of a single node leads to an actual loss of cached data. Furthermore, the resulting data must fit into the memory of the remaining nodes, so the result is not distorted by the additional overhead of swapping data between memory and disk.

Therefore, the number of parquet files used in the tests are the same as in the resource scalability tests:
- **Aggregated version**: 6 parquet files (6 months of data)
- **Non-aggregated version**: 2 parquet files (2 months of data)

In [26]:
# Number of worker nodes that are manually stopped in the current failure scenario.
# Manually adjusted every run; the value is stored under the 'stopped_workers' key of the test results.
stopped_workers = 0

In [27]:
# Number of parquet files used in the test
files = 6

# Run the aggregated version of the program as measurement object for the fault tolerance test.
# This cell has to finish before the worker nodes are stopped manually; `files` and
# `df_aggregated` are used again by the measurement cell that follows.
df = spark.read.parquet(*parquet_path_list[:files])
df_aggregated = run_aggregated(df)

**Manual step:** Before running the next cell, stop as many worker nodes as specified in `stopped_workers`.

In [28]:
# Measure the query response time of the aggregated version after the worker nodes were stopped
try:
    # Get the response time of a single sql request -> The following requests would already be processed by using the cached data
    avg_req_time = average_sql_request_response_time(SQL_REQUEST_AGGREGATED, 1)

    # Combine test results of the current run
    result = {
        'status': 'COMPLETE',
        'stopped_workers': stopped_workers,
        # The slice [:files] was loaded, so the last included file has the index files - 1
        'last_file_index': files - 1,
        'avg_req_time': avg_req_time
    }

    print(f"{stopped_workers} stopped workers: Response time: {avg_req_time}s")

# If an exception occurs, the test run should fail 
except Exception as e:
    print(f"An error occurred: {e}")
    result = {
        'status': 'FAILED',
        'stopped_workers': stopped_workers,
        'last_file_index': files - 1,
        'avg_req_time': 0
    }

# Test results are persisted every run to avoid losing them in case of a crash
write_result_to_csv(result, 'test_fault_tolerance_aggregated.csv')

# Remove from cache to prevent interference with the next run
df_aggregated.unpersist()

0 stopped workers: Response time: 0.10315108299255371s


DataFrame[Day: date, ActionGeo_CountryCode: string, GoldsteinScaleSum: double, EventCount: bigint]

Restart the setup, so all workers are available again

In [29]:
# Number of parquet files used in the test
files = 2

# Run the non-aggregated version of the program as measurement object for the fault tolerance test.
# This cell has to finish before the worker nodes are stopped manually; `files` and
# `df_non_aggregated` are used again by the measurement cell that follows.
df = spark.read.parquet(*parquet_path_list[:files])
df_non_aggregated = run_non_aggregated(df)

**Manual step:** Before running the next cell, stop as many worker nodes as specified in `stopped_workers`.

In [30]:
# Measure the query response time of the non-aggregated version after the worker nodes were stopped
try:
    # Get the response time of a single sql request -> The following requests would already be processed by using the cached data
    avg_req_time = average_sql_request_response_time(SQL_REQUEST_NON_AGGREGATED, 1)

    # Combine test results of the current run
    result = {
        'status': 'COMPLETE',
        'stopped_workers': stopped_workers,
        # The slice [:files] was loaded, so the last included file has the index files - 1
        'last_file_index': files - 1,
        'avg_req_time': avg_req_time
    }

    print(f"{stopped_workers} stopped workers: Response time: {avg_req_time}s")

# If an exception occurs, the test run should fail 
except Exception as e:
    print(f"An error occurred: {e}")
    result = {
        'status': 'FAILED',
        'stopped_workers': stopped_workers,
        'last_file_index': files - 1,
        'avg_req_time': 0
    }

# Test results are persisted every run to avoid losing them in case of a crash
write_result_to_csv(result, 'test_fault_tolerance_non_aggregated.csv')

# Remove from cache to prevent interference with the next run
df_non_aggregated.unpersist()

0 stopped workers: Response time: 0.4099743366241455s


DataFrame[GlobalEventID: int, Day: date, MonthYear: int, Year: int, FractionDate: float, Actor1Code: string, Actor1Name: string, Actor1CountryCode: string, Actor1KnownGroupCode: string, Actor1EthnicCode: string, Actor1Religion1Code: string, Actor1Religion2Code: string, Actor1Type1Code: string, Actor1Type2Code: string, Actor1Type3Code: string, Actor2Code: string, Actor2Name: string, Actor2CountryCode: string, Actor2KnownGroupCode: string, Actor2EthnicCode: string, Actor2Religion1Code: string, Actor2Religion2Code: string, Actor2Type1Code: string, Actor2Type2Code: string, Actor2Type3Code: string, IsRootEvent: int, EventCode: string, EventBaseCode: string, EventRootCode: string, QuadClass: int, GoldsteinScale: float, NumMentions: int, NumSources: int, NumArticles: int, AvgTone: float, Actor1Geo_Type: int, Actor1Geo_FullName: string, Actor1Geo_CountryCode: string, Actor1Geo_ADM1Code: string, Actor1Geo_ADM2Code: string, Actor1Geo_Lat: float, Actor1Geo_Long: float, Actor1Geo_FeatureID: string

## Combined Scalability Test

In [ ]:
# TODO:
# Combined scalability test
# Comment all notebooks