In [1]:
import os

# Folder layout (relative to the current working directory):
#   <cwd>/data                  -> downloaded raw files
#   <cwd>/data/parquet_scaling  -> monthly parquet files used for the scaling tests
local_path = os.path.join(os.getcwd(), 'data')
parquet_path = os.path.join(os.getcwd(), 'data', 'parquet_scaling')

In [2]:
from datetime import datetime, timedelta

# Time period (both ends inclusive) for which the data is downloaded
start_date = datetime.strptime('2015-03-01', '%Y-%m-%d')
end_date = datetime.strptime('2023-12-31', '%Y-%m-%d')

# One datetime (at midnight) per calendar day from start_date through end_date
num_days = (end_date - start_date).days + 1
date_list = [start_date + timedelta(days=day_offset) for day_offset in range(num_days)]

In [3]:
import urllib.request

# Build a nested list of download urls: one inner list per calendar month,
# each holding the urls of all 15-minute export files of that month
base_url = 'http://data.gdeltproject.org/gdeltv2/'
url_list = [[]]
index = 0
month = date_list[0].month

for date in date_list:
    # A change of month starts a new inner list
    if date.month != month:
        month = date.month
        index += 1
        url_list.append([])

    # 96 files per day: every 15 minutes from 00:00 up to and including 23:45
    url_list[index].extend(
        base_url + (date + timedelta(minutes=minute_offset)).strftime('%Y%m%d%H%M%S') + '.export.CSV.zip'
        for minute_offset in range(0, 24 * 60, 15)
    )

In [4]:
# Make sure the data folder and the parquet folder exist.
# local_path comes first because parquet_path is located inside it.
for directory in (local_path, parquet_path):
    if not os.path.isdir(directory):
        os.mkdir(directory)

In [5]:
from pyspark.sql import SparkSession

# Start a spark session, or reuse an already running one (see config folder for spark config)
spark_builder = SparkSession.builder.appName('Big Data Project').enableHiveSupport()
spark = spark_builder.getOrCreate()

In [6]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType

# Original data schema for the csv files, as (column name, Spark type) pairs.
# The order must match the column order of the files; every column is nullable.
schema_columns = [
    ("GlobalEventID", IntegerType),
    ("Day", DateType),
    ("MonthYear", IntegerType),
    ("Year", IntegerType),
    ("FractionDate", FloatType),
    ("Actor1Code", StringType),
    ("Actor1Name", StringType),
    ("Actor1CountryCode", StringType),
    ("Actor1KnownGroupCode", StringType),
    ("Actor1EthnicCode", StringType),
    ("Actor1Religion1Code", StringType),
    ("Actor1Religion2Code", StringType),
    ("Actor1Type1Code", StringType),
    ("Actor1Type2Code", StringType),
    ("Actor1Type3Code", StringType),
    ("Actor2Code", StringType),
    ("Actor2Name", StringType),
    ("Actor2CountryCode", StringType),
    ("Actor2KnownGroupCode", StringType),
    ("Actor2EthnicCode", StringType),
    ("Actor2Religion1Code", StringType),
    ("Actor2Religion2Code", StringType),
    ("Actor2Type1Code", StringType),
    ("Actor2Type2Code", StringType),
    ("Actor2Type3Code", StringType),
    ("IsRootEvent", IntegerType),
    ("EventCode", StringType),
    ("EventBaseCode", StringType),
    ("EventRootCode", StringType),
    ("QuadClass", IntegerType),
    ("GoldsteinScale", FloatType),
    ("NumMentions", IntegerType),
    ("NumSources", IntegerType),
    ("NumArticles", IntegerType),
    ("AvgTone", FloatType),
    ("Actor1Geo_Type", IntegerType),
    ("Actor1Geo_FullName", StringType),
    ("Actor1Geo_CountryCode", StringType),
    ("Actor1Geo_ADM1Code", StringType),
    ("Actor1Geo_ADM2Code", StringType),
    ("Actor1Geo_Lat", FloatType),
    ("Actor1Geo_Long", FloatType),
    ("Actor1Geo_FeatureID", StringType),
    ("Actor2Geo_Type", IntegerType),
    ("Actor2Geo_FullName", StringType),
    ("Actor2Geo_CountryCode", StringType),
    ("Actor2Geo_ADM1Code", StringType),
    ("Actor2Geo_ADM2Code", StringType),
    ("Actor2Geo_Lat", FloatType),
    ("Actor2Geo_Long", FloatType),
    ("Actor2Geo_FeatureID", StringType),
    ("ActionGeo_Type", IntegerType),
    ("ActionGeo_FullName", StringType),
    ("ActionGeo_CountryCode", StringType),
    ("ActionGeo_ADM1Code", StringType),
    ("ActionGeo_ADM2Code", StringType),
    ("ActionGeo_Lat", FloatType),
    ("ActionGeo_Long", FloatType),
    ("ActionGeo_FeatureID", StringType),
    ("DATEADDED", StringType),
    ("SOURCEURL", StringType),
]

schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in schema_columns
])

In [7]:
import zipfile


def download_file(url):
    """Download one zipped export file and extract it into its year/month folder.

    The target folder is ``local_path/<YYYY>/<MM>``, derived from the first six
    characters of the file name in ``url``. The folder must already exist.
    If the extracted file is already present, nothing is downloaded.

    Errors during download or extraction are reported via ``print`` and not
    re-raised, so one bad file does not stop a batch of downloads. The zip
    archive is always removed afterwards, even on failure, so that no partial
    or corrupt archive is left next to the csv files.

    Args:
        url: Full download url; the last path segment is used as the file name.

    Returns:
        None.
    """
    fname = url.split('/')[-1]
    folder_location = os.path.join(local_path, fname[:4], fname[4:6])
    zip_file_path = os.path.join(folder_location, fname)

    # Strip the ".zip" suffix from the file name only. Replacing ".zip" in the
    # whole path would also alter directory names that happen to contain it.
    extracted_name = fname[:-len('.zip')] if fname.endswith('.zip') else fname
    extracted_path = os.path.join(folder_location, extracted_name)

    # Nothing to do if the extracted file is already there
    if os.path.isfile(extracted_path):
        print('File ' + fname + ' already exists')
        return

    try:
        urllib.request.urlretrieve(url, zip_file_path)

        # Unzip zip file
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(folder_location)

    except Exception as e:
        # Deliberately best-effort: report the problem and carry on
        print(f"An error occurred with file {fname}: {e}")

    finally:
        # Delete zip file (also after a failed download/extraction)
        if zip_file_path != extracted_path and os.path.isfile(zip_file_path):
            try:
                os.remove(zip_file_path)
            except OSError as e:
                print(f"An error occurred with file {fname}: {e}")

In [8]:
import shutil
from concurrent.futures import ThreadPoolExecutor

# Download files and write them to parquet files, one month at a time.
# Month number i (position in url_list) is written to <parquet_path>/<i>.parquet.
# Working in monthly batches allows simple addition of new months to already existing
# data: months whose parquet output already exists are skipped. This relies on
# start_date staying the same between runs, because the numbering is positional.
for i, month_list in enumerate(url_list):
    month_parquet_path = os.path.join(parquet_path, str(i) + ".parquet")

    # Skip month if parquet file already exists
    if os.path.exists(month_parquet_path):
        continue

    # File names start with YYYYMMDD..., which gives the year and month folders
    first_file_name = month_list[0].split('/')[-1]
    year_folder = os.path.join(local_path, first_file_name[:4])
    month_folder = os.path.join(year_folder, first_file_name[4:6])

    if not os.path.isdir(year_folder):
        os.mkdir(year_folder)

    if not os.path.isdir(month_folder):
        os.mkdir(month_folder)

    # Download all files of the month in parallel (ThreadPoolExecutor's default worker count).
    # Leaving the with-block waits for all downloads to finish.
    with ThreadPoolExecutor() as executor:
        executor.map(download_file, month_list)

    # Read all csv files of one month into a spark dataframe
    df = spark.read.csv(month_folder, sep='\t', header=False, schema=schema, dateFormat='yyyyMMdd')

    # Write the data of one month into a parquet file
    df.write.parquet(month_parquet_path, mode='overwrite')

    # Delete the csv files to free up disk space
    shutil.rmtree(month_folder)

In [9]:
# Load the parquet files from the parquet folder into multiple dataframes.
# The parquets are named 0.parquet, 1.parquet, ...
# Each dataframe contains 6 more parquets than the previous one, so the first has 6,
# the second 12, the third 18, ... The last dataframe contains all parquets, even if
# their total number is not a multiple of 6.
parquet_count = sum(1 for entry in os.listdir(parquet_path) if entry.endswith(".parquet"))

df_list = []
parquet_path_list = []
for i in range(0, parquet_count, 6):
    # Add the next batch of up to 6 files, never going past the last existing parquet
    for d in range(i, min(i + 6, parquet_count)):
        parquet_path_list.append(os.path.join(parquet_path, str(d) + ".parquet"))

    # parquet_path_list keeps growing, so each dataframe is cumulative
    df_list.append(spark.read.parquet(*parquet_path_list))

In [ ]:
# Scaling test: run the same action on every cumulative dataframe in df_list
for df in df_list:
    # Do scaling tests here e.g. call method etc.
    # count() is a Spark action, so it forces the dataframe to be read; the result is discarded
    df.count()

In [10]:
from pyspark.sql.functions import broadcast
from pyspark.sql import functions as F

# CSV file containing a mapping from FIPS10-4 country codes to ISO 3166-1 alpha-2 country codes (necessary for superset heatmap)
mapping_file_path = os.path.join(os.getcwd(), 'util', 'country_code_mapping.csv')

# Read the mapping file with spark and keep only the two code columns.
# The dataset is small, which is why it is broadcast in the join below.
df_mapping = (
    spark.read
    .csv(mapping_file_path, sep=';', header=True, inferSchema=True)
    .select(F.col('FIPS 10-4'), F.col('ISO 3166-1'))
)

# NOTE(review): df_base is not defined in any cell visible in this notebook;
# confirm where it is supposed to come from before running on a fresh kernel.
join_condition = df_base['ActionGeo_CountryCode'] == df_mapping['FIPS 10-4']

# Map the country codes: the left outer join keeps events without a matching code,
# then ActionGeo_CountryCode is overwritten with the ISO code and the helper columns are removed
df_non_aggregated = (
    df_base
    .join(broadcast(df_mapping), join_condition, 'left_outer')
    .withColumn('ActionGeo_CountryCode', F.col('ISO 3166-1'))
    .drop('ISO 3166-1')
    .drop('FIPS 10-4')
)

In [11]:
# Mark the dataframe for caching so that data loading & country code mapping are not re-done,
# then count the rows: the count is the action that actually loads the data and fills the cache
event_count = df_non_aggregated.cache().count()

print("Total number of events:", event_count)

Total number of events: 5915973


In [12]:
import requests
import dateutil.parser

# Base url of the Spark REST API; replace host and port with your Spark driver node and port
spark_rest_api_url = "http://localhost:4040/api/v1/applications"

# Fetch the list of applications and take the first one
apps_response = requests.get(spark_rest_api_url)
apps = apps_response.json()
app_id = apps[0]['id']

# Get the succeeded jobs of that application
jobs_response = requests.get(f"{spark_rest_api_url}/{app_id}/jobs?status=succeeded")
jobs_data = jobs_response.json()

# Print one line per job with its duration, where both timestamps are available
for job in jobs_data:
    has_timestamps = 'submissionTime' in job and bool(job.get('completionTime'))
    if not has_timestamps:
        print(f"Job ID: {job['jobId']}, Status: {job['status']}, Duration: Not available")
        continue

    start = dateutil.parser.parse(job['submissionTime'])
    end = dateutil.parser.parse(job['completionTime'])
    duration = (end - start).total_seconds()
    print(f"Job ID: {job['jobId']}, Status: {job['status']}, Duration: {duration} seconds")
 # 
 # # Get input data size from stages
 #    input_data_size = 0
 #    for stage_id in job['stageIds']:
 #        stage_response = requests.get(f"{spark_rest_api_url}/{app_id}/stages/{stage_id}?status=succeeded")
 #        stage_data = stage_response.json()
 # 
 #        # Summing input data size from all attempts of this stage
 #        input_data_size += stage_data['inputBytes']
 # 
 #    print(f"Total Input Data Size for Job {job['jobId']}: {input_data_size} bytes\n")

Job ID: 6, Status: SUCCEEDED, Duration: 0.147 seconds
Job ID: 5, Status: SUCCEEDED, Duration: 0.424 seconds
Job ID: 4, Status: SUCCEEDED, Duration: 36.843 seconds
Job ID: 3, Status: SUCCEEDED, Duration: 0.881 seconds
Job ID: 2, Status: SUCCEEDED, Duration: 1.467 seconds
Job ID: 1, Status: SUCCEEDED, Duration: 0.437 seconds
Job ID: 0, Status: SUCCEEDED, Duration: 0.815 seconds


In [14]:
# Display the id of the Spark application selected from the REST API response
app_id

'app-20240114104107-0007'

In [18]:
# Query the Spark REST API for the cached (persisted) RDDs/dataframes of the application
storage_response = requests.get(f"{spark_rest_api_url}/{app_id}/storage/rdd")

# Parse the storage response itself (not the earlier jobs response, whose entries
# have no 'memoryUsed' / 'diskUsed' fields)
storage_data = storage_response.json()

# One line per cached entry with its memory and disk usage as reported by the API
for storage in storage_data:
    print(f"Cache use: Memory: {storage['memoryUsed']}, Disk: {storage['diskUsed']}")

Cache use: Memory: 1234299688, Disk: 0
