In [1]:
import os

# Everything the notebook downloads or derives lives under <working dir>/data
working_dir = os.getcwd()
local_path = os.path.join(working_dir, 'data')

# The per-month parquet datasets get their own sub-folder inside the data folder
parquet_path = os.path.join(working_dir, 'data', 'parquet_main')

In [2]:
from datetime import datetime, timedelta

# Time period for which to download the data (both bounds are included)
start_date = datetime.strptime('2022-02-24', '%Y-%m-%d')
end_date = datetime.strptime('2022-03-24', '%Y-%m-%d')

# One datetime (at midnight) for every day of the period
day_count = (end_date - start_date).days + 1
date_list = [start_date + timedelta(days=offset) for offset in range(day_count)]

In [3]:
import urllib.request

# Download urls are built as <base_url><timestamp>.export.CSV.zip
base_url = 'http://data.gdeltproject.org/gdeltv2/'

# url_list is nested: one inner list of download urls per month of the period.
# index always points at the inner list of the month currently being filled.
url_list = [[]]
index = 0
month = date_list[0].month

for date in date_list:
    # The first day of a new month opens a new inner list
    if date.month != month:
        month = date.month
        index += 1
        url_list.append([])

    month_urls = url_list[index]

    # One url per 15 minute step of the day (24 hours * 4 quarters)
    for hour in range(24):
        for minute in (0, 15, 30, 45):
            stamp = (date + timedelta(hours=hour, minutes=minute)).strftime('%Y%m%d%H%M%S')
            month_urls.append(base_url + stamp + '.export.CSV.zip')

In [4]:
# Create the local directory for the data if it doesn't exist.
# makedirs(exist_ok=True) also creates missing parent folders and does not fail when the
# folder already exists, so no separate (and racy) isdir check is needed.
os.makedirs(local_path, exist_ok=True)

# Create the local directory for the parquet files if it doesn't exist
os.makedirs(parquet_path, exist_ok=True)

In [5]:
from pyspark.sql import SparkSession

# Start (or reuse) a spark session with hive support (see config folder for spark config)
spark = (
    SparkSession.builder
    .appName('Big Data Project')
    .enableHiveSupport()
    .getOrCreate()
)

In [6]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType

# Define original data schema for csv files
# The export files are read without a header row, so fields are matched purely by position:
# the order of the 61 fields below must match the column order of the files exactly.
# Every field is declared nullable.
schema = StructType([
    # Event id and date attributes ("Day" is parsed via the dateFormat given to the csv reader)
    StructField("GlobalEventID", IntegerType(), True),
    StructField("Day", DateType(), True),
    StructField("MonthYear", IntegerType(), True),
    StructField("Year", IntegerType(), True),
    StructField("FractionDate", FloatType(), True),
    # Actor 1 attributes
    StructField("Actor1Code", StringType(), True),
    StructField("Actor1Name", StringType(), True),
    StructField("Actor1CountryCode", StringType(), True),
    StructField("Actor1KnownGroupCode", StringType(), True),
    StructField("Actor1EthnicCode", StringType(), True),
    StructField("Actor1Religion1Code", StringType(), True),
    StructField("Actor1Religion2Code", StringType(), True),
    StructField("Actor1Type1Code", StringType(), True),
    StructField("Actor1Type2Code", StringType(), True),
    StructField("Actor1Type3Code", StringType(), True),
    # Actor 2 attributes
    StructField("Actor2Code", StringType(), True),
    StructField("Actor2Name", StringType(), True),
    StructField("Actor2CountryCode", StringType(), True),
    StructField("Actor2KnownGroupCode", StringType(), True),
    StructField("Actor2EthnicCode", StringType(), True),
    StructField("Actor2Religion1Code", StringType(), True),
    StructField("Actor2Religion2Code", StringType(), True),
    StructField("Actor2Type1Code", StringType(), True),
    StructField("Actor2Type2Code", StringType(), True),
    StructField("Actor2Type3Code", StringType(), True),
    # Event attributes (the event codes are kept as strings, not numbers)
    StructField("IsRootEvent", IntegerType(), True),
    StructField("EventCode", StringType(), True),
    StructField("EventBaseCode", StringType(), True),
    StructField("EventRootCode", StringType(), True),
    StructField("QuadClass", IntegerType(), True),
    StructField("GoldsteinScale", FloatType(), True),
    StructField("NumMentions", IntegerType(), True),
    StructField("NumSources", IntegerType(), True),
    StructField("NumArticles", IntegerType(), True),
    StructField("AvgTone", FloatType(), True),
    # Actor 1 geography
    StructField("Actor1Geo_Type", IntegerType(), True),
    StructField("Actor1Geo_FullName", StringType(), True),
    StructField("Actor1Geo_CountryCode", StringType(), True),
    StructField("Actor1Geo_ADM1Code", StringType(), True),
    StructField("Actor1Geo_ADM2Code", StringType(), True),
    StructField("Actor1Geo_Lat", FloatType(), True),
    StructField("Actor1Geo_Long", FloatType(), True),
    StructField("Actor1Geo_FeatureID", StringType(), True),
    # Actor 2 geography
    StructField("Actor2Geo_Type", IntegerType(), True),
    StructField("Actor2Geo_FullName", StringType(), True),
    StructField("Actor2Geo_CountryCode", StringType(), True),
    StructField("Actor2Geo_ADM1Code", StringType(), True),
    StructField("Actor2Geo_ADM2Code", StringType(), True),
    StructField("Actor2Geo_Lat", FloatType(), True),
    StructField("Actor2Geo_Long", FloatType(), True),
    StructField("Actor2Geo_FeatureID", StringType(), True),
    # Action geography
    StructField("ActionGeo_Type", IntegerType(), True),
    StructField("ActionGeo_FullName", StringType(), True),
    StructField("ActionGeo_CountryCode", StringType(), True),
    StructField("ActionGeo_ADM1Code", StringType(), True),
    StructField("ActionGeo_ADM2Code", StringType(), True),
    StructField("ActionGeo_Lat", FloatType(), True),
    StructField("ActionGeo_Long", FloatType(), True),
    StructField("ActionGeo_FeatureID", StringType(), True),
    # Trailing fields, both kept as plain strings
    StructField("DATEADDED", StringType(), True),
    StructField("SOURCEURL", StringType(), True),
])

In [7]:
import zipfile


def _remove_if_present(path):
    """Delete ``path`` if it is a regular file; OS errors are reported, not raised."""
    if os.path.isfile(path):
        try:
            os.remove(path)
        except OSError as e:
            print(f"An error occurred with file {os.path.basename(path)}: {e}")


def download_file(url, dest_root=None):
    """Download one zipped export file and unpack it into its year/month folder.

    The target folder is ``<dest_root>/<YYYY>/<MM>``, taken from the first six
    characters of the file name (the timestamp the url was built from). Nothing
    is downloaded when the extracted file is already present. Errors are printed
    instead of raised, so one bad file does not abort a batch of downloads.

    Args:
        url: Download url; its last path segment is used as the file name and is
            expected to end in ``.zip``.
        dest_root: Root folder for the downloaded data. Defaults to the
            notebook-level ``local_path``.

    Returns:
        None.
    """
    if dest_root is None:
        dest_root = local_path

    fname = url.split('/')[-1]
    folder_location = os.path.join(dest_root, fname[:4], fname[4:6])
    zip_file = os.path.join(folder_location, fname)

    # Strip only the trailing extension of the file name. Replacing ".zip" in the whole path
    # would also alter folder names that happen to contain ".zip".
    csv_name = fname[:-len('.zip')] if fname.endswith('.zip') else fname
    csv_file = os.path.join(folder_location, csv_name)

    # Download file from the specified url, if it doesn't exist yet
    if os.path.isfile(csv_file):
        print('File ' + fname + ' already exists')
        return

    try:
        # Make the function usable on its own, even if the month folder was not created yet
        os.makedirs(folder_location, exist_ok=True)

        urllib.request.urlretrieve(url, zip_file)

        # Unzip zip file
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(folder_location)

    except Exception as e:
        print(f"An error occurred with file {fname}: {e}")

        # A half-extracted file would be mistaken for a finished download on the next run
        if csv_file != zip_file:
            _remove_if_present(csv_file)

    finally:
        # Delete zip file, also after a failure: the month folder is later read as a whole,
        # so a leftover (possibly corrupt) archive must not stay in it
        _remove_if_present(zip_file)

In [8]:
import shutil
from concurrent.futures import ThreadPoolExecutor

# Download files and write them to parquet files in parallel for each month
# This is done in batches to allow simple addition of new months to already existing data
for month_list in url_list:
    # File names start with the timestamp, so the first six characters are year + month.
    # All urls of one inner list belong to the same month, the first one is representative.
    year_month = month_list[0].split('/')[-1][:6]
    month_parquet = os.path.join(parquet_path, year_month + ".parquet")

    # Skip month if parquet file already exists
    if os.path.exists(month_parquet):
        continue

    # <data folder>/<YYYY>/<MM>; makedirs creates the year folder too and accepts existing folders
    month_folder = os.path.join(local_path, year_month[:4], year_month[4:6])
    os.makedirs(month_folder, exist_ok=True)

    # Download all files from the url list in parallel. The number of threads is the
    # ThreadPoolExecutor default, which depends on the Python version. Leaving the
    # with-block waits for all downloads to finish.
    with ThreadPoolExecutor() as executor:
        executor.map(download_file, month_list)

    # Without any downloaded file an empty parquet dataset would be written, and the month
    # would then be skipped on every later run - so leave it untouched for a retry instead
    if not os.listdir(month_folder):
        print(f"No files could be downloaded for {year_month}, skipping parquet export")
        shutil.rmtree(month_folder)
        continue

    # Read all csv files of one month into a spark dataframe
    df = spark.read.csv(month_folder, sep='\t', header=False, schema=schema, dateFormat='yyyyMMdd')

    # Write the data of one month into a parquet file
    df.write.parquet(month_parquet, mode='overwrite')

    # Delete the csv files to free up disk space
    shutil.rmtree(month_folder)

In [9]:
# Load the parquet datasets of all months from the parquet directory into one spark dataframe
parquet_glob = parquet_path + '/*.parquet'
df_base = spark.read.parquet(parquet_glob)

In [11]:
# Sanity check: the loaded data should expose one column per field of the csv schema
len(df_base.columns)

61

In [10]:
from pyspark.sql.functions import broadcast
from pyspark.sql import functions as F

# CSV file containing a mapping from FIPS10-4 country codes to ISO 3166-1 alpha-2 country codes (necessary for superset heatmap)
mapping_file_path = os.path.join(os.getcwd(), 'util', 'country_code_mapping.csv')

# Read the mapping file with spark and keep only the two code columns
df_mapping = (
    spark.read
    .csv(mapping_file_path, sep=';', header=True, inferSchema=True)
    .select('FIPS 10-4', 'ISO 3166-1')
)

# Map the country codes: the mapping is broadcast to the join, and the left outer join keeps
# events whose country code has no entry in the mapping
fips_matches = df_base['ActionGeo_CountryCode'] == df_mapping['FIPS 10-4']

df_non_aggregated = (
    df_base
    .join(broadcast(df_mapping), fips_matches, 'left_outer')
    # Keep the original code in 'FIPS 10-4' (also filled for rows without a mapping entry) ...
    .withColumn('FIPS 10-4', F.col('ActionGeo_CountryCode'))
    # ... and let 'ActionGeo_CountryCode' carry the ISO code (null if there is none)
    .withColumn('ActionGeo_CountryCode', F.col('ISO 3166-1'))
    .drop('ISO 3166-1')
)

In [11]:
# Mark the dataframe for caching so data loading & country code mapping are not redone,
# then count it: the count is the action that loads the data and fills the cache
event_count = df_non_aggregated.cache().count()

print("Total number of events:", event_count)

Total number of events: 10361005


In [12]:
# Check for country codes where there is no corresponding ISO 3166-1 alpha-2 country code:
# the original FIPS code is present, but the mapped code is null
has_no_iso_code = F.col('ActionGeo_CountryCode').isNull() & F.col('FIPS 10-4').isNotNull()

(
    df_non_aggregated
    .where(has_no_iso_code)
    .groupBy('FIPS 10-4')
    .agg(F.count('*').alias('EventCount'))
    .orderBy(F.col('EventCount').desc())
    .show()
)

# For example:
# PF (Paracel Islands) -> no equivalent
# NT (Netherlands Antilles) -> no equivalent
# PG (Spratly Islands) -> no equivalent

+---------+----------+
|FIPS 10-4|EventCount|
+---------+----------+
|       OS|      8935|
|       RB|      4229|
|       OC|       647|
|       YI|       158|
|       PG|        24|
|       JN|        17|
|       TE|        16|
|       NT|        15|
|       PF|        10|
|       WQ|         8|
|       JQ|         7|
|       LQ|         5|
|       KQ|         5|
|       HQ|         3|
+---------+----------+


In [13]:
# Keep only the columns the aggregation needs: the day, the (ISO) country code and the score
df_selection = df_non_aggregated.select('Day', 'ActionGeo_CountryCode', 'GoldsteinScale')

In [14]:
# Number of null values in each column: count() only counts the rows for which when()
# yields a value, i.e. the rows where the column is null
null_counts = [
    F.count(F.when(F.col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in df_selection.columns
]
df_selection.select(null_counts).show()

+---+---------------------+--------------+
|Day|ActionGeo_CountryCode|GoldsteinScale|
+---+---------------------+--------------+
|  0|               254027|            48|
+---+---------------------+--------------+


In [15]:
# Drop every row with a null in any of the selected columns, as such rows would distort
# the aggregation results
df_selection = df_selection.dropna()

# Number of rows that are left after the null rows were dropped
event_count_without_null = df_selection.count()

print("Events removed from dataset:", event_count - event_count_without_null)

Events removed from dataset: 254075


In [16]:
# One row per day and country: the summed score and the number of events behind that sum
df_aggregated = (
    df_selection
    .groupBy('Day', 'ActionGeo_CountryCode')
    .agg(
        F.sum('GoldsteinScale').alias('GoldsteinScaleSum'),
        F.count('*').alias('EventCount'),
    )
)

In [17]:
# Mark the aggregated dataframe for caching so the aggregation is not redone,
# then count it: the count is the action that runs the aggregation and fills the cache
aggregation_count = df_aggregated.cache().count()

print("Number of rows in aggregation:", aggregation_count)

Number of rows in aggregation: 28659


In [18]:
# Virtual tables which can be accessed by the thrift server, registered (or replaced)
# as global temp views under the given names
views_to_publish = (
    ("GDELT", df_non_aggregated),
    ("GDELT_AGGR", df_aggregated),
)
for view_name, view_frame in views_to_publish:
    view_frame.createOrReplaceGlobalTempView(view_name)

In [19]:
from py4j.java_gateway import java_import

# Retrieve the spark context from the current spark session
sc = spark.sparkContext

# Import the HiveThriftServer2 class using the JVM instance of the spark context
# (_jvm and _gateway are private py4j handles of the context, so this relies on pyspark internals)
java_import(sc._jvm, "org.apache.spark.sql.hive.thriftserver.HiveThriftServer2")

# Dummy java arguments for main method: an empty java.lang.String array (length 0)
java_args = sc._gateway.new_array(sc._gateway.jvm.java.lang.String, 0)

# Start the thrift server by calling the main method of the imported class
# NOTE(review): main() is invoked inside the JVM of this notebook's spark context; presumably
# this is why the GDELT views are registered as *global* temp views - confirm that they are
# visible to clients connecting to the thrift server
sc._jvm.org.apache.spark.sql.hive.thriftserver.HiveThriftServer2.main(java_args)