In [1]:
import os

# Folder (inside the current working directory) that holds the downloaded GDELT files
data_dir_name = 'data'
local_path = os.path.join(os.getcwd(), data_dir_name)

In [2]:
from datetime import datetime, timedelta

# First and last day of the download window, both parsed as midnight datetimes
start_date, end_date = (
    datetime.strptime(text, '%Y-%m-%d') for text in ('2022-02-24', '2022-03-5')
)

# One datetime per calendar day, from start_date through end_date (inclusive)
date_list = [
    start_date + timedelta(days=offset)
    for offset in range((end_date - start_date).days + 1)
]

In [3]:
import urllib.request

# Build the download URL of every export file in the date range.
# File names are the timestamp (YYYYMMDDHHMMSS) of a 15-minute slot, so each day
# in date_list contributes 96 URLs, ordered from 00:00 to 23:45.
base_url = 'http://data.gdeltproject.org/gdeltv2/'
slot_minutes = 15
slots_per_day = 24 * 60 // slot_minutes

urllist = [
    base_url
    + (day + timedelta(minutes=slot_minutes * slot)).strftime('%Y%m%d%H%M%S')
    + '.export.CSV.zip'
    for day in date_list
    for slot in range(slots_per_day)
]

In [4]:
# Create the local directory if it doesn't exist.
# exist_ok=True makes this a single call that is safe to re-run and avoids the
# window between checking for the directory and creating it.
os.makedirs(local_path, exist_ok=True)

In [5]:
import zipfile


def download_file(url):
    """Download one zipped export file and unpack it into ``local_path``.

    The archive is fetched only when the extracted file (the archive name without
    its ``.zip`` suffix) is not already present in ``local_path``. Errors are
    reported on stdout instead of being raised, so a single failing file does not
    abort a batch of downloads.

    Args:
        url: Full URL of the archive; its last path segment is used as the
            local file name.

    Returns:
        None.
    """
    fname = url.split('/')[-1]
    zip_path = os.path.join(local_path, fname)

    # Strip the suffix from the file name only: a str.replace over the whole joined
    # path would also rewrite any ".zip" that occurs in a directory name and make
    # the existence check look at the wrong location.
    extracted_name = fname[:-len('.zip')] if fname.endswith('.zip') else fname
    extracted_path = os.path.join(local_path, extracted_name)

    # Nothing to do when the extracted file is already there
    if os.path.isfile(extracted_path):
        print('File ' + fname + ' already exists')
        return

    try:
        try:
            urllib.request.urlretrieve(url, zip_path)

            # Unzip zip file
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(local_path)

        finally:
            # Delete the zip file on success AND on failure, so that a partial or
            # corrupt archive is never left behind in the data directory.
            if os.path.isfile(zip_path):
                os.remove(zip_path)

    except Exception as e:
        print(f"An error occurred with file {fname}: {e}")

In [6]:
from concurrent.futures import ThreadPoolExecutor

# Fetch every URL of the list concurrently. The pool size is left at the
# ThreadPoolExecutor default, which depends on the Python version in use.
# Leaving the with-block waits until all submitted downloads have finished.
with ThreadPoolExecutor() as pool:
    for pending_url in urllist:
        pool.submit(download_file, pending_url)

File 20220224000000.export.CSV.zip already exists
File 20220224001500.export.CSV.zip already exists
File 20220224003000.export.CSV.zip already exists
File 20220224010000.export.CSV.zip already exists
File 20220224004500.export.CSV.zip already exists
File 20220224011500.export.CSV.zip already exists
File 20220224013000.export.CSV.zip already exists
File 20220224014500.export.CSV.zip already exists
File 20220224020000.export.CSV.zip already exists
File 20220224021500.export.CSV.zip already exists
File 20220224023000.export.CSV.zip already exists
File 20220224030000.export.CSV.zip already exists
File 20220224034500.export.CSV.zip already exists
File 20220224031500.export.CSV.zip already exists
File 20220224033000.export.CSV.zip already exists
File 20220224044500.export.CSV.zip already exists
File 20220224043000.export.CSV.zip already exists
File 20220224040000.export.CSV.zip already exists
File 20220224041500.export.CSV.zip already exists
File 20220224051500.export.CSV.zip already exists


In [7]:
from pyspark.sql import SparkSession

# Spark session with Hive support; requests six single-core executors with 2g of memory each
spark = (
    SparkSession.builder
    .appName('Big Data Project')
    .config("spark.executor.instances", "6")
    .config("spark.executor.memory", "2g")
    .config("spark.executor.cores", "1")
    .enableHiveSupport()
    .getOrCreate()
)

In [8]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType

# Original data schema of the csv files, as (column name, Spark type) pairs in file order.
# Every column is declared nullable.
schema_columns = [
    # Event identification and date attributes
    ("GlobalEventID", IntegerType),
    ("Day", DateType),
    ("MonthYear", IntegerType),
    ("Year", IntegerType),
    ("FractionDate", FloatType),
    # Actor 1 attributes
    ("Actor1Code", StringType),
    ("Actor1Name", StringType),
    ("Actor1CountryCode", StringType),
    ("Actor1KnownGroupCode", StringType),
    ("Actor1EthnicCode", StringType),
    ("Actor1Religion1Code", StringType),
    ("Actor1Religion2Code", StringType),
    ("Actor1Type1Code", StringType),
    ("Actor1Type2Code", StringType),
    ("Actor1Type3Code", StringType),
    # Actor 2 attributes
    ("Actor2Code", StringType),
    ("Actor2Name", StringType),
    ("Actor2CountryCode", StringType),
    ("Actor2KnownGroupCode", StringType),
    ("Actor2EthnicCode", StringType),
    ("Actor2Religion1Code", StringType),
    ("Actor2Religion2Code", StringType),
    ("Actor2Type1Code", StringType),
    ("Actor2Type2Code", StringType),
    ("Actor2Type3Code", StringType),
    # Event action attributes
    ("IsRootEvent", IntegerType),
    ("EventCode", StringType),
    ("EventBaseCode", StringType),
    ("EventRootCode", StringType),
    ("QuadClass", IntegerType),
    ("GoldsteinScale", FloatType),
    ("NumMentions", IntegerType),
    ("NumSources", IntegerType),
    ("NumArticles", IntegerType),
    ("AvgTone", FloatType),
    # Actor 1 geography
    ("Actor1Geo_Type", IntegerType),
    ("Actor1Geo_FullName", StringType),
    ("Actor1Geo_CountryCode", StringType),
    ("Actor1Geo_ADM1Code", StringType),
    ("Actor1Geo_ADM2Code", StringType),
    ("Actor1Geo_Lat", FloatType),
    ("Actor1Geo_Long", FloatType),
    ("Actor1Geo_FeatureID", StringType),
    # Actor 2 geography
    ("Actor2Geo_Type", IntegerType),
    ("Actor2Geo_FullName", StringType),
    ("Actor2Geo_CountryCode", StringType),
    ("Actor2Geo_ADM1Code", StringType),
    ("Actor2Geo_ADM2Code", StringType),
    ("Actor2Geo_Lat", FloatType),
    ("Actor2Geo_Long", FloatType),
    ("Actor2Geo_FeatureID", StringType),
    # Action geography
    ("ActionGeo_Type", IntegerType),
    ("ActionGeo_FullName", StringType),
    ("ActionGeo_CountryCode", StringType),
    ("ActionGeo_ADM1Code", StringType),
    ("ActionGeo_ADM2Code", StringType),
    ("ActionGeo_Lat", FloatType),
    ("ActionGeo_Long", FloatType),
    ("ActionGeo_FeatureID", StringType),
    # Data management fields
    ("DATEADDED", StringType),
    ("SOURCEURL", StringType),
]

schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in schema_columns
])

In [9]:
from pyspark.sql import functions as F

# Read every tab-separated file in the data directory with the explicit schema
# (dates are written as yyyyMMdd) and keep only the three columns used downstream,
# renaming Day -> Date and ActionGeo_CountryCode -> CountryCode.
df = (
    spark.read
    .csv(local_path, sep='\t', header=False, schema=schema, dateFormat='yyyyMMdd')
    .select(
        F.col('Day').alias('Date'),
        F.col('ActionGeo_CountryCode').alias('CountryCode'),
        F.col('GoldsteinScale'),
    )
)

In [10]:
from pyspark.sql.functions import col, isnan, when, count

# Row count before any cleaning
row_count = df.count()

# Per-column null tally: when() without otherwise() yields a value only for null
# cells, and count() ignores the remaining (null) results
null_counts = [
    count(when(col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in df.columns
]
df.select(null_counts).show()

# Keep only rows in which every column is populated
df = df.na.drop()

# Number of rows not considering null values
row_count_without_null = df.count()
removed_count = row_count - row_count_without_null

print("Total number of events (before removing null values):", row_count)
print("Events removed from dataset:", removed_count)

+----+-----------+--------------+
|Date|CountryCode|GoldsteinScale|
+----+-----------+--------------+
|   0|      25275|             3|
+----+-----------+--------------+
Total number of events (before removing null values): 1359153
Events removed from dataset: 25278


In [11]:
# Collapse the events to one row per (Date, CountryCode): the summed Goldstein
# scale of that day and country, plus the number of events that went into it
goldstein_sum = F.sum('GoldsteinScale').alias('GoldsteinScaleSum')
event_count = F.count('*').alias('EventCount')

df = df.groupBy('Date', 'CountryCode').agg(goldstein_sum, event_count)

In [12]:
# Semicolon-separated mapping file with a header row, located in ./util
mapping_path = os.path.join(os.getcwd(), 'util', 'country_code_mapping.csv')
df_mapping = spark.read.csv(mapping_path, sep=';', header=True, inferSchema=True)

# Attach the ISO 3166-1 alpha-2 code to each FIPS 10-4 country code. The left outer
# join keeps events whose code has no entry in the mapping (their ISO column is null).
fips_matches = df['CountryCode'] == df_mapping['FIPS 10-4']
df = df.join(df_mapping, on=fips_matches, how='left_outer')

In [13]:
# Show the country codes that have no corresponding ISO 3166-1 alpha-2 country code,
# together with the number of events they account for
unmapped_event_count = F.sum('EventCount').alias('EventCount')
df.filter(F.col('ISO 3166-1').isNull()).groupBy('CountryCode').agg(unmapped_event_count).show()

# For example:
# PF (Paracel Islands) -> no equivalent
# NT (Netherlands Antilles) -> no equivalent
# PG (Spratly Islands) -> no equivalent

# Replace the FIPS code columns by the ISO code, which takes over the CountryCode name
df = df.drop('FIPS 10-4', 'CountryCode')
df = df.withColumnRenamed('ISO 3166-1', 'CountryCode')

# Remove rows with no corresponding country code.
# NOTE(review): na.drop() without a subset drops a row when ANY column is null, so
# other columns of the mapping file (if it has any) are checked as well -- confirm
# that this is intended.
df = df.na.drop()

+-----------+----------+
|CountryCode|EventCount|
+-----------+----------+
|         PF|         2|
|         YI|        11|
|         NT|         2|
|         PG|         1|
|         RB|       580|
|         TE|        16|
|         OS|      1616|
|         OC|        60|
+-----------+----------+


In [14]:
# Register the frame as a temporary view (virtual table) so that it can be queried
# by name through the thrift server.
# NOTE(review): temporary views belong to a Spark session; confirm that the thrift
# server's client sessions can see it (e.g. spark.sql.hive.thriftServer.singleSession).
view_name = "GDELT"
df.createOrReplaceTempView(view_name)

In [15]:
from py4j.java_gateway import java_import

# The spark context of the current session gives access to the py4j gateway and JVM
sc = spark.sparkContext

# Make the HiveThriftServer2 class known to the JVM view of the spark context
thrift_server_class = "org.apache.spark.sql.hive.thriftserver.HiveThriftServer2"
java_import(sc._jvm, thrift_server_class)

# main() expects a String[]; an empty Java array is passed because no arguments are needed
java_args = sc._gateway.new_array(sc._gateway.jvm.java.lang.String, 0)

# Start the thrift server through the main method of the imported class
thrift_server = sc._jvm.org.apache.spark.sql.hive.thriftserver.HiveThriftServer2
thrift_server.main(java_args)

1333875