In [1]:
import os

# Directory named 'data' under the current working directory; the later cells
# download the GDELT files into it and load them from it.
local_path = os.path.join(os.getcwd(), 'data')

In [2]:
from datetime import datetime, timedelta
# First and last day (both inclusive) of the period to download
start_date = datetime.strptime('2022-02-24', '%Y-%m-%d')
end_date = datetime.strptime('2022-03-5', '%Y-%m-%d')

# One datetime (at midnight) for every day from start_date up to and including end_date
date_list = [
    start_date + timedelta(days=day_offset)
    for day_offset in range((end_date - start_date).days + 1)
]

In [3]:
import urllib.request

# Create a list containing the download url of every 15-minute export file
# (24 hours x 4 quarter-hours) for each day in date_list.
# The urls are generated in chronological order: day -> hour -> quarter-hour.
base_url = 'http://data.gdeltproject.org/gdeltv2/'
urllist = [
    base_url
    + (day + timedelta(hours=hour, minutes=minute)).strftime('%Y%m%d%H%M%S')
    + '.export.CSV.zip'
    for day in date_list
    for hour in range(24)
    for minute in range(0, 60, 15)
]

In [4]:
# Create the local directory if it doesn't exist.
# exist_ok=True makes this a single idempotent call (no separate existence
# check that could race with another process creating the directory).
os.makedirs(local_path, exist_ok=True)

In [5]:
import zipfile

def download_file(url, dest_dir=None):
    """Download one zipped export file and extract it into ``dest_dir``.

    The archive name is taken from the last path segment of ``url``. If a
    file with that name minus the trailing ``.zip`` already exists in
    ``dest_dir``, nothing is downloaded and a message is printed instead.

    Args:
        url: Address of the zip archive to fetch.
        dest_dir: Directory to download and extract into. Defaults to the
            notebook-level ``local_path`` when omitted.

    Returns:
        None. Errors during download or extraction are caught and reported
        with ``print`` so that one bad file does not abort a batch run.
    """
    if dest_dir is None:
        dest_dir = local_path

    fname = url.split('/')[-1]
    zip_path = os.path.join(dest_dir, fname)

    # Strip only the trailing ".zip" of the file name; a ".zip" that happens
    # to appear elsewhere (e.g. in the directory path) must not be touched.
    extracted_name = fname[:-len('.zip')] if fname.endswith('.zip') else fname
    extracted_path = os.path.join(dest_dir, extracted_name)

    # Skip files that were already downloaded and extracted
    if os.path.isfile(extracted_path):
        print('File ' + fname + ' already exists')
        return

    try:
        urllib.request.urlretrieve(url, zip_path)

        # Unzip zip file
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(dest_dir)

    except Exception as e:
        print(f"An error occurred with file {fname}: {e}")

    finally:
        # Delete the zip file on success AND on failure, so that no partial
        # or corrupt archive is left behind in the data directory.
        if zip_path != extracted_path and os.path.isfile(zip_path):
            os.remove(zip_path)

In [6]:
from concurrent.futures import ThreadPoolExecutor

# Download all files from the url list in parallel.
# No max_workers is given, so ThreadPoolExecutor picks its default pool size
# (no. processors * 5 before Python 3.8, min(32, no. processors + 4) since 3.8).
with ThreadPoolExecutor() as executor:
    # executor.map is lazy about results: drain the iterator so that an
    # exception escaping a worker is re-raised here instead of being dropped.
    list(executor.map(download_file, urllist))

File 20220224000000.export.CSV.zip already exists
File 20220224001500.export.CSV.zip already exists
File 20220224003000.export.CSV.zip already exists
File 20220224004500.export.CSV.zip already exists
File 20220224010000.export.CSV.zip already exists
File 20220224011500.export.CSV.zip already exists
File 20220224013000.export.CSV.zip already exists
File 20220224021500.export.CSV.zip already exists
File 20220224014500.export.CSV.zip already exists
File 20220224023000.export.CSV.zip already exists
File 20220224030000.export.CSV.zip already exists
File 20220224020000.export.CSV.zip already exists
File 20220224040000.export.CSV.zip already exists
File 20220224033000.export.CSV.zip already exists
File 20220224034500.export.CSV.zip already exists
File 20220224031500.export.CSV.zip already exists
File 20220224024500.export.CSV.zip already exists
File 20220224043000.export.CSV.zip already exists
File 20220224051500.export.CSV.zip already exists
File 20220224044500.export.CSV.zip already exists


In [7]:
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) the Hive-enabled Spark session for this notebook:
# 6 executor instances with 1 core and 2g of memory each.
spark = (
    SparkSession.builder
    .appName('Big Data Project')
    .config("spark.executor.instances", "6")
    .config("spark.executor.memory", "2g")
    .config("spark.executor.cores", "1")
    .enableHiveSupport()
    .getOrCreate()
)

In [8]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType

# Schema for the tab-separated export csv files: one (column name, Spark type)
# pair per column, in file order. Every column is declared nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        # Event identification and date
        ("GLOBALEVENTID", IntegerType()),
        ("SQLDATE", DateType()),
        ("MonthYear", IntegerType()),
        ("Year", IntegerType()),
        ("FractionDate", FloatType()),
        # Actor 1 attributes
        ("Actor1Code", StringType()),
        ("Actor1Name", StringType()),
        ("Actor1CountryCode", StringType()),
        ("Actor1KnownGroupCode", StringType()),
        ("Actor1EthnicCode", StringType()),
        ("Actor1Religion1Code", StringType()),
        ("Actor1Religion2Code", StringType()),
        ("Actor1Type1Code", StringType()),
        ("Actor1Type2Code", StringType()),
        ("Actor1Type3Code", StringType()),
        # Actor 2 attributes
        ("Actor2Code", StringType()),
        ("Actor2Name", StringType()),
        ("Actor2CountryCode", StringType()),
        ("Actor2KnownGroupCode", StringType()),
        ("Actor2EthnicCode", StringType()),
        ("Actor2Religion1Code", StringType()),
        ("Actor2Religion2Code", StringType()),
        ("Actor2Type1Code", StringType()),
        ("Actor2Type2Code", StringType()),
        ("Actor2Type3Code", StringType()),
        # Event action attributes
        ("IsRootEvent", IntegerType()),
        ("EventCode", StringType()),
        ("EventBaseCode", StringType()),
        ("EventRootCode", StringType()),
        ("QuadClass", IntegerType()),
        ("GoldsteinScale", FloatType()),
        ("NumMentions", IntegerType()),
        ("NumSources", IntegerType()),
        ("NumArticles", IntegerType()),
        ("AvgTone", FloatType()),
        # Actor 1 geography
        ("Actor1Geo_Type", IntegerType()),
        ("Actor1Geo_FullName", StringType()),
        ("Actor1Geo_CountryCode", StringType()),
        ("Actor1Geo_ADM1Code", StringType()),
        ("Actor1Geo_ADM2Code", StringType()),
        ("Actor1Geo_Lat", FloatType()),
        ("Actor1Geo_Long", FloatType()),
        ("Actor1Geo_FeatureID", StringType()),
        # Actor 2 geography
        ("Actor2Geo_Type", IntegerType()),
        ("Actor2Geo_FullName", StringType()),
        ("Actor2Geo_CountryCode", StringType()),
        ("Actor2Geo_ADM1Code", StringType()),
        ("Actor2Geo_ADM2Code", StringType()),
        ("Actor2Geo_Lat", FloatType()),
        ("Actor2Geo_Long", FloatType()),
        ("Actor2Geo_FeatureID", StringType()),
        # Action geography
        ("ActionGeo_Type", IntegerType()),
        ("ActionGeo_FullName", StringType()),
        ("ActionGeo_CountryCode", StringType()),
        ("ActionGeo_ADM1Code", StringType()),
        ("ActionGeo_ADM2Code", StringType()),
        ("ActionGeo_Lat", FloatType()),
        ("ActionGeo_Long", FloatType()),
        ("ActionGeo_FeatureID", StringType()),
        # Data management fields
        ("DATEADDED", StringType()),
        ("SOURCEURL", StringType()),
    ]
])

In [9]:
# Load all extracted export csv files from the data directory into spark.
# The glob restricts the read to '*.export.CSV' so that anything else lying in
# the directory (e.g. a leftover or partially downloaded .zip archive) is not
# parsed as tab-separated data.
df = spark.read.csv(
    os.path.join(local_path, '*.export.CSV'),
    sep='\t',
    header=False,
    schema=schema,
    dateFormat='yyyyMMdd',
)

# Keep only the columns needed for the analysis
df = df.select('SQLDATE', 'GoldsteinScale', 'AvgTone', 'ActionGeo_CountryCode')

In [10]:
# TODO: See how many values are null per column

# Row count before cleaning
row_count = df.count()

# Discard every row that contains at least one null value
df = df.dropna()

# Row count after cleaning
row_count_without_null = df.count()

# Number of rows that were removed because they contained null values
print("Rows with null values:", row_count - row_count_without_null)

Rows with null values: 25278


In [11]:
from pyspark.sql import functions as F

# Collapse the events to a single row per (date, country) pair: sum up the
# Goldstein scale and the average tone and count the contributing events.
df = (
    df.groupBy('SQLDATE', 'ActionGeo_CountryCode')
    .agg(
        F.sum(F.col('GoldsteinScale')).alias('sum_GoldsteinScale'),
        F.sum(F.col('AvgTone')).alias('sum_AvgTone'),
        F.count('*').alias('count_Events'),
    )
)

df.show()

+----------+---------------------+-------------------+-------------------+------------+
|   SQLDATE|ActionGeo_CountryCode| sum_GoldsteinScale|        sum_AvgTone|count_Events|
+----------+---------------------+-------------------+-------------------+------------+
|2022-02-25|                   UP| -5002.199915587902| -88238.59375283867|       23909|
|2021-03-04|                   UP|  63.60000044107437|-172.10480469465256|          45|
|2022-02-25|                   OD| 42.900000900030136|-164.72478967905045|          79|
|2022-03-05|                   CD|-138.59999930858612|-239.16234266757965|          41|
|2022-02-24|                   FO|  8.699999928474426|  20.95740818977356|          10|
|2022-02-24|                   IT|  717.5000024586916|  -853.251463636756|         731|
|2022-03-03|                   DA| 106.49999970197678| -80.69260028749704|         200|
|2022-02-24|                   MV| 28.199999570846558| -33.85475397109985|          20|
|2022-02-25|                   D

In [12]:
from pyspark import StorageLevel

# TODO: Find country codes without counterpart

# Location of the ';'-separated mapping file (with header row)
mapping_path = os.path.join(os.getcwd(), 'util', 'country_code_mapping.csv')

# Load mapping file, letting spark infer the column types
df_mapping = (
    spark.read
    .option('sep', ';')
    .option('header', True)
    .option('inferSchema', True)
    .csv(mapping_path)
)

# Map from FIPS10-4 country code to ISO 3166-1 alpha-2 country code:
#   1. left-join the mapping on the FIPS code,
#   2. drop both the FIPS column of the mapping and the original code column,
#   3. expose the ISO code under the original column name,
#   4. drop rows containing nulls (this removes events whose code found no match).
# NOTE(review): if several FIPS codes map to the same ISO code, the result can
# hold more than one row per date/country - confirm against the mapping file.
df = (
    df.join(
        df_mapping,
        on=df['ActionGeo_CountryCode'] == df_mapping['FIPS 10-4'],
        how='left_outer',
    )
    .drop('FIPS 10-4', 'ActionGeo_CountryCode')
    .withColumnRenamed('ISO 3166-1', 'ActionGeo_CountryCode')
    .dropna()
)
df.show()

+----------+--------------------+--------------------+------------+--------------------+---------------------+
|   SQLDATE|  sum_GoldsteinScale|         sum_AvgTone|count_Events|         CountryName|ActionGeo_CountryCode|
+----------+--------------------+--------------------+------------+--------------------+---------------------+
|2022-02-24|   717.5000024586916|   -853.251463636756|         731|               Italy|                   IT|
|2021-03-04|   63.60000044107437| -172.10480469465256|          45|             Ukraine|                   UA|
|2022-02-25|  -5002.199915587902|  -88238.59375283867|       23909|             Ukraine|                   UA|
|2022-02-24| -14.499999195337296|  -140.0104282796383|          46|            Slovakia|                   SK|
|2022-03-03|  106.49999970197678|  -80.69260028749704|         200|             Denmark|                   DK|
|2022-03-01|   7397.800024934113| -12047.977323267609|        7319|United Kingdom of...|                   GB|
|

In [13]:
# from pyspark.sql.functions import col
# null_values_df = df.filter(col('ISO 3166-1').isNull())
# 
# null_values_df.select(['ActionGeo_CountryCode']).distinct().show()

# from pyspark.sql.functions import col,isnan, when, count
# df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]
#    ).show()

# FIPS 10-4 codes without a one-to-one ISO 3166-1 alpha-2 counterpart:
# WE & GZ (West Bank & Gaza Strip) -> PS (Palestine)
# PF (Paracel Islands) -> no equivalent
# NT (Netherlands Antilles) -> no equivalent
# PG (Spratly Islands) -> no equivalent

In [14]:
# Register the dataframe as the session-scoped temporary view "GDELT"
# (virtual table meant to be queried through the thrift server)
df.createOrReplaceTempView(name="GDELT")

In [15]:
from py4j.java_gateway import java_import

# Start a Hive thrift server from inside the JVM that backs this Spark session.
# NOTE(review): whether the "GDELT" temp view is visible to clients of the
# server depends on the server's session settings (e.g.
# spark.sql.hive.thriftServer.singleSession) - confirm for this deployment.

# Retrieve the spark context from the current spark session
sc = spark.sparkContext

# Import the HiveThriftServer2 class using the JVM instance of the spark context
# (sc._jvm / sc._gateway are private py4j handles of the spark context)
java_import(sc._jvm, "org.apache.spark.sql.hive.thriftserver.HiveThriftServer2")

# Dummy java arguments for main method: an empty (length 0) Java String array
java_args = sc._gateway.new_array(sc._gateway.jvm.java.lang.String, 0)

# Start the thrift server by calling the main method of the imported class
sc._jvm.org.apache.spark.sql.hive.thriftserver.HiveThriftServer2.main(java_args)