In [1]:
import os
import random
import urllib.request

# Folder named 'data' under the current working directory; the downloaded
# files are stored here and later read from here.
local_path = os.path.join(os.getcwd(), 'data')

In [2]:
from datetime import datetime, timedelta

# First and last day (both inclusive) of the period to sample
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 1, 31)

# One midnight timestamp for every calendar day in the period
num_days = (end_date - start_date).days + 1
date_list = []
for day_offset in range(num_days):
    date_list.append(start_date + timedelta(days=day_offset))

In [3]:
# Create a list containing one download url per date, each at a pseudo-random
# time of day that is reproducible across executions.
base_url = 'http://data.gdeltproject.org/gdeltv2/'
urllist = []
index = 0

# enumerate() yields the 1-based counter used for seeding; index - 1 is the
# position currently being visited, so no list search is needed to write the
# timestamp back into date_list.
for index, date in enumerate(date_list, start=1):
    # Re-seed for every date to get the same results with every execution
    random.seed(1234 + index)

    # Hour of the day: random integer between 0 and 23 (inclusive)
    hours = random.randint(0, 23)

    # Minute of the hour: one of 0, 15, 30 or 45
    minutes = random.randint(0, 3) * 15

    # Same calendar day, moved to the chosen hour and minute
    datetmp = date.replace(hour=hours, minute=minutes)

    # Store the chosen timestamp back into date_list at the same position
    date_list[index - 1] = datetmp

    # Create the url (<YYYYMMDDHHMMSS>.export.CSV.zip) and append it to the list
    url = base_url + datetmp.strftime('%Y%m%d%H%M%S') + '.export.CSV.zip'
    urllist.append(url)

In [4]:
# Create the local directory if it doesn't exist. exist_ok=True makes this a
# single call (no check-then-create race) and missing parent directories are
# created as well.
os.makedirs(local_path, exist_ok=True)

In [5]:
import zipfile


def _discard(path):
    """Delete ``path`` if it exists; report (rather than raise) other OS errors."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
    except OSError as e:
        print(f"An error occurred: {e}")


def download_file(url, dest_dir=None):
    """Download a zip archive from ``url`` and extract it into ``dest_dir``.

    The download is skipped when a file named like the archive without its
    trailing ``.zip`` already exists in ``dest_dir``. The archive itself is
    always removed afterwards, and a failed attempt leaves no partial files
    behind, so the directory only ever contains fully extracted files.

    Args:
        url: Address of the archive; the text after the last ``/`` is used
            as the local file name.
        dest_dir: Directory to download and extract into. Defaults to the
            notebook-level ``local_path``.

    Returns:
        None. Errors are printed, not raised, so one failing url does not
        stop the remaining downloads.
    """
    if dest_dir is None:
        # Looked up at call time so the notebook-level setting is honoured
        dest_dir = local_path

    fname = url.split('/')[-1]
    zip_path = os.path.join(dest_dir, fname)

    # Strip only the trailing ".zip" of the file name. Doing the replacement
    # on the joined path would also alter a directory name containing ".zip".
    extracted_name = fname[:-len('.zip')] if fname.endswith('.zip') else fname
    extracted_path = os.path.join(dest_dir, extracted_name)

    # Nothing to do if the extracted file is already present
    if os.path.isfile(extracted_path):
        print('File ' + fname + ' already exists')
        return

    try:
        urllib.request.urlretrieve(url, zip_path)

        # Unzip zip file
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(dest_dir)

    except Exception as e:
        print(f"An error occurred: {e}")
        # The target did not exist before this attempt, so anything at that
        # path now is an incomplete extract; remove it so a later run retries.
        _discard(extracted_path)

    finally:
        # Delete zip file, also after a failure: the whole directory is later
        # read as data, so stray archives must not be left behind.
        _discard(zip_path)

In [6]:
from concurrent.futures import ThreadPoolExecutor

# Download all files from the url list in parallel. The number of worker
# threads is left to ThreadPoolExecutor's default, which differs between
# Python versions.
with ThreadPoolExecutor() as executor:
    # map() only re-raises a worker's exception when its results are consumed,
    # so drain the iterator instead of discarding it.
    list(executor.map(download_file, urllist))

File 20230101224500.export.CSV.zip already exists
File 20230102174500.export.CSV.zip already exists
File 20230103044500.export.CSV.zip already exists
File 20230104004500.export.CSV.zip already exists
File 20230105023000.export.CSV.zip already exists
File 20230106204500.export.CSV.zip already exists
File 20230109233000.export.CSV.zip already exists
File 20230108023000.export.CSV.zip already exists
File 20230107194500.export.CSV.zip already exists
File 20230112050000.export.CSV.zip already exists
File 20230111171500.export.CSV.zip already exists
File 20230113030000.export.CSV.zip already exists
File 20230110080000.export.CSV.zip already exists
File 20230114181500.export.CSV.zip already exists
File 20230115161500.export.CSV.zip already exists
File 20230118094500.export.CSV.zip already exists
File 20230116044500.export.CSV.zip already exists
File 20230117220000.export.CSV.zip already exists
File 20230119231500.export.CSV.zip already exists
File 20230120131500.export.CSV.zip already exists


In [7]:
from pyspark.sql import SparkSession

# Get (or create) the Hive-enabled Spark session for this notebook,
# with spark.cores.max set to 4.
spark = (
    SparkSession.builder
    .appName('Big Data Project')
    .config("spark.cores.max", "4")
    .enableHiveSupport()
    .getOrCreate()
)

In [8]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType

# Define Schema for csv files.
# The column list is assembled from its repeating groups; the order below is
# the column order of the files and must not be changed.

# Attributes repeated for Actor1 and Actor2 (all strings)
_actor_attrs = [
    'Code', 'Name', 'CountryCode', 'KnownGroupCode', 'EthnicCode',
    'Religion1Code', 'Religion2Code', 'Type1Code', 'Type2Code', 'Type3Code',
]

# Attributes repeated for Actor1Geo_, Actor2Geo_ and ActionGeo_
_geo_attrs = [
    ('Type', IntegerType),
    ('FullName', StringType),
    ('CountryCode', StringType),
    ('ADM1Code', StringType),
    ('ADM2Code', StringType),
    ('Lat', FloatType),
    ('Long', FloatType),
    ('FeatureID', StringType),
]

_columns = (
    # Event id and date columns
    [
        ('GLOBALEVENTID', IntegerType),
        ('SQLDATE', DateType),
        ('MonthYear', IntegerType),
        ('Year', IntegerType),
        ('FractionDate', FloatType),
    ]
    # Actor1* followed by Actor2*
    + [(actor + attr, StringType) for actor in ('Actor1', 'Actor2') for attr in _actor_attrs]
    # Event action columns
    + [
        ('IsRootEvent', IntegerType),
        ('EventCode', StringType),
        ('EventBaseCode', StringType),
        ('EventRootCode', StringType),
        ('QuadClass', IntegerType),
        ('GoldsteinScale', FloatType),
        ('NumMentions', IntegerType),
        ('NumSources', IntegerType),
        ('NumArticles', IntegerType),
        ('AvgTone', FloatType),
    ]
    # Actor1Geo_*, Actor2Geo_*, ActionGeo_*
    + [
        (prefix + 'Geo_' + attr, spark_type)
        for prefix in ('Actor1', 'Actor2', 'Action')
        for attr, spark_type in _geo_attrs
    ]
    # Trailing bookkeeping columns
    + [
        ('DATEADDED', StringType),
        ('SOURCEURL', StringType),
    ]
)

# Every column is nullable
schema = StructType([StructField(name, spark_type(), True) for name, spark_type in _columns])

In [9]:
# Read every file in the data directory as headerless, tab-separated text using
# the explicit schema, keeping only the columns needed downstream.
df = (
    spark.read
    .csv(local_path, schema=schema, sep='\t', header=False, dateFormat='yyyyMMdd')
    .select('SQLDATE', 'GoldsteinScale', 'AvgTone', 'ActionGeo_CountryCode')
)

In [10]:
# TODO: Find country codes without counterpart

# Load the ';'-separated mapping file (with header row) from the util folder
mapping_path = os.path.join(os.getcwd(), 'util', 'country_code_mapping.csv')
df_mapping = spark.read.csv(mapping_path, sep=';', header=True, inferSchema=True)

# Map from FIPS10-4 country code to ISO 3166-1 alpha-2 country code:
# left-join on the FIPS code so events without a match are kept, drop both
# FIPS-based columns, then let the ISO column take over the original name.
fips_matches = df['ActionGeo_CountryCode'] == df_mapping['FIPS 10-4']
df = (
    df.join(df_mapping, on=fips_matches, how='left_outer')
    .drop('FIPS 10-4', 'ActionGeo_CountryCode')
    .withColumnRenamed('ISO 3166-1', 'ActionGeo_CountryCode')
)

In [11]:
# TODO: See how many values are null per column

# Row total before cleaning
row_count = df.count()

# Keep only rows in which every column has a value
df = df.dropna()

# Row total after cleaning
row_count_without_null = df.count()

# Number of rows that contained at least one null value
null_row_count = row_count - row_count_without_null
print("Rows with null values:", null_row_count)

Rows with null values: 1109


In [12]:
# Register the cleaned DataFrame as the temporary view "GDELT" (replacing any
# existing view of that name) so it can be queried with SQL, e.g. through the
# thrift server.
# NOTE(review): temp views are presumably scoped to this SparkSession - confirm
# the thrift server's client sessions can actually see it (e.g. via
# spark.sql.hive.thriftServer.singleSession or a global temp view).
df.createOrReplaceTempView("GDELT")

In [13]:
from py4j.java_gateway import java_import

# Spark context of the current session and the JVM view it exposes
sc = spark.sparkContext
jvm = sc._jvm

# Make the HiveThriftServer2 class known to that JVM view
java_import(jvm, "org.apache.spark.sql.hive.thriftserver.HiveThriftServer2")

# main() expects a String[]; pass an empty one as dummy arguments
java_args = sc._gateway.new_array(jvm.java.lang.String, 0)

# Start the thrift server by calling the main method of the imported class
jvm.org.apache.spark.sql.hive.thriftserver.HiveThriftServer2.main(java_args)