In [1]:
import findspark

# findspark is initialised before pyspark is imported; presumably this is what
# makes the pyspark import below resolve, so keep the statement order.
findspark.init()

from pyspark.sql import SparkSession

# Build the Spark session for this notebook under the application name 'dataprep'.
session_builder = SparkSession.builder.appName('dataprep')
spark = session_builder.getOrCreate()

In [15]:
# Download the data from the GDELT project for the last `num_days` days (`num_days` is a parameter defined below).

import os
import urllib.error
import urllib.request
import zipfile
from datetime import datetime, timedelta, timezone

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType


In [3]:
# Parameters of the download

# Local directory in which the downloaded data files are stored
local_path = os.path.join(os.getcwd(), 'data')

# Base URL of the GDELT project archives
base_url = 'http://data.gdeltproject.org/gdeltv2/'

# Number of days of data to download
num_days = 1


In [4]:
# Create the local directory (and any missing parent directories) if it
# doesn't exist. exist_ok=True makes this a no-op when the directory is already
# there, without a separate check-then-create step.
os.makedirs(local_path, exist_ok=True)


In [5]:
# Create the list of timestamps for which to download the data.
# Each timestamp has the form YYYYMMDDHHMMSS; the minutes fall on a 15-minute
# boundary and the seconds are always zero.

# GDELT file names carry UTC timestamps, so the window is computed in UTC
# rather than in the local time zone. The clock is read once so that the whole
# window is based on a single instant.
now_utc = datetime.now(timezone.utc)

# Round down to the most recent 15-minute boundary
rounded_time = now_utc.replace(minute=(now_utc.minute // 15) * 15, second=0, microsecond=0)

# Start of the download window
start_date = rounded_time - timedelta(days=num_days)

# One timestamp per 15-minute interval, from start_date up to and including
# rounded_time. The list is empty when num_days is negative.
interval = timedelta(minutes=15)
num_intervals = (rounded_time - start_date) // interval
date_strings = [
    (start_date + step * interval).strftime('%Y%m%d%H%M') + '00'
    for step in range(num_intervals + 1)
]


In [6]:
# Build one archive URL per timestamp: <base_url><timestamp>.export.CSV.zip
urls = [base_url + stamp + '.export.CSV.zip' for stamp in date_strings]


In [7]:
# Download the files from the urls and save them to the local directory data.
for url in urls:
    fname = url.split('/')[-1]
    zip_path = os.path.join(local_path, fname)

    # The unzip step removes each archive after extracting it, so an already
    # extracted CSV also counts as "present". This assumes the extracted file
    # is the archive name without the trailing ".zip", which is the name the
    # file-listing cell looks for.
    csv_path = zip_path[:-len('.zip')]
    if os.path.isfile(zip_path) or os.path.isfile(csv_path):
        print('File ' + fname + ' already exists')
        continue

    print('Downloading ' + fname)
    # Download to a temporary name and rename on success, so an interrupted
    # transfer never leaves a truncated archive under the final name.
    part_path = zip_path + '.part'
    try:
        urllib.request.urlretrieve(url, part_path)
    except urllib.error.URLError as err:
        # HTTPError (e.g. 404 for an interval that has no file) and
        # ContentTooShortError are both URLError subclasses. Skip this file
        # and keep going instead of aborting the whole loop.
        print('Could not download ' + fname + ': ' + str(err))
        if os.path.isfile(part_path):
            os.remove(part_path)
        continue
    os.replace(part_path, zip_path)



Downloading 20231218183000.export.CSV.zip
Downloading 20231218184500.export.CSV.zip
Downloading 20231218190000.export.CSV.zip
Downloading 20231218191500.export.CSV.zip
Downloading 20231218193000.export.CSV.zip
Downloading 20231218194500.export.CSV.zip
Downloading 20231218200000.export.CSV.zip
Downloading 20231218201500.export.CSV.zip
Downloading 20231218203000.export.CSV.zip
Downloading 20231218204500.export.CSV.zip
Downloading 20231218210000.export.CSV.zip
Downloading 20231218211500.export.CSV.zip
Downloading 20231218213000.export.CSV.zip
Downloading 20231218214500.export.CSV.zip
Downloading 20231218220000.export.CSV.zip
Downloading 20231218221500.export.CSV.zip
Downloading 20231218223000.export.CSV.zip
Downloading 20231218224500.export.CSV.zip
Downloading 20231218230000.export.CSV.zip
Downloading 20231218231500.export.CSV.zip
Downloading 20231218233000.export.CSV.zip
Downloading 20231218234500.export.CSV.zip
Downloading 20231219000000.export.CSV.zip
Downloading 20231219001500.export.

In [8]:
# Unzip the files and delete the zip files
for date in date_strings:
    fname = date + '.export.CSV.zip'
    zip_path = os.path.join(local_path, fname)
    if not os.path.isfile(zip_path):
        print('File ' + fname + ' does not exist')
        continue

    print('Unzipping ' + fname)
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(local_path)
    except zipfile.BadZipFile as err:
        # A corrupt or truncated archive must not abort the remaining files.
        # It is still removed below so that a later download fetches it again.
        print('Skipping corrupt archive ' + fname + ': ' + str(err))
    os.remove(zip_path)


Unzipping 20231218183000.export.CSV.zip
Unzipping 20231218184500.export.CSV.zip
Unzipping 20231218190000.export.CSV.zip
Unzipping 20231218191500.export.CSV.zip
Unzipping 20231218193000.export.CSV.zip
Unzipping 20231218194500.export.CSV.zip
Unzipping 20231218200000.export.CSV.zip
Unzipping 20231218201500.export.CSV.zip
Unzipping 20231218203000.export.CSV.zip
Unzipping 20231218204500.export.CSV.zip
Unzipping 20231218210000.export.CSV.zip
Unzipping 20231218211500.export.CSV.zip
Unzipping 20231218213000.export.CSV.zip
Unzipping 20231218214500.export.CSV.zip
Unzipping 20231218220000.export.CSV.zip
Unzipping 20231218221500.export.CSV.zip
Unzipping 20231218223000.export.CSV.zip
Unzipping 20231218224500.export.CSV.zip
Unzipping 20231218230000.export.CSV.zip
Unzipping 20231218231500.export.CSV.zip
Unzipping 20231218233000.export.CSV.zip
Unzipping 20231218234500.export.CSV.zip
Unzipping 20231219000000.export.CSV.zip
Unzipping 20231219001500.export.CSV.zip
Unzipping 20231219003000.export.CSV.zip


In [9]:
# Collect the names of the extracted data files that are present on disk,
# in the same order as date_strings.
candidate_names = (stamp + '.export.CSV' for stamp in date_strings)
files = [
    name
    for name in candidate_names
    if os.path.isfile(os.path.join(local_path, name))
]

In [36]:
# Column layout used when reading the tab-separated export files, which have
# no header row. Columns are matched by position, so the order below matters.
# Every column is nullable.
# NOTE(review): FloatType is single precision; confirm that this is enough for
# the coordinate and date-fraction columns.
_I, _F, _S = IntegerType, FloatType, StringType

_schema_columns = [
    # Event identification and date
    ("GLOBALEVENTID", _I), ("SQLDATE", _I), ("MonthYear", _I), ("Year", _I),
    ("FractionDate", _F),
    # Actor 1
    ("Actor1Code", _S), ("Actor1Name", _S), ("Actor1CountryCode", _S),
    ("Actor1KnownGroupCode", _S), ("Actor1EthnicCode", _S),
    ("Actor1Religion1Code", _S), ("Actor1Religion2Code", _S),
    ("Actor1Type1Code", _S), ("Actor1Type2Code", _S), ("Actor1Type3Code", _S),
    # Actor 2
    ("Actor2Code", _S), ("Actor2Name", _S), ("Actor2CountryCode", _S),
    ("Actor2KnownGroupCode", _S), ("Actor2EthnicCode", _S),
    ("Actor2Religion1Code", _S), ("Actor2Religion2Code", _S),
    ("Actor2Type1Code", _S), ("Actor2Type2Code", _S), ("Actor2Type3Code", _S),
    # Event action
    ("IsRootEvent", _I), ("EventCode", _S), ("EventBaseCode", _S),
    ("EventRootCode", _S), ("QuadClass", _I), ("GoldsteinScale", _F),
    ("NumMentions", _I), ("NumSources", _I), ("NumArticles", _I),
    ("AvgTone", _F),
    # Actor 1 geography
    ("Actor1Geo_Type", _I), ("Actor1Geo_FullName", _S),
    ("Actor1Geo_CountryCode", _S), ("Actor1Geo_ADM1Code", _S),
    ("Actor1Geo_ADM2Code", _S), ("Actor1Geo_Lat", _F), ("Actor1Geo_Long", _F),
    ("Actor1Geo_FeatureID", _S),
    # Actor 2 geography
    ("Actor2Geo_Type", _I), ("Actor2Geo_FullName", _S),
    ("Actor2Geo_CountryCode", _S), ("Actor2Geo_ADM1Code", _S),
    ("Actor2Geo_ADM2Code", _S), ("Actor2Geo_Lat", _F), ("Actor2Geo_Long", _F),
    ("Actor2Geo_FeatureID", _S),
    # Action geography
    ("ActionGeo_Type", _I), ("ActionGeo_FullName", _S),
    ("ActionGeo_CountryCode", _S), ("ActionGeo_ADM1Code", _S),
    ("ActionGeo_ADM2Code", _S), ("ActionGeo_Lat", _F), ("ActionGeo_Long", _F),
    ("ActionGeo_FeatureID", _S),
    # Data management
    ("DATEADDED", _S), ("SOURCEURL", _S),
]

schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in _schema_columns
])



In [37]:
# Create the dataframe from the first data file.
# Fail with a descriptive message when no data files were found, instead of
# the bare IndexError that files[0] would raise on an empty list.
if not files:
    raise FileNotFoundError('No GDELT export files found in ' + local_path)
df = spark.read.csv(os.path.join(local_path, files[0]), sep='\t', header=False, schema=schema)
#columns_to_keep = ['column1', 'column2', 'column3']
#df = df.select(*columns_to_keep)


In [None]:
# Append the data from the remaining files to the dataframe.
# spark.read.csv accepts a list of paths, so all remaining files are loaded in
# a single read and appended with one union instead of one union per file.
# NOTE: this cell extends df in place, so running it twice appends the rows twice.
remaining_paths = [os.path.join(local_path, file) for file in files[1:]]
if remaining_paths:
    dftmp = spark.read.csv(remaining_paths, sep='\t', header=False, schema=schema)
    #columns_to_keep = ['column1', 'column2', 'column3']
    #dftmp = dftmp.select(*columns_to_keep)
    df = df.union(dftmp)


In [None]:
# Save the dataframe to a CSV file
#df.write.csv('gdelt.csv', header=True)


In [38]:
# Preview the first 20 rows of the dataframe
df.show(n=20)

+-------------+--------+---------+----+------------+----------+------------------+-----------------+--------------------+----------------+-------------------+-------------------+---------------+---------------+---------------+----------+------------------+-----------------+--------------------+----------------+-------------------+-------------------+---------------+---------------+---------------+-----------+---------+-------------+-------------+---------+--------------+-----------+----------+-----------+-----------+--------------+--------------------+---------------------+------------------+------------------+-------------+--------------+-------------------+--------------+--------------------+---------------------+------------------+------------------+-------------+--------------+-------------------+--------------+--------------------+---------------------+------------------+------------------+-------------+--------------+-------------------+--------------+--------------------+
|GLOBA