In [1]:
from pyspark.sql import SparkSession

# Start a Spark session named 'dataprep2', or attach to one that is already
# running; every later cell reads data through this handle.
spark = (
    SparkSession.builder
    .appName('dataprep2')
    .getOrCreate()
)

In [2]:
import os
import random
import urllib.request
from datetime import datetime, timedelta
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

# Downloads and extracted files live in a 'data' folder under the directory
# the notebook is started from.
working_dir = os.getcwd()
local_path = os.path.join(working_dir, 'data')

In [3]:
# Date window used to pick the GDELT files to sample (midnight timestamps).
start_date = datetime(2020, 1, 1)
end_date = datetime(2020, 1, 31)

In [4]:
# One calendar day per entry, from start_date through end_date.
# The "+ 1" keeps end_date itself in the window; range() excludes its upper
# bound, so without it the final day was silently dropped.
num_days = (end_date - start_date).days + 1
days = [start_date + timedelta(days=offset) for offset in range(num_days)]

# For every day, pick one pseudo-random quarter-hour timestamp and build the
# URL of the export archive named after it.
# NOTE(review): assumes GDELT v2 export files exist for every 15-minute
# slot -- confirm against the GDELT file listing.
base_url = 'http://data.gdeltproject.org/gdeltv2/'
date_list = []   # sampled timestamps; the unzip cell rebuilds file names from these
urllist = []     # download URLs, in the same order as date_list

for index, day in enumerate(days, start=1):
    # A private generator seeded per day produces the same draws as
    # random.seed(1234 + index) followed by module-level calls, but leaves
    # the global RNG state untouched.
    rng = random.Random(1234 + index)
    hours = rng.randint(0, 23)          # hour of day: 0..23
    minutes = rng.randint(0, 3) * 15    # quarter-hour: 0, 15, 30 or 45

    # Collect results in fresh lists instead of rewriting the list that is
    # being iterated (which also needed an O(n) index() lookup per day).
    datetmp = day.replace(hour=hours, minute=minutes)
    date_list.append(datetmp)

    # Create the url and append it to the list
    url = base_url + datetmp.strftime('%Y%m%d%H%M%S') + '.export.CSV.zip'
    urllist.append(url)

In [5]:
# Create the local directory if it doesn't exist. exist_ok=True makes this
# cell safe to re-run and removes the check-then-create race of
# isdir() followed by mkdir().
os.makedirs(local_path, exist_ok=True)

In [6]:
# Download every sampled archive that is not already available locally.
for url in urllist:
    fname = url.split('/')[-1]
    zip_path = os.path.join(local_path, fname)
    # The unzip cell deletes each archive after extracting it, so an already
    # extracted CSV (archive name without '.zip') also means "nothing to do".
    csv_path = os.path.splitext(zip_path)[0]

    if os.path.isfile(zip_path):
        print('File ' + fname + ' already exists')
        continue
    if os.path.isfile(csv_path):
        print('File ' + fname + ' already downloaded and extracted')
        continue

    print('Downloading ' + fname)
    # Fetch under a temporary name and rename only when the transfer has
    # finished, so an interrupted download is never mistaken for a complete
    # archive on the next run.
    part_path = zip_path + '.part'
    try:
        urllib.request.urlretrieve(url, part_path)
    except urllib.error.URLError as exc:
        # Covers HTTP errors (e.g. a slot with no published file) and
        # truncated transfers; report it and carry on with the other days.
        print('Failed to download ' + fname + ': ' + str(exc))
    else:
        os.replace(part_path, zip_path)
    finally:
        # Never leave a partial file behind, whatever went wrong.
        if os.path.exists(part_path):
            os.remove(part_path)

Downloading 20200101224500.export.CSV.zip
Downloading 20200102174500.export.CSV.zip
Downloading 20200103044500.export.CSV.zip
Downloading 20200104004500.export.CSV.zip
Downloading 20200105023000.export.CSV.zip
Downloading 20200106204500.export.CSV.zip
Downloading 20200107194500.export.CSV.zip
Downloading 20200108023000.export.CSV.zip
Downloading 20200109233000.export.CSV.zip
Downloading 20200110080000.export.CSV.zip
Downloading 20200111171500.export.CSV.zip
Downloading 20200112050000.export.CSV.zip
Downloading 20200113030000.export.CSV.zip
Downloading 20200114181500.export.CSV.zip
Downloading 20200115161500.export.CSV.zip
Downloading 20200116044500.export.CSV.zip
Downloading 20200117220000.export.CSV.zip
Downloading 20200118094500.export.CSV.zip
Downloading 20200119231500.export.CSV.zip
Downloading 20200120131500.export.CSV.zip
Downloading 20200121123000.export.CSV.zip
Downloading 20200122170000.export.CSV.zip
Downloading 20200123164500.export.CSV.zip
Downloading 20200124043000.export.

In [7]:
# Unzip the files and delete the zip files
import zipfile

for stamp in date_list:
    # Rebuild the archive name from the sampled timestamp.
    fname = stamp.strftime('%Y%m%d%H%M%S') + '.export.CSV.zip'
    zip_path = os.path.join(local_path, fname)

    if not os.path.isfile(zip_path):
        print('File ' + fname + ' does not exist')
        continue

    print('Unzipping ' + fname)
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(local_path)
    except zipfile.BadZipFile:
        # A truncated or corrupt download must not abort the remaining
        # files; it is reported here and removed below so that re-running
        # the download cell fetches a fresh copy.
        print('File ' + fname + ' is not a valid zip archive; removing it')

    # The archive is no longer needed once extracted (or found unusable).
    os.remove(zip_path)

Unzipping 20200101224500.export.CSV.zip
Unzipping 20200102174500.export.CSV.zip
Unzipping 20200103044500.export.CSV.zip
Unzipping 20200104004500.export.CSV.zip
Unzipping 20200105023000.export.CSV.zip
Unzipping 20200106204500.export.CSV.zip
Unzipping 20200107194500.export.CSV.zip
Unzipping 20200108023000.export.CSV.zip
Unzipping 20200109233000.export.CSV.zip
Unzipping 20200110080000.export.CSV.zip
Unzipping 20200111171500.export.CSV.zip
Unzipping 20200112050000.export.CSV.zip
Unzipping 20200113030000.export.CSV.zip
Unzipping 20200114181500.export.CSV.zip
Unzipping 20200115161500.export.CSV.zip
Unzipping 20200116044500.export.CSV.zip
Unzipping 20200117220000.export.CSV.zip
Unzipping 20200118094500.export.CSV.zip
Unzipping 20200119231500.export.CSV.zip
Unzipping 20200120131500.export.CSV.zip
Unzipping 20200121123000.export.CSV.zip
Unzipping 20200122170000.export.CSV.zip
Unzipping 20200123164500.export.CSV.zip
Unzipping 20200124043000.export.CSV.zip
Unzipping 20200125121500.export.CSV.zip


In [8]:
# Column layout of the tab-separated export files, declared in file order.
# The repeated Actor1/Actor2 and *Geo_ column groups are generated from
# attribute lists instead of being spelled out; every column is nullable.
_actor_attrs = [
    'Code', 'Name', 'CountryCode', 'KnownGroupCode', 'EthnicCode',
    'Religion1Code', 'Religion2Code', 'Type1Code', 'Type2Code', 'Type3Code',
]
_geo_attrs = [
    ('Type', IntegerType),
    ('FullName', StringType),
    ('CountryCode', StringType),
    ('ADM1Code', StringType),
    ('ADM2Code', StringType),
    ('Lat', FloatType),
    ('Long', FloatType),
    ('FeatureID', StringType),
]

# Event identifier and date columns.
_columns = [
    ('GLOBALEVENTID', IntegerType),
    ('SQLDATE', IntegerType),
    ('MonthYear', IntegerType),
    ('Year', IntegerType),
    ('FractionDate', FloatType),
]

# Ten string attributes for each of the two actors.
for _actor in ('Actor1', 'Actor2'):
    _columns += [(_actor + _attr, StringType) for _attr in _actor_attrs]

# Event action columns.
_columns += [
    ('IsRootEvent', IntegerType),
    ('EventCode', StringType),
    ('EventBaseCode', StringType),
    ('EventRootCode', StringType),
    ('QuadClass', IntegerType),
    ('GoldsteinScale', FloatType),
    ('NumMentions', IntegerType),
    ('NumSources', IntegerType),
    ('NumArticles', IntegerType),
    ('AvgTone', FloatType),
]

# Eight geography columns for each of the three location groups.
for _geo in ('Actor1Geo', 'Actor2Geo', 'ActionGeo'):
    _columns += [(_geo + '_' + _attr, _dtype) for _attr, _dtype in _geo_attrs]

# Trailing bookkeeping columns.
_columns += [
    ('DATEADDED', StringType),
    ('SOURCEURL', StringType),
]

schema = StructType([
    StructField(_name, _dtype(), True) for _name, _dtype in _columns
])

In [9]:
# Read only the extracted export files. Pointing Spark at the bare directory
# would also parse anything else stored there (for example archives that
# have not been unzipped yet) as tab-separated text.
events_path = os.path.join(local_path, '*.export.CSV')
events_raw = spark.read.csv(events_path, sep='\t', header=False, schema=schema)

# Keep just the columns inspected in the cells below.
df = events_raw.select('SQLDATE', 'GoldsteinScale', 'AvgTone', 'ActionGeo_CountryCode')

In [10]:
# Preview the first 20 rows of the selected columns.
df.show(n=20)

+--------+--------------+----------+---------------------+
| SQLDATE|GoldsteinScale|   AvgTone|ActionGeo_CountryCode|
+--------+--------------+----------+---------------------+
|20100106|           0.0|-2.3489933|                   RS|
|20100106|           0.0|-2.3489933|                   RS|
|20190104|           1.9|-5.4992766|                   US|
|20190104|           1.9|-5.4992766|                   US|
|20190104|           0.0|-4.4692736|                   CA|
|20190104|          -5.0|-2.7925532|                   US|
|20190104|           2.8|-5.4992766|                   US|
|20190104|           2.8|-5.4992766|                   US|
|20190104|         -10.0| -2.056698|                   CH|
|20190104|         -10.0| -2.056698|                   BM|
|20190104|         -10.0| -2.056698|                   CH|
|20190104|         -10.0| -2.056698|                   BM|
|20190104|          -5.0|-11.176471|                   US|
|20190104|           3.0| 1.2302284|                   U

In [11]:
# Count the rows across all loaded files; the bare name on the last line
# makes the notebook display the number.
row_total = df.count()
row_total

DataFrame[SQLDATE: int, GoldsteinScale: float, AvgTone: float, ActionGeo_CountryCode: string]

In [13]:
# Show a larger sample (60 rows).
# NOTE(review): the recorded run of this cell ended in a
# SparkFileNotFoundException for one of the extracted CSV files; the error
# text itself suggests re-creating the DataFrame after the files under
# local_path change -- confirm the files are visible to the executors.
df.show(n=60)

Py4JJavaError: An error occurred while calling o47.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 4.0 failed 4 times, most recent failure: Lost task 0.3 in stage 4.0 (TID 34) (172.21.0.10 executor 0): org.apache.spark.SparkFileNotFoundException: File file:/home/jovyan/work/data/20200104004500.export.CSV does not exist
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.readCurrentFileNotFoundError(QueryExecutionErrors.scala:780)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:220)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:279)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.execution.columnar.DefaultCachedBatchSerializer$$anon$1.hasNext(InMemoryRelation.scala:119)
	at org.apache.spark.sql.execution.columnar.CachedRDDBuilder$$anon$2.hasNext(InMemoryRelation.scala:286)
	at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:223)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:302)
	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1601)
	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1528)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1592)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1389)
	at org.apache.spark.storage.BlockManager.getOrElseUpdateRDDBlock(BlockManager.scala:1343)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:376)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:326)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:840)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2844)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2780)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2779)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2779)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1242)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3048)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2982)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2971)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: org.apache.spark.SparkFileNotFoundException: File file:/home/jovyan/work/data/20200104004500.export.CSV does not exist
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.readCurrentFileNotFoundError(QueryExecutionErrors.scala:780)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:220)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:279)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.execution.columnar.DefaultCachedBatchSerializer$$anon$1.hasNext(InMemoryRelation.scala:119)
	at org.apache.spark.sql.execution.columnar.CachedRDDBuilder$$anon$2.hasNext(InMemoryRelation.scala:286)
	at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:223)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:302)
	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1601)
	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1528)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1592)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1389)
	at org.apache.spark.storage.BlockManager.getOrElseUpdateRDDBlock(BlockManager.scala:1343)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:376)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:326)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:840)
