# Init

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
# Get (or create) the notebook's Spark session, registered under the app name "Test".
spark = (
    SparkSession.builder
    .appName("Test")
    .getOrCreate()
)

import pyspark.sql.functions as func

# Import Mentions, Clean, and Write to Parquet

In [2]:
from datetime import datetime

from pyspark.sql.functions import expr, regexp_replace

# Wall-clock timer for the whole cell; the elapsed time is printed at the end.
start = datetime.now()

# Read the GDELT v2 mentions files as headerless, tab-delimited CSV, keep only
# the four columns used below, drop exact duplicate rows, and give the
# positional columns readable names.
# NOTE(review): inferSchema="true" presumably costs an extra scan of the input;
# confirm whether an explicit schema would be acceptable here.
df_mentions = (
    spark.read.format("csv")
    .option("header", "false")
    .option("delimiter", "\t")
    .option("inferSchema", "true")
    .load(path=["s3://gdelt-open-data/v2/mentions/*.mentions.csv"])
    .select(["_c1", "_c4", "_c5", "_c13"])
    .distinct()
    .toDF("EventDate", "MentionSource", "MentionIdentifier", "MentionDocTone")
)

# Discard rows that have a null in any of the four columns.
df_mentions = df_mentions.na.drop(how="any")

# Remove URL scheme and "www." text from the identifier, one pattern per pass,
# applied in this order. Raw strings keep the regex escape `\.` from being
# parsed as an (invalid) Python string escape.
for pattern in (r"http://", r"https://", r"www\."):
    df_mentions = df_mentions.withColumn(
        "MentionIdentifier", regexp_replace("MentionIdentifier", pattern, "")
    )

# Remove the value of MentionSource from MentionIdentifier (SQL replace()).
df_mentions = df_mentions.withColumn(
    "MentionIdentifier", expr("replace(MentionIdentifier,MentionSource,'')")
)

# Truncate EventDate to its first 8 characters, then derive Month (first 6)
# and Year (first 4) from the truncated value.
df_mentions = (
    df_mentions
    .withColumn("EventDate", expr("left(EventDate,8)"))
    .withColumn("Month", expr("left(EventDate,6)"))
    .withColumn("Year", expr("left(EventDate,4)"))
)

# Keep months in the half-open range [201503, 201903).
# Cache the final frame rather than the raw one: both count() and the parquet
# write below consume it, so the cleaning steps are not recomputed and rows
# that were filtered out are never held in the cache.
df_mentions = df_mentions.where(
    (df_mentions.Month >= 201503) & (df_mentions.Month < 201903)
).cache()

print(df_mentions.count())
# printSchema() prints the schema itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
df_mentions.printSchema()

df_mentions.write.parquet("s3://labadie-gdelt-tradewar/mentions.parquet", mode="overwrite")

print(datetime.now() - start)

319104809
root
 |-- EventDate: string (nullable = true)
 |-- MentionSource: string (nullable = true)
 |-- MentionIdentifier: string (nullable = true)
 |-- MentionDocTone: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- Year: string (nullable = true)

None
1:24:16.319871
