# Init

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder \
     .appName("Test") \
     .getOrCreate()

In [2]:
import pandas as pd
pd.options.display.max_rows=250
import numpy as np
from datetime import datetime

import matplotlib.pyplot as plt
plt.style.use('seaborn')
import matplotlib.ticker as ticker
from matplotlib.ticker import FormatStrFormatter, FuncFormatter

import pyspark.sql.functions as func
from pyspark.sql.functions import expr, regexp_replace

# Dataset Imports

In [3]:
start = datetime.now()

df_mentions = spark.read.parquet("s3://labadie-gdelt-tradewar/mentions.parquet")
df_mentions = df_mentions.withColumn("MentionDocTone",func.col("MentionDocTone").cast("float"))
df_mentions.cache()

print(df_mentions.count())

print(df_mentions.printSchema())

print(datetime.now()-start)

319104809
root
 |-- EventDate: string (nullable = true)
 |-- MentionSource: string (nullable = true)
 |-- MentionIdentifier: string (nullable = true)
 |-- MentionDocTone: float (nullable = true)
 |-- Month: string (nullable = true)
 |-- Year: string (nullable = true)

None
0:03:31.615381


In [4]:
df_mentions.show()

+---------+--------------------+--------------------+--------------+------+----+
|EventDate|       MentionSource|   MentionIdentifier|MentionDocTone| Month|Year|
+---------+--------------------+--------------------+--------------+------+----+
| 20150306|          hitfix.com|/motion-captured/...|     -2.728513|201503|2015|
| 20150306|          fijione.tv|/isis-claims-amer...|    -14.444445|201503|2015|
| 20150306| billingsgazette.com|/news/state-and-r...|     -10.36036|201503|2015|
| 20150306|          news24.com|/SouthAfrica/News...|     -6.553398|201503|2015|
| 20150306|         cnsnews.com|/news/article/col...|     -2.857143|201503|2015|
| 20150306|         recorder.ca|/2015/03/06/boy-1...|     -9.756098|201503|2015|
| 20150306|   yorknewstimes.com|/news/authorities...|      6.451613|201503|2015|
| 20150306|            kmbc.com|/news/ferguson-le...|    -1.8181819|201503|2015|
| 20150306|          kagstv.com|/News/KAGSNews/ID...|     0.4796163|201503|2015|
| 20150306|          mondaq.

# EDA Across Fields
We could show how many countries are represented by the dataset.  Might just be interesting to show the breadth of the data.

#### Num Sources Total

In [5]:
print("Num Unique Publishers",df_mentions.select(["MentionSource"]).distinct().count())

Num Unique Publishers 108411


#### Publishers That Published at Least Once a Month and Mentioned Trump

In [6]:
# find sources that published at least once a month
mentions_by_month=df_mentions.groupby(["MentionSource","Month"]).count()
max_months = mentions_by_month.groupby("MentionSource").count().agg(func.max("count").alias("max_count")).take(1)[0].max_count
sources_all_months=mentions_by_month.groupby("MentionSource").count().where(func.col("count")==max_months). \
    selectExpr("MentionSource as x")
sources_all_months.cache()

# find sources who had a URL that included trump at least once
also_mentioned_trump=df_mentions.join(sources_all_months, df_mentions.MentionSource==sources_all_months.x).drop("x")
also_mentioned_trump=also_mentioned_trump.where(df_mentions.MentionIdentifier.rlike('trump')). \
    selectExpr("MentionSource as y").distinct()
also_mentioned_trump.cache()

# build a filtered dataset
filtered_mentions=df_mentions.join(also_mentioned_trump,df_mentions.MentionSource==also_mentioned_trump.y).drop("y")
filtered_mentions.cache()

DataFrame[EventDate: string, MentionSource: string, MentionIdentifier: string, MentionDocTone: float, Month: string, Year: string]

In [7]:
print("Num Publishers At Least Once a Month", sources_all_months.count())
print("Num Publishers At Least Once a Month and Mentioned Trump", also_mentioned_trump.count())
print("    Num Articles",filtered_mentions.count())

Num Publishers At Least Once a Month 6883
Num Publishers At Least Once a Month and Mentioned Trump 5726
    Num Articles 226989999


In [8]:
print("Num Articles with Zero Tone:",filtered_mentions.where(filtered_mentions.MentionDocTone==0).count())

Num Articles with Zero Tone: 5172066


In [9]:
print("Publishers:",
      filtered_mentions.where(filtered_mentions.MentionDocTone!=0).select("MentionSource").distinct().count())
print("Articles:",filtered_mentions.where(filtered_mentions.MentionDocTone!=0).count())

Publishers: 5726
Articles: 221817927


#### Store Filtered Dataset in S3

In [10]:
filtered_mentions=filtered_mentions.where(filtered_mentions.MentionDocTone!=0)
filtered_mentions.write.parquet("s3://labadie-gdelt-tradewar/filtered_mentions.parquet", mode="overwrite")

#### Stats on That Data

In [13]:
filtered_mentions.orderBy("MentionDocTone",ascending=False).take(10)

[Row(EventDate='20170221', MentionSource='iheart.com', MentionIdentifier='kmag991./onair/maverick-48550/the-new-people-of-walmart-pics-15581180/', MentionDocTone=58.16023635864258, Month='201702', Year='2017'),
 Row(EventDate='20181114', MentionSource='dailystar.co.uk', MentionIdentifier='/news/latest-news/742410/Prince-Charles-birthday-Prince-of-Wales-70th-birthday-party-who-will-attend', MentionDocTone=36.3636360168457, Month='201811', Year='2018'),
 Row(EventDate='20180820', MentionSource='fltimes.com', MentionIdentifier='/briefs/midlakes-middle-school-hosts-awards-ceremony/article_9625d27a-3b7b-5a7a-a5b5-195d2fc959d2.html', MentionDocTone=33.50983428955078, Month='201808', Year='2018'),
 Row(EventDate='20180820', MentionSource='fltimes.com', MentionIdentifier='/briefs/midlakes-middle-school-hosts-awards-ceremony/article_9625d27a-3b7b-5a7a-a5b5-195d2fc959d2.html', MentionDocTone=33.50983428955078, Month='201808', Year='2018'),
 Row(EventDate='20170306', MentionSource='ghanaweb.com',

In [28]:
print("Avg Tone",filtered_mentions.where(filtered_mentions.MentionDocTone!=0). \
      select(filtered_mentions.MentionDocTone.cast('float')).agg(func.mean("MentionDocTone")).show())
print("Min Tone",filtered_mentions.where(filtered_mentions.MentionDocTone!=0). \
      select(filtered_mentions.MentionDocTone.cast('float')).agg(func.min("MentionDocTone")).show())
print("Max Tone",filtered_mentions.where(filtered_mentions.MentionDocTone!=0). \
      select(filtered_mentions.MentionDocTone.cast('float')).agg(func.max("MentionDocTone")).show())

+-------------------+
|avg(MentionDocTone)|
+-------------------+
|-3.4193885499365715|
+-------------------+

Avg Tone None
+-------------------+
|min(MentionDocTone)|
+-------------------+
|          -77.89855|
+-------------------+

Min Tone None
+-------------------+
|max(MentionDocTone)|
+-------------------+
|          58.160236|
+-------------------+

Max Tone None


# Basic Plots

### Plotting functions

In [38]:
def plot_line_over_time(x,y_dict,y_axis_label,filename):
    fig, ax = plt.subplots(figsize=(10,5))

    data_to_plot = list(y_dict.items())
    
    # first set of data
    ax.plot(x,data_to_plot[0][1],label=data_to_plot[0][0])
    ax.set_xlabel("Week of Year")
    ax.tick_params(axis="x", rotation=30)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(4))
    ax.set_ylabel(y_axis_label)
    ax.yaxis.set_major_formatter(FuncFormatter(lambda x, p: format(int(x), ',')))

    plt.legend()
    plt.tight_layout()
    plt.savefig("Plots/"+filename+".png", 
                facecolor=fig.get_facecolor(), 
                edgecolor='none')
    
    
def plot_line_over_time_two_y_axis(x,y_dict,filename):
    fig, ax = plt.subplots(figsize=(10,5))

    data_to_plot = list(y_dict.items())
    
    # first set of data
    ax.plot(x,data_to_plot[0][1],label=data_to_plot[0][0])
    ax.set_xlabel("Week of Year")
    ax.tick_params(axis="x", rotation=30)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(4))
    ax.set_ylabel(data_to_plot[0][0])
    ax.yaxis.set_major_formatter(FuncFormatter(lambda x, p: format(int(x), ',')))
    
    if len(data_to_plot) > 1:
        for i in range(1,len(data_to_plot)):
            ax2 = ax.twinx()  # instantiate a second axes that shares the same x-axis
            ax2.xaxis.set_major_locator(ticker.MultipleLocator(4))
            ax2.set_ylabel(data_to_plot[i][0])
            ax2.plot(x,data_to_plot[i][1],data_to_plot[i][1])      

    plt.tight_layout()
    plt.savefig("Plots/"+filename+".png", 
                facecolor=fig.get_facecolor(), 
                edgecolor='none')
    
    
def counts_by_week_year(relevant_events):
    # group by day
    all_events_by_day = pd.DataFrame(relevant_events.groupby("EventDate").count().collect())
    all_events_by_day.columns = ["Date","Count"]
    all_events_by_day["Date"] = pd.to_datetime(all_events_by_day["Date"].astype(str), format='%Y%m%d', errors="coerce")
    all_events_by_day = all_events_by_day[all_events_by_day["Date"] >= datetime.strptime("2015-04-01","%Y-%m-%d")]
    all_events_by_day = all_events_by_day.sort_values(by="Date")

    # get week number of year
    all_events_by_day["Week"] = ((all_events_by_day["Date"].dt.dayofyear-1)//7+1).apply(lambda x: '{0:0>2}'.format(min(x,52)))
    all_events_by_day["Year"] = all_events_by_day["Date"].dt.year.astype(str)
    all_events_by_day["YearWeek"] = all_events_by_day["Year"] + "-" + all_events_by_day["Week"]
    all_events_by_day = all_events_by_day.groupby(by="YearWeek")["Count"].sum().reset_index()

    return all_events_by_day


def avg_tone_by_week_year(relevant_events):
    all_events_by_day = pd.DataFrame(relevant_events.groupby("EventDate").agg(func.mean("MentionDocTone").alias("MentionDocTone")).collect())
    all_events_by_day.columns = ["Date","MentionDocTone"]
    all_events_by_day["Date"] = pd.to_datetime(all_events_by_day["Date"].astype(str), format='%Y%m%d', errors="coerce")
    all_events_by_day = all_events_by_day[all_events_by_day["Date"] >= datetime.strptime("2015-04-01","%Y-%m-%d")]
    all_events_by_day = all_events_by_day.sort_values(by="Date")

    # get week number of year
    all_events_by_day["Week"] = ((all_events_by_day["Date"].dt.dayofyear-1)//7+1).apply(lambda x: '{0:0>2}'.format(min(x,52)))
    all_events_by_day["Year"] = all_events_by_day["Date"].dt.year.astype(str)
    all_events_by_day["YearWeek"] = all_events_by_day["Year"] + "-" + all_events_by_day["Week"]
    all_events_by_day = all_events_by_day.groupby(by="YearWeek")["MentionDocTone"].mean().reset_index()

    return all_events_by_day


### Plot Count of All Events by Day

In [39]:
start = datetime.now()

# group by week_year
data = counts_by_week_year(df_mentions)

# group by week and plot
x=data["YearWeek"]
y_dict = {}
y_dict["All Event Count"] = data["Count"]
plot_line_over_time(x,y_dict,"Count","All_Events_Counts")

print(datetime.now()-start)

KeyboardInterrupt: 

### Plot trump mentions

In [None]:
# get relevant events
relevant_events = df_mentions.where(df_mentions.MentionIdentifier.rlike('trump'))

# group by week_year
counts = counts_by_week_year(relevant_events)
tone = avg_tone_by_week_year(relevant_events)

# plot
x=counts["YearWeek"]
y_dict = {}
y_dict["Count"] = counts["Count"]
y_dict["Tone"]=tone["MentionDocTone"]

plot_line_over_time_two_y_axis(x,y_dict,"Trump_Events_Counts")

### Plot tariff mentions

In [None]:
# get relevant events
relevant_events = df_mentions.where(df_mentions.MentionIdentifier.rlike('tariff')
                                  | df_mentions.MentionIdentifier.rlike('trade*war'))

# group by week_year
data = counts_by_week_year(relevant_events)

# group by week and plot
x=data["YearWeek"]
y_dict = {}
y_dict["Count"] = data["Count"]
plot_line_over_time(x,y_dict,"Count","Tariff_Events_Counts")