# Init

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
# Obtain the notebook's Spark session (application name "Test") via getOrCreate().
spark = (
    SparkSession.builder
    .appName("Test")
    .getOrCreate()
)

In [84]:
import pandas as pd
pd.options.display.max_rows=250
import numpy as np
from datetime import datetime

import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and newer
# releases no longer accept the old names, so use whichever spelling is installed.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
import matplotlib.ticker as ticker
from matplotlib.ticker import FormatStrFormatter, FuncFormatter

import pyspark.sql.functions as func
from pyspark.sql.functions import expr, regexp_replace

# Dataset Imports

In [3]:
# Time the load so the cost of this cell is visible in its output.
start = datetime.now()

df_mentions = spark.read.parquet("s3://labadie-gdelt-tradewar/mentions.parquet")

# cache() only marks the DataFrame for caching; the count() below is the action that fills it.
df_mentions.cache()

print(df_mentions.count())

# printSchema() prints the schema itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
df_mentions.printSchema()

print(datetime.now()-start)

329927031
root
 |-- EventDate: string (nullable = true)
 |-- MentionSource: string (nullable = true)
 |-- MentionIdentifier: string (nullable = true)
 |-- MentionDocTone: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- Year: string (nullable = true)

None
0:03:45.480535


In [4]:
# Preview the data: 20 rows with long values truncated (show()'s defaults, spelled out).
df_mentions.show(n=20, truncate=True)

+---------+-----------------+--------------------+-----------------+------+----+
|EventDate|    MentionSource|   MentionIdentifier|   MentionDocTone| Month|Year|
+---------+-----------------+--------------------+-----------------+------+----+
| 20150714|      asiaone.com|newslite./content...|-3.43406593406593|201507|2015|
| 20150714|   wmicentral.com|/community_beat/m...| 1.62962962962963|201507|2015|
| 20150714|heraldcourier.com|/news/obama-warns...|-2.91666666666667|201507|2015|
| 20150714|     wmbfnews.com|/story/29541037/o...|-0.87260034904014|201507|2015|
| 20150714| koreatimes.co.kr|/www/news/biz/201...|-1.72786177105832|201507|2015|
| 20150714|         wqow.com|/story/29540451/a...|-2.57123002084781|201507|2015|
| 20150714|        lep.co.uk|/news/regional/br...|-3.61445783132531|201507|2015|
| 20150714|       kfoxtv.com|/news/features/to...|-2.53991291727141|201507|2015|
| 20150714|    12newsnow.com|/story/29541921/f...|-2.86532951289398|201507|2015|
| 20150714|     dnaindia.com

# EDA Across Fields
We could also show how many countries are represented in the dataset; it would be an interesting way to illustrate the breadth of the data.

#### Num Sources Total

In [71]:
# Count the distinct MentionSource values in the full mentions table.
unique_publishers = df_mentions.select("MentionSource").distinct()
print("Num Unique Publishers", unique_publishers.count())

Num Unique Publishers 110239


In [None]:
# The original cell printed only the three labels. Compute the statistics they announce.
# MentionDocTone is stored as a string, so cast it first; otherwise min/max
# would compare the values as text instead of as numbers.
tone_summary = (
    df_mentions
    .select(func.col("MentionDocTone").cast("double").alias("tone"))
    .agg(
        func.avg("tone").alias("avg_tone"),
        func.min("tone").alias("min_tone"),
        func.max("tone").alias("max_tone"),
    )
    .first()
)

print("Avg Tone", tone_summary["avg_tone"])
print("Min Tone", tone_summary["min_tone"])
print("Max Tone", tone_summary["max_tone"])

#### Publishers That Published at Least Once a Month and Mentioned Trump

In [72]:
# Count the distinct months in which each source appears, then keep the sources
# whose month count equals the largest count observed for any source.
mentions_by_month=df_mentions.groupby(["MentionSource","Month"]).count()
months_per_source = mentions_by_month.groupby("MentionSource").count()
max_months = months_per_source.agg(func.max("count").alias("max_count")).take(1)[0].max_count
sources_all_months = (
    months_per_source
    .where(func.col("count")==max_months)
    .selectExpr("MentionSource as x")
)

# Sources with at least one MentionIdentifier matching 'tariff'.
# NOTE(review): the variable name and the section heading say "Trump", but the
# pattern actually applied is 'tariff' -- confirm which keyword is intended.
mentioned_trump = (
    df_mentions
    .where(df_mentions.MentionIdentifier.rlike('tariff'))
    .selectExpr("MentionSource as y")
    .distinct()
)

# Keep only mentions from sources present in both lists; x and y exist purely
# as join keys, so they are dropped again afterwards.
mentions_all_months = df_mentions.join(sources_all_months, df_mentions.MentionSource==sources_all_months.x)
filtered_mentions = mentions_all_months.join(mentioned_trump, df_mentions.MentionSource==mentioned_trump.y).drop("x","y")

filtered_mentions.cache()

DataFrame[EventDate: string, MentionSource: string, MentionIdentifier: string, MentionDocTone: string, Month: string, Year: string]

In [75]:
# Size of the every-month publisher list, then of the filtered mentions table.
publisher_total = sources_all_months.count()
print("Num Publishers At Least Once a Month", publisher_total)
article_total = filtered_mentions.count()
print("    Num Articles", article_total)

Num Publishers At Least Once a Month 3796
    Num Articles 226357831


# Basic Plots

### Plotting functions

In [None]:
def plot_line_over_time(x,y_dict,y_axis_label,filename):
    """Plot one line per series on a shared y-axis and save it as Plots/<filename>.png.

    Parameters
    ----------
    x : sequence
        X values shared by every series (the cells in this notebook pass
        "YYYY-WW" labels).
    y_dict : dict
        Maps a legend label to the y values of that line. Every entry is
        plotted; previously only the first one was.
    y_axis_label : str
        Text shown on the y-axis.
    filename : str
        Base name (without extension) of the PNG written under "Plots/".

    Raises
    ------
    ValueError
        If ``y_dict`` is empty.
    """
    import os  # local import: only needed to create the output directory

    # Fail before a figure is created, rather than with an IndexError afterwards.
    if not y_dict:
        raise ValueError("y_dict must contain at least one series to plot")

    fig, ax = plt.subplots(figsize=(10,5))

    for series_label, series_values in y_dict.items():
        ax.plot(x, series_values, label=series_label)

    ax.set_xlabel("Week of Year")
    ax.tick_params(axis="x", rotation=30)
    # One major tick every 4 x positions keeps the weekly labels readable.
    ax.xaxis.set_major_locator(ticker.MultipleLocator(4))
    ax.set_ylabel(y_axis_label)
    # Show y ticks as integers with thousands separators.
    ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _pos: format(int(value), ',')))

    ax.legend()
    fig.tight_layout()

    out_path = "Plots/"+filename+".png"
    # savefig does not create missing directories.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    fig.savefig(out_path,
                facecolor=fig.get_facecolor(),
                edgecolor='none')
    
    
def plot_line_over_time_two_y_axis(x,y_dict,filename):
    """Plot each series against its own y-axis and save it as Plots/<filename>.png.

    The first entry of ``y_dict`` uses the left axis. Every further entry gets
    a twin axis on the right that shares the x-axis.

    Parameters
    ----------
    x : sequence
        X values shared by every series (the cells in this notebook pass
        "YYYY-WW" labels).
    y_dict : dict
        Maps a series name to its y values. The name is used both as the
        axis label and as the legend entry.
    filename : str
        Base name (without extension) of the PNG written under "Plots/".

    Raises
    ------
    ValueError
        If ``y_dict`` is empty.
    """
    import os  # local import: only needed to create the output directory

    if not y_dict:
        raise ValueError("y_dict must contain at least one series to plot")

    fig, ax = plt.subplots(figsize=(10,5))

    data_to_plot = list(y_dict.items())

    # first set of data on the primary (left) axis
    first_label, first_values = data_to_plot[0]
    lines = ax.plot(x, first_values, label=first_label, color="C0")
    ax.set_xlabel("Week of Year")
    ax.tick_params(axis="x", rotation=30)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(4))
    ax.set_ylabel(first_label)
    # Show primary-axis ticks as integers with thousands separators.
    ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _pos: format(int(value), ',')))

    for i in range(1, len(data_to_plot)):
        series_label, series_values = data_to_plot[i]
        ax2 = ax.twinx()  # new axes sharing the same x-axis
        ax2.xaxis.set_major_locator(ticker.MultipleLocator(4))
        ax2.set_ylabel(series_label)
        # Move the third and later axes outward so their ticks do not overlap.
        ax2.spines["right"].set_position(("outward", 60 * (i - 1)))
        # The series used to be passed a second time as a third positional
        # argument, which drew it again against its index; it is now plotted
        # once, labelled, in a colour of its own (twin axes restart the cycle).
        lines += ax2.plot(x, series_values, label=series_label, color="C%d" % (i % 10))

    # One legend covering the lines of every axis.
    ax.legend(handles=lines)

    fig.tight_layout()

    out_path = "Plots/"+filename+".png"
    # savefig does not create missing directories.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    fig.savefig(out_path,
                facecolor=fig.get_facecolor(),
                edgecolor='none')
    
    
def counts_by_week_year(relevant_events):
    """Return the number of rows per week as a pandas DataFrame.

    Parameters
    ----------
    relevant_events : Spark DataFrame
        Must have an "EventDate" column holding dates written as YYYYMMDD.

    Returns
    -------
    pandas.DataFrame
        Columns "YearWeek" ("YYYY-WW") and "Count", sorted by "YearWeek".
        Dates that cannot be parsed or fall before 2015-04-01 are ignored.
        An empty frame with the same two columns is returned when nothing is left.
    """
    empty_result = pd.DataFrame({"YearWeek": pd.Series(dtype="object"),
                                 "Count": pd.Series(dtype="int64")})

    # group by day
    daily_rows = relevant_events.groupby("EventDate").count().collect()
    if not daily_rows:
        return empty_result

    # Naming the columns at construction avoids assigning labels to a frame of
    # unknown width afterwards.
    daily = pd.DataFrame(daily_rows, columns=["Date","Count"])
    # errors="coerce" turns malformed dates into NaT, which the filter below drops.
    daily["Date"] = pd.to_datetime(daily["Date"].astype(str), format='%Y%m%d', errors="coerce")
    daily = daily[daily["Date"] >= datetime.strptime("2015-04-01","%Y-%m-%d")]
    if daily.empty:
        return empty_result
    daily = daily.sort_values(by="Date")

    # Week number = consecutive 7-day blocks counted from 1 January. The last
    # one or two days of the year would form a week 53, so fold them into week 52.
    week = ((daily["Date"].dt.dayofyear - 1) // 7 + 1).clip(upper=52)
    year_week = daily["Date"].dt.year.astype(str) + "-" + week.astype(str).str.zfill(2)

    return daily.assign(YearWeek=year_week).groupby(by="YearWeek")["Count"].sum().reset_index()


def avg_tone_by_week_year(relevant_events):
    """Return the average MentionDocTone per week as a pandas DataFrame.

    The tone is first averaged per EventDate in Spark. The weekly value is then
    the plain mean of those daily averages, so days are not weighted by how
    many rows they contain.

    Parameters
    ----------
    relevant_events : Spark DataFrame
        Must have "EventDate" (dates written as YYYYMMDD) and "MentionDocTone" columns.

    Returns
    -------
    pandas.DataFrame
        Columns "YearWeek" ("YYYY-WW") and "MentionDocTone", sorted by "YearWeek".
        Dates that cannot be parsed or fall before 2015-04-01 are ignored.
        An empty frame with the same two columns is returned when nothing is left.
    """
    empty_result = pd.DataFrame({"YearWeek": pd.Series(dtype="object"),
                                 "MentionDocTone": pd.Series(dtype="float64")})

    # average tone per day
    daily_rows = relevant_events.groupby("EventDate").agg(func.mean("MentionDocTone").alias("MentionDocTone")).collect()
    if not daily_rows:
        return empty_result

    # Naming the columns at construction avoids assigning labels to a frame of
    # unknown width afterwards.
    daily = pd.DataFrame(daily_rows, columns=["Date","MentionDocTone"])
    # errors="coerce" turns malformed dates into NaT, which the filter below drops.
    daily["Date"] = pd.to_datetime(daily["Date"].astype(str), format='%Y%m%d', errors="coerce")
    daily = daily[daily["Date"] >= datetime.strptime("2015-04-01","%Y-%m-%d")]
    if daily.empty:
        return empty_result
    daily = daily.sort_values(by="Date")

    # Week number = consecutive 7-day blocks counted from 1 January. The last
    # one or two days of the year would form a week 53, so fold them into week 52.
    week = ((daily["Date"].dt.dayofyear - 1) // 7 + 1).clip(upper=52)
    year_week = daily["Date"].dt.year.astype(str) + "-" + week.astype(str).str.zfill(2)

    return daily.assign(YearWeek=year_week).groupby(by="YearWeek")["MentionDocTone"].mean().reset_index()


### Plot Count of All Events by Week

In [None]:
# Time the whole cell: aggregation over every mention plus the plot.
start = datetime.now()

# weekly totals for the complete dataset
data = counts_by_week_year(df_mentions)

# a single series, drawn against the "YYYY-WW" labels
x = data["YearWeek"]
y_dict = {"All Event Count": data["Count"]}
plot_line_over_time(x, y_dict, "Count", "All_Events_Counts")

print(datetime.now()-start)

### Plot Trump mentions

In [None]:
# Keep the mentions whose identifier matches 'trump' (rlike is case-sensitive as written).
relevant_events = df_mentions.where(df_mentions.MentionIdentifier.rlike('trump'))

# weekly volume and weekly average tone for those mentions
counts = counts_by_week_year(relevant_events)
tone = avg_tone_by_week_year(relevant_events)

# Count goes on the left axis and Tone on the right; both are drawn against the count frame's week labels.
x = counts["YearWeek"]
y_dict = {
    "Count": counts["Count"],
    "Tone": tone["MentionDocTone"],
}

plot_line_over_time_two_y_axis(x, y_dict, "Trump_Events_Counts")

### Plot tariff and trade-war mentions

In [None]:
# get relevant events
relevant_events = df_mentions.where(df_mentions.MentionIdentifier.rlike('tariff')
                                  | df_mentions.MentionIdentifier.rlike('trade*war'))

# group by week_year
data = counts_by_week_year(relevant_events)

# group by week and plot
x=data["YearWeek"]
y_dict = {}
y_dict["Count"] = data["Count"]
plot_line_over_time(x,y_dict,"Count","Tariff_Events_Counts")