# Init

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
# Build the SparkSession used by every later cell, or reuse one that is
# already running (getOrCreate). The application is registered as "Test".
spark = (
    SparkSession.builder
    .appName("Test")
    .getOrCreate()
)

In [2]:
# Standard library
from datetime import datetime

# Third-party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.ticker import FormatStrFormatter, FuncFormatter

import pyspark.sql.functions as func

# Display configuration
pd.options.display.max_rows = 250

# The bundled 'seaborn' style was renamed to 'seaborn-v0_8' in matplotlib 3.6
# and the old name was later removed. Prefer the new name when the installed
# matplotlib provides it, and fall back to the legacy name on older versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Dataset Imports

In [3]:
# Load the pre-filtered GDELT mentions and time how long the load takes.
# cache() only marks the DataFrame for caching; the count() below is the first
# action, so it is what actually reads the parquet data and fills the cache.
start = datetime.now()

df_mentions = spark.read.parquet("s3://labadie-gdelt-tradewar/filtered_mentions.parquet")
df_mentions.cache()

print(df_mentions.count())
# printSchema() writes the schema to stdout itself and returns None, so it must
# not be wrapped in print() (that would emit a stray "None" line).
df_mentions.printSchema()
print(datetime.now()-start)

221817927
root
 |-- EventDate: string (nullable = true)
 |-- MentionSource: string (nullable = true)
 |-- MentionIdentifier: string (nullable = true)
 |-- MentionDocTone: float (nullable = true)
 |-- Month: string (nullable = true)
 |-- Year: string (nullable = true)

None
0:02:01.110543


# Model

### Create Features

In [39]:
def topic_features(mentions, label, pattern=None):
    """Aggregate tone and mention count per MentionSource for one topic.

    Parameters
    ----------
    mentions : pyspark.sql.DataFrame
        Mentions table with MentionSource, MentionIdentifier and
        MentionDocTone columns.
    label : str
        Suffix used in the output column names: ``tone(<label>)`` and
        ``count(<label>)``.
    pattern : str or None
        Regular expression matched against MentionIdentifier with ``rlike``
        (case-sensitive, matches anywhere in the string). ``None`` keeps
        every mention.

    Returns
    -------
    pyspark.sql.DataFrame
        One row per MentionSource with the mean MentionDocTone and the number
        of non-null MentionDocTone values among the selected mentions.
    """
    selected = mentions
    if pattern is not None:
        selected = mentions.where(mentions.MentionIdentifier.rlike(pattern))
    return selected.groupby("MentionSource").agg(
        func.mean("MentionDocTone").alias("tone({})".format(label)),
        func.count("MentionDocTone").alias("count({})".format(label)),
    )


df_all = topic_features(df_mentions, "all")

df_trump = topic_features(df_mentions, "trump", "trump")
df_obama = topic_features(df_mentions, "obama", "obama")
df_clinton = topic_features(df_mentions, "clinton", "clinton")
df_charlottesville = topic_features(df_mentions, "charlottesville", "charlottesville")

# The previous pattern 'climate*change' used '*' as if it were a glob wildcard.
# As a regex it means "climat" + zero or more "e" + "change", so identifiers
# containing "climate-change" or "climate_change" were never matched.
# Allow any run of non-letter separators between the two words instead.
df_climate = topic_features(df_mentions, "climatechange", "climate[^a-zA-Z]*change")

df_anthem = topic_features(df_mentions, "anthem", "anthem")
df_inauguration = topic_features(df_mentions, "inauguration", "inauguration")


In [49]:
# Assemble the per-source feature table: start from the all-mentions aggregate
# and left-join each per-topic aggregate on MentionSource, so every source in
# df_all is kept and topics a source never mentioned come out as nulls.
# (Written as a loop rather than a backslash-continued chain: the original
# ended with a dangling "\" that only parsed because a blank line followed it.)
topic_frames = [
    df_trump,
    df_obama,
    df_clinton,
    df_charlottesville,
    df_climate,
    df_anthem,
    df_inauguration,
]

df = df_all
for topic_df in topic_frames:
    df = df.join(topic_df, ["MentionSource"], how="left")

# cache() is lazy; count() is the action that materializes the joined table
# and, as the cell's last expression, displays the number of sources.
df.cache()
df.count()

5726

In [50]:
# Preview the joined feature table with show()'s default settings spelled out:
# the first 20 rows, with long cell values truncated.
df.show(n=20, truncate=True)

+--------------------+-------------------+----------+-------------------+------------+-------------------+------------+--------------------+--------------+---------------------+----------------------+-------------------+--------------------+------------+-------------+--------------------+-------------------+
|       MentionSource|          tone(all)|count(all)|        tone(trump)|count(trump)|        tone(obama)|count(obama)|       tone(clinton)|count(clinton)|tone(charlottesville)|count(charlottesville)|tone(climatechange)|count(climatechange)|tone(anthem)|count(anthem)|  tone(inauguration)|count(inauguration)|
+--------------------+-------------------+----------+-------------------+------------+-------------------+------------+--------------------+--------------+---------------------+----------------------+-------------------+--------------------+------------+-------------+--------------------+-------------------+
|        1380kcim.com| -1.192166224234342|      4379|-0.75350749071525

### K-means Model