# Init

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder \
     .appName("Test") \
     .getOrCreate()

In [2]:
import pandas as pd
pd.options.display.max_rows=250
import numpy as np
from datetime import datetime

import matplotlib.pyplot as plt
plt.style.use('seaborn')
import matplotlib.ticker as ticker
from matplotlib.ticker import FormatStrFormatter, FuncFormatter

import pyspark.sql.functions as func

# Dataset Imports

In [3]:
start = datetime.now()

df_mentions = spark.read.parquet("s3://labadie-gdelt-tradewar/filtered_mentions.parquet")
df_mentions.cache()

print(df_mentions.count())
print(df_mentions.printSchema())
print(datetime.now()-start)

221817927
root
 |-- EventDate: string (nullable = true)
 |-- MentionSource: string (nullable = true)
 |-- MentionIdentifier: string (nullable = true)
 |-- MentionDocTone: float (nullable = true)
 |-- Month: string (nullable = true)
 |-- Year: string (nullable = true)

None
0:02:01.132139


# Model

### Create Features

In [152]:
df_all=df_mentions.groupby("MentionSource").agg(func.mean("MentionDocTone").alias("tone(all)"),
                                                  func.count("MentionDocTone").alias("count(all)"))

df_trump=df_mentions.where(df_mentions.MentionIdentifier.rlike('trump')). \
    groupby("MentionSource").agg(func.mean("MentionDocTone").alias("tone(trump)"),
                                 func.count("MentionDocTone").alias("count(trump)"))

df_trump_candidate=df_mentions.where((df_mentions.MentionIdentifier.rlike('trump')) & 
                           (df_mentions.Month<201603)). \
    groupby("MentionSource").agg(func.mean("MentionDocTone").alias("tone(trump_cand)"),
                                 func.count("MentionDocTone").alias("count(trump_cand)"))

df_trump_president=df_mentions.where((df_mentions.MentionIdentifier.rlike('trump')) & 
                           (df_mentions.Year>=2017)). \
    groupby("MentionSource").agg(func.mean("MentionDocTone").alias("tone(trump_pres)"),
                                 func.count("MentionDocTone").alias("count(trump_pres)"))

df_president_trump=df_mentions.where((df_mentions.MentionIdentifier.rlike('president')) & 
                           (df_mentions.Year>=2017)). \
    groupby("MentionSource").agg(func.mean("MentionDocTone").alias("tone(president_trump)"),
                                 func.count("MentionDocTone").alias("count(president_trump)"))

df_president_trump=df_mentions.where((df_mentions.MentionIdentifier.rlike('president')) & 
                           (df_mentions.Year<2017)). \
    groupby("MentionSource").agg(func.mean("MentionDocTone").alias("tone(president_obama)"),
                                 func.count("MentionDocTone").alias("count(president_obama)"))

df_obama=df_mentions.where(df_mentions.MentionIdentifier.rlike('obama')). \
    groupby("MentionSource").agg(func.mean("MentionDocTone").alias("tone(obama)"),
                                 func.count("MentionDocTone").alias("count(obama)"))

df_clinton=df_mentions.where(df_mentions.MentionIdentifier.rlike('clinton')). \
    groupby("MentionSource").agg(func.mean("MentionDocTone").alias("tone(clinton)"),
                                 func.count("MentionDocTone").alias("count(clinton)"))

df_charlottesville=df_mentions.where(df_mentions.MentionIdentifier.rlike('charlottesville')). \
    groupby("MentionSource").agg(func.mean("MentionDocTone").alias("tone(charlottesville)"),
                                 func.count("MentionDocTone").alias("count(charlottesville)"))

# df_climate=df_mentions.where(df_mentions.MentionIdentifier.rlike('climate*change')). \
#     groupby("MentionSource").agg(func.mean("MentionDocTone").alias("tone(climatechange)"),
#                                  func.count("MentionDocTone").alias("count(climatechange)"))

df_anthem=df_mentions.where(df_mentions.MentionIdentifier.rlike('anthem')). \
    groupby("MentionSource").agg(func.mean("MentionDocTone").alias("tone(anthem)"),
                                 func.count("MentionDocTone").alias("count(anthem)"))

df_inauguration=df_mentions.where(df_mentions.MentionIdentifier.rlike('inauguration')). \
    groupby("MentionSource").agg(func.mean("MentionDocTone").alias("tone(inauguration)"),
                                 func.count("MentionDocTone").alias("count(inauguration)"))


In [153]:
start = datetime.now()

df=df_all.join(df_trump,["MentionSource"],how="left") \
        .join(df_trump_president,["MentionSource"],how="left") \
        .join(df_trump_candidate,["MentionSource"],how="left") \
        .join(df_obama,["MentionSource"],how="left") \
        .join(df_clinton,["MentionSource"],how="left") \
        .join(df_charlottesville,["MentionSource"],how="left") \
        .join(df_anthem,["MentionSource"],how="left") \
        .join(df_inauguration,["MentionSource"],how="left") \

df.cache()
df.count()

cols = df.columns

print(datetime.now()-start)

0:02:09.529429


In [154]:
pd_df = pd.DataFrame(df.collect(),columns=cols)
pd_df.describe()

Unnamed: 0,tone(all),count(all),tone(trump),count(trump),tone(trump_pres),count(trump_pres),tone(trump_cand),count(trump_cand),tone(obama),count(obama),tone(clinton),count(clinton),tone(charlottesville),count(charlottesville),tone(anthem),count(anthem),tone(inauguration),count(inauguration)
count,5726.0,5726.0,5708.0,5708.0,5468.0,5468.0,3493.0,3493.0,4610.0,4610.0,4185.0,4185.0,2436.0,2436.0,2273.0,2273.0,3248.0,3248.0
mean,-1.531523,38738.72,-1.669334,2698.317624,-1.848487,2136.667886,-1.665517,152.506155,-1.261897,549.422993,-1.189513,453.291756,-5.734144,46.497947,-0.416957,11.421029,-0.588445,34.036946
std,1.870685,115749.9,1.566962,10504.901365,1.58152,8147.024748,1.78758,371.34019,1.950775,1950.526818,1.709965,1518.275127,2.336353,114.723274,2.565101,36.916267,2.433119,82.784542
min,-7.512018,228.0,-10.30303,1.0,-10.30303,1.0,-12.5,1.0,-10.38961,1.0,-14.492754,1.0,-19.899244,1.0,-13.793103,1.0,-13.486842,1.0
25%,-2.979592,3598.0,-2.472717,19.0,-2.617223,16.0,-2.472246,4.0,-2.182277,11.0,-1.979332,8.0,-6.744446,5.0,-1.871607,2.0,-1.793383,4.0
50%,-1.91453,9308.0,-2.031993,167.0,-2.203091,142.5,-1.843948,22.0,-1.654639,62.0,-1.376916,47.0,-5.796536,18.0,-0.480897,5.0,-0.69463,11.0
75%,-0.174253,37422.5,-1.093136,1888.5,-1.310151,1566.25,-1.015228,155.0,-0.542749,423.0,-0.572572,346.0,-4.79663,55.0,1.144136,11.0,0.86659,35.0
max,7.995519,4987742.0,9.598158,567975.0,10.242588,426293.0,9.598158,8016.0,11.134454,92106.0,9.454545,65875.0,12.121212,3721.0,14.051095,1373.0,7.983193,2412.0


#### Imputation

In [155]:
pd_df.dropna().shape

(1429, 19)

### K-means Model
I imagine, now that the data is a manageable size, that this analysis could just be done locally.  But I'll be using Spark's clustering capabilities.

In [156]:
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline
import pyspark.ml.feature as feat
from pyspark.ml.evaluation import ClusteringEvaluator
import re

# handle NA records
#dataset=df.dropna()
dataset=df.na.fill(0)

# build features vector
cols_to_use=[a for a in df.columns[1:] if not re.search('count', a)]#df.columns[1:]
vectorAssembler = feat.VectorAssembler(
    inputCols=cols_to_use
    , outputCol='features')

# create modeling dataset
dataset = (
    vectorAssembler
    .transform(dataset)
    .select('MentionSource','features')
)

# Trains a k-means model.
kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(dataset)

# Make predictions
predictions = model.transform(dataset)
transformed = model.transform(dataset).select("MentionSource","features", "prediction")

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Silhouette with squared euclidean distance = 0.5052621007883034
Cluster Centers: 
[-2.74278337 -2.27989812 -2.39446059 -1.78579058 -1.74918811 -1.47947109
 -6.13994062 -0.46115482 -1.05301742]
[-0.77967035 -1.28184035 -1.37460393 -0.53818614 -0.56081909 -0.49069731
 -0.14250951  0.01799318  0.1126511 ]


# Analysis of Results

In [157]:
pd_transformed = pd.DataFrame(transformed.select("MentionSource","prediction").collect(),
                              columns=["MentionSource","Prediction"])
predictions=pd_df.merge(pd_transformed)
predictions["Prediction"].value_counts()

1    3533
0    2193
Name: Prediction, dtype: int64

In [158]:
notables=["nytimes.com","washingtonpost.com","cnn.com","reuters.com","ap.org",
          "nbc.com","abcnews.go.com","cbsnews.com",
          "foxnews.com","rt.com","thehill.com","breitbart.com"]
#predictions.loc[predictions["MentionSource"].isin(notables),]

cols = list(predictions.columns)
cols=[cols[0]]+[cols[-1]]+cols[1:-2]
predictions[cols].loc[predictions["MentionSource"].isin(notables),].sort_values(by=["Prediction","count(all)"],ascending=[True,False]).head(25)

Unnamed: 0,MentionSource,Prediction,tone(all),count(all),tone(trump),count(trump),tone(trump_pres),count(trump_pres),tone(trump_cand),count(trump_cand),tone(obama),count(obama),tone(clinton),count(clinton),tone(charlottesville),count(charlottesville),tone(anthem),count(anthem),tone(inauguration)
437,reuters.com,0,-3.159161,1936940,-2.429593,137727.0,-2.454549,130046.0,-2.490172,866.0,-1.847145,7360.0,-2.01581,3228.0,-6.503256,316.0,-0.12867,314.0,-2.902978
1131,washingtonpost.com,0,-3.467356,954479,-2.129291,127859.0,-2.223869,94990.0,-1.908839,5712.0,-1.715168,18959.0,-1.262819,17349.0,-5.350614,1208.0,0.380519,113.0,-2.19101
2873,foxnews.com,0,-4.217948,554751,-2.520335,66960.0,-2.664183,50650.0,-2.246053,3435.0,-2.52891,14499.0,-2.041781,12057.0,-7.492365,444.0,-0.551926,75.0,-2.374137
73,nytimes.com,0,-2.929305,430082,-2.191506,37839.0,-2.275715,27843.0,-2.012741,1971.0,-1.688151,6640.0,-1.350447,5505.0,-4.813535,539.0,-2.148739,52.0,-1.485662
2479,cnn.com,0,-2.813317,337038,-2.176669,58572.0,-2.208498,39486.0,-1.900309,4236.0,-2.016944,8934.0,-1.590103,9922.0,-5.759528,505.0,-1.490084,55.0,-0.517147
5275,breitbart.com,0,-3.304998,224995,-1.972794,38624.0,-2.098457,25794.0,-1.731061,2891.0,-2.409949,10720.0,-1.948411,8168.0,-6.165062,156.0,-1.382305,43.0,-1.767525
3458,cbsnews.com,0,-3.717375,183077,-1.998035,24699.0,-2.118603,17870.0,-1.435998,1464.0,-1.572651,4326.0,-1.14531,3907.0,-6.470958,287.0,-0.850647,53.0,-1.132332
5068,thehill.com,0,-2.461779,166200,-2.519802,42281.0,-2.700886,29080.0,-2.242646,2271.0,-2.02855,11239.0,-1.82601,6973.0,-7.280373,93.0,-2.085384,15.0,-0.966042
4664,rt.com,0,-4.011625,112107,-2.894144,5667.0,-3.082667,4145.0,-2.353735,182.0,-2.729952,1617.0,-2.268549,1010.0,-6.056902,30.0,-2.465905,13.0,-3.629276
1604,ap.org,1,-3.955837,1836638,-2.027509,22955.0,-2.23434,8295.0,-1.829498,1454.0,-1.769237,5683.0,-1.290904,6748.0,,,-1.07582,37.0,-2.827888


In [None]:
# convert counts to percentage of total