In [27]:
import pyspark.sql.functions as F

from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [28]:
# Start the Spark application for this notebook; getOrCreate() hands back an
# already-running session instead of building a second one.
spark = (
    SparkSession.builder
    .appName('Exercise 1')
    .getOrCreate()
)

In [35]:
# Load News_Final.csv into a DataFrame with an explicit schema rather than
# inferred types. Every column is declared nullable.
schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("IDLink", IntegerType()),
            ("Title", StringType()),
            ("Headline", StringType()),
            ("Source", StringType()),
            ("Topic", StringType()),
            ("PublishDate", TimestampType()),
            ("SentimentTitle", FloatType()),
            ("SentimentHeadline", FloatType()),
            ("Facebook", FloatType()),
            ("GooglePlus", FloatType()),
            ("LinkedIn", FloatType()),
        )
    ]
)

# The file has a header row, is comma-delimited, and uses the double quote
# both as the quote character and as the escape character.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("delimiter", ",")
    .option('quote', '"')
    .option('escape', '"')
    .schema(schema)
    .load("data/News_Final.csv")
)

In [34]:
# Create a new column 'SentimentTotal': the average of SentimentTitle and
# SentimentHeadline for each article.
# `col` is reached through the `F` alias because the bare name is never
# imported in this notebook. The result is bound to a new name so that `df`
# keeps the loaded articles and this cell can be re-run on its own.
news_with_sentiment = df.withColumn(
    'SentimentTotal',
    (F.col('SentimentTitle') + F.col('SentimentHeadline')) / 2,
)

# Group by 'Topic', then aggregate the count, sum and mean of 'SentimentTotal'.
# The count is included so the result has the same three columns as the
# table displayed below this cell.
sentiment_by_topic = news_with_sentiment.groupBy('Topic').agg(
    F.count('SentimentTotal'),
    F.sum('SentimentTotal'),
    F.mean('SentimentTotal'),
)

# Show time, print the table
sentiment_by_topic.show()

+---------+---------------------+-------------------+--------------------+
|    Topic|count(SentimentTotal)|sum(SentimentTotal)| avg(SentimentTotal)|
+---------+---------------------+-------------------+--------------------+
|microsoft|                21858|-135.35808374047338|-0.00619261065698...|
|  economy|                33928| -845.5180280988279|-0.02492095107577...|
|    obama|                28610| -267.6123161424184|-0.00935380343035...|
|palestine|                 8843|-285.49876179474813|-0.03228528347786...|
+---------+---------------------+-------------------+--------------------+

